1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 3 // All Rights reserved. See files LICENSE and NOTICE for details. 4 // 5 // This file is part of CEED, a collection of benchmarks, miniapps, software 6 // libraries and APIs for efficient high-order finite element and spectral 7 // element discretizations for exascale applications. For more information and 8 // source code availability see http://github.com/ceed. 9 // 10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11 // a collaborative effort of two U.S. Department of Energy organizations (Office 12 // of Science and the National Nuclear Security Administration) responsible for 13 // the planning and preparation of a capable exascale ecosystem, including 14 // software, applications, hardware, advanced system engineering and early 15 // testbed platforms, in support of the nation's exascale computing imperative. 16 17 #include <ceed.h> 18 #include <ceed-backend.h> 19 #include "ceed-magma.h" 20 21 #ifdef __cplusplus 22 CEED_INTERN "C" 23 #endif 24 int CeedBasisApply_Magma(CeedBasis basis, CeedInt nelem, 25 CeedTransposeMode tmode, CeedEvalMode emode, 26 CeedVector U, CeedVector V) { 27 int ierr; 28 Ceed ceed; 29 ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 30 CeedInt dim, ncomp, ndof; 31 ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); 32 ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChkBackend(ierr); 33 ierr = CeedBasisGetNumNodes(basis, &ndof); CeedChkBackend(ierr); 34 35 Ceed_Magma *data; 36 ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr); 37 38 const CeedScalar *u; 39 CeedScalar *v; 40 if (emode != CEED_EVAL_WEIGHT) { 41 ierr = CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &u); CeedChkBackend(ierr); 42 } else if (emode != CEED_EVAL_WEIGHT) { 43 // LCOV_EXCL_START 44 return CeedError(ceed, CEED_ERROR_BACKEND, 45 "An input vector is required for this CeedEvalMode"); 46 // LCOV_EXCL_STOP 47 } 48 ierr = CeedVectorGetArray(V, CEED_MEM_DEVICE, &v); CeedChkBackend(ierr); 49 50 CeedBasis_Magma *impl; 51 ierr = CeedBasisGetData(basis, &impl); CeedChkBackend(ierr); 52 53 CeedInt P1d, Q1d; 54 ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr); 55 ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr); 56 57 CeedDebug("\033[01m[CeedBasisApply_Magma] vsize=%d, comp = %d", 58 ncomp*CeedIntPow(P1d, dim), ncomp); 59 60 if (tmode == CEED_TRANSPOSE) { 61 CeedInt length; 62 ierr = CeedVectorGetLength(V, &length); CeedChkBackend(ierr); 63 magmablas_dlaset(MagmaFull, length, 1, 0., 0., v, length, data->queue); 64 ceed_magma_queue_sync( data->queue ); 65 } 66 switch (emode) { 67 case CEED_EVAL_INTERP: { 68 CeedInt P = P1d, Q = Q1d; 69 if (tmode == CEED_TRANSPOSE) { 70 P = Q1d; Q = P1d; 71 } 72 73 // Define element sizes for dofs/quad 74 CeedInt elquadsize = CeedIntPow(Q1d, dim); 75 CeedInt eldofssize = CeedIntPow(P1d, dim); 76 77 // E-vector ordering -------------- Q-vector ordering 78 // component component 79 // elem elem 80 // node node 81 82 // --- Define strides for NOTRANSPOSE mode: --- 83 // Input (u) is E-vector, output (v) is Q-vector 84 85 // Element strides 86 CeedInt u_elstride = eldofssize; 87 CeedInt v_elstride = elquadsize; 88 // Component strides 89 CeedInt u_compstride = nelem * eldofssize; 90 CeedInt v_compstride = nelem * elquadsize; 91 92 // --- Swap strides for TRANSPOSE mode: --- 93 if (tmode == CEED_TRANSPOSE) { 94 // Input (u) is Q-vector, output (v) is E-vector 95 // Element strides 96 v_elstride = eldofssize; 97 u_elstride = elquadsize; 98 // Component strides 99 v_compstride = nelem * eldofssize; 100 u_compstride = nelem * elquadsize; 101 } 102 103 ierr = magma_interp(P, Q, dim, ncomp, 104 impl->dinterp1d, tmode, 105 u, u_elstride, u_compstride, 106 v, v_elstride, v_compstride, 107 nelem, data->basis_kernel_mode, data->maxthreads, 108 data->queue); 109 if (ierr != 0) CeedError(ceed, CEED_ERROR_BACKEND, 110 "MAGMA: launch failure detected for magma_interp"); 111 } 112 break; 113 case CEED_EVAL_GRAD: { 114 CeedInt P = P1d, Q = Q1d; 115 // In CEED_NOTRANSPOSE mode: 116 // u is (P^dim x nc), column-major layout (nc = ncomp) 117 // v is (Q^dim x nc x dim), column-major layout (nc = ncomp) 118 // In CEED_TRANSPOSE mode, the sizes of u and v are switched. 119 if (tmode == CEED_TRANSPOSE) { 120 P = Q1d, Q = P1d; 121 } 122 123 // Define element sizes for dofs/quad 124 CeedInt elquadsize = CeedIntPow(Q1d, dim); 125 CeedInt eldofssize = CeedIntPow(P1d, dim); 126 127 // E-vector ordering -------------- Q-vector ordering 128 // dim 129 // component component 130 // elem elem 131 // node node 132 133 134 // --- Define strides for NOTRANSPOSE mode: --- 135 // Input (u) is E-vector, output (v) is Q-vector 136 137 // Element strides 138 CeedInt u_elstride = eldofssize; 139 CeedInt v_elstride = elquadsize; 140 // Component strides 141 CeedInt u_compstride = nelem * eldofssize; 142 CeedInt v_compstride = nelem * elquadsize; 143 // Dimension strides 144 CeedInt u_dimstride = 0; 145 CeedInt v_dimstride = nelem * elquadsize * ncomp; 146 147 // --- Swap strides for TRANSPOSE mode: --- 148 if (tmode == CEED_TRANSPOSE) { 149 // Input (u) is Q-vector, output (v) is E-vector 150 // Element strides 151 v_elstride = eldofssize; 152 u_elstride = elquadsize; 153 // Component strides 154 v_compstride = nelem * eldofssize; 155 u_compstride = nelem * elquadsize; 156 // Dimension strides 157 v_dimstride = 0; 158 u_dimstride = nelem * elquadsize * ncomp; 159 160 } 161 162 ierr = magma_grad( P, Q, dim, ncomp, 163 impl->dinterp1d, impl->dgrad1d, tmode, 164 u, u_elstride, u_compstride, u_dimstride, 165 v, v_elstride, v_compstride, v_dimstride, 166 nelem, data->basis_kernel_mode, data->maxthreads, 167 data->queue); 168 if (ierr != 0) CeedError(ceed, CEED_ERROR_BACKEND, 169 "MAGMA: launch failure detected for magma_grad"); 170 } 171 break; 172 case CEED_EVAL_WEIGHT: { 173 if (tmode == CEED_TRANSPOSE) 174 // LCOV_EXCL_START 175 return CeedError(ceed, CEED_ERROR_BACKEND, 176 "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE"); 177 // LCOV_EXCL_STOP 178 CeedInt Q = Q1d; 179 int eldofssize = CeedIntPow(Q, dim); 180 ierr = magma_weight(Q, dim, impl->dqweight1d, v, eldofssize, nelem, 181 data->basis_kernel_mode, data->maxthreads, data->queue); 182 if (ierr != 0) CeedError(ceed, CEED_ERROR_BACKEND, 183 "MAGMA: launch failure detected for magma_weight"); 184 } 185 break; 186 // LCOV_EXCL_START 187 case CEED_EVAL_DIV: 188 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); 189 case CEED_EVAL_CURL: 190 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); 191 case CEED_EVAL_NONE: 192 return CeedError(ceed, CEED_ERROR_BACKEND, 193 "CEED_EVAL_NONE does not make sense in this context"); 194 // LCOV_EXCL_STOP 195 } 196 197 // must sync to ensure completeness 198 ceed_magma_queue_sync( data->queue ); 199 200 if (emode!=CEED_EVAL_WEIGHT) { 201 ierr = CeedVectorRestoreArrayRead(U, &u); CeedChkBackend(ierr); 202 } 203 ierr = CeedVectorRestoreArray(V, &v); CeedChkBackend(ierr); 204 return CEED_ERROR_SUCCESS; 205 } 206 207 #ifdef __cplusplus 208 CEED_INTERN "C" 209 #endif 210 int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt nelem, 211 CeedTransposeMode tmode, CeedEvalMode emode, 212 CeedVector U, CeedVector V) { 213 int ierr; 214 Ceed ceed; 215 ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 216 217 Ceed_Magma *data; 218 ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr); 219 220 CeedInt dim, ncomp, ndof, nqpt; 221 ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); 222 ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChkBackend(ierr); 223 ierr = CeedBasisGetNumNodes(basis, &ndof); CeedChkBackend(ierr); 224 ierr = CeedBasisGetNumQuadraturePoints(basis, &nqpt); CeedChkBackend(ierr); 225 const CeedScalar *du; 226 CeedScalar *dv; 227 if (emode != CEED_EVAL_WEIGHT) { 228 ierr = CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &du); CeedChkBackend(ierr); 229 } else if (emode != CEED_EVAL_WEIGHT) { 230 // LCOV_EXCL_START 231 return CeedError(ceed, CEED_ERROR_BACKEND, 232 "An input vector is required for this CeedEvalMode"); 233 // LCOV_EXCL_STOP 234 } 235 ierr = CeedVectorGetArray(V, CEED_MEM_DEVICE, &dv); CeedChkBackend(ierr); 236 237 CeedBasisNonTensor_Magma *impl; 238 ierr = CeedBasisGetData(basis, &impl); CeedChkBackend(ierr); 239 240 CeedDebug("\033[01m[CeedBasisApplyNonTensor_Magma] vsize=%d, comp = %d", 241 ncomp*ndof, ncomp); 242 243 if (tmode == CEED_TRANSPOSE) { 244 CeedInt length; 245 ierr = CeedVectorGetLength(V, &length); 246 magmablas_dlaset(MagmaFull, length, 1, 0., 0., dv, length, data->queue); 247 ceed_magma_queue_sync( data->queue ); 248 } 249 switch (emode) { 250 case CEED_EVAL_INTERP: { 251 CeedInt P = ndof, Q = nqpt; 252 if (tmode == CEED_TRANSPOSE) 253 magma_dgemm_nontensor(MagmaNoTrans, MagmaNoTrans, 254 P, nelem*ncomp, Q, 255 1.0, impl->dinterp, P, 256 du, Q, 257 0.0, dv, P, data->queue); 258 else 259 magma_dgemm_nontensor(MagmaTrans, MagmaNoTrans, 260 Q, nelem*ncomp, P, 261 1.0, impl->dinterp, P, 262 du, P, 263 0.0, dv, Q, data->queue); 264 } 265 break; 266 267 case CEED_EVAL_GRAD: { 268 CeedInt P = ndof, Q = nqpt; 269 if (tmode == CEED_TRANSPOSE) { 270 double beta = 0.0; 271 for(int d=0; d<dim; d++) { 272 if (d>0) 273 beta = 1.0; 274 magma_dgemm_nontensor(MagmaNoTrans, MagmaNoTrans, 275 P, nelem*ncomp, Q, 276 1.0, impl->dgrad + d*P*Q, P, 277 du + d*nelem*ncomp*Q, Q, 278 beta, dv, P, data->queue); 279 } 280 } else { 281 for(int d=0; d< dim; d++) 282 magma_dgemm_nontensor(MagmaTrans, MagmaNoTrans, 283 Q, nelem*ncomp, P, 284 1.0, impl->dgrad + d*P*Q, P, 285 du, P, 286 0.0, dv + d*nelem*ncomp*Q, Q, data->queue); 287 } 288 } 289 break; 290 291 case CEED_EVAL_WEIGHT: { 292 if (tmode == CEED_TRANSPOSE) 293 // LCOV_EXCL_START 294 return CeedError(ceed, CEED_ERROR_BACKEND, 295 "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE"); 296 // LCOV_EXCL_STOP 297 298 int elemsPerBlock = 1;//basis->Q1d < 7 ? optElems[basis->Q1d] : 1; 299 int grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem)? 300 1 : 0 ); 301 magma_weight_nontensor(grid, nqpt, nelem, nqpt, impl->dqweight, dv, 302 data->queue); 303 CeedChkBackend(ierr); 304 } 305 break; 306 307 // LCOV_EXCL_START 308 case CEED_EVAL_DIV: 309 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); 310 case CEED_EVAL_CURL: 311 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); 312 case CEED_EVAL_NONE: 313 return CeedError(ceed, CEED_ERROR_BACKEND, 314 "CEED_EVAL_NONE does not make sense in this context"); 315 // LCOV_EXCL_STOP 316 } 317 318 // must sync to ensure completeness 319 ceed_magma_queue_sync( data->queue ); 320 321 if (emode!=CEED_EVAL_WEIGHT) { 322 ierr = CeedVectorRestoreArrayRead(U, &du); CeedChkBackend(ierr); 323 } 324 ierr = CeedVectorRestoreArray(V, &dv); CeedChkBackend(ierr); 325 return CEED_ERROR_SUCCESS; 326 } 327 328 #ifdef __cplusplus 329 CEED_INTERN "C" 330 #endif 331 int CeedBasisDestroy_Magma(CeedBasis basis) { 332 int ierr; 333 CeedBasis_Magma *impl; 334 ierr = CeedBasisGetData(basis, &impl); CeedChkBackend(ierr); 335 336 ierr = magma_free(impl->dqref1d); CeedChkBackend(ierr); 337 ierr = magma_free(impl->dinterp1d); CeedChkBackend(ierr); 338 ierr = magma_free(impl->dgrad1d); CeedChkBackend(ierr); 339 ierr = magma_free(impl->dqweight1d); CeedChkBackend(ierr); 340 341 ierr = CeedFree(&impl); CeedChkBackend(ierr); 342 343 return CEED_ERROR_SUCCESS; 344 } 345 346 #ifdef __cplusplus 347 CEED_INTERN "C" 348 #endif 349 int CeedBasisDestroyNonTensor_Magma(CeedBasis basis) { 350 int ierr; 351 CeedBasisNonTensor_Magma *impl; 352 ierr = CeedBasisGetData(basis, &impl); CeedChkBackend(ierr); 353 354 ierr = magma_free(impl->dqref); CeedChkBackend(ierr); 355 ierr = magma_free(impl->dinterp); CeedChkBackend(ierr); 356 ierr = magma_free(impl->dgrad); CeedChkBackend(ierr); 357 ierr = magma_free(impl->dqweight); CeedChkBackend(ierr); 358 359 ierr = CeedFree(&impl); CeedChkBackend(ierr); 360 361 return CEED_ERROR_SUCCESS; 362 } 363 364 #ifdef __cplusplus 365 CEED_INTERN "C" 366 #endif 367 int CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P1d, CeedInt Q1d, 368 const CeedScalar *interp1d, 369 const CeedScalar *grad1d, 370 const CeedScalar *qref1d, 371 const CeedScalar *qweight1d, CeedBasis basis) { 372 int ierr; 373 CeedBasis_Magma *impl; 374 Ceed ceed; 375 ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 376 377 // Check for supported parameters 378 CeedInt ncomp = 0; 379 ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChkBackend(ierr); 380 if (ncomp > 3) 381 // LCOV_EXCL_START 382 return CeedError(ceed, CEED_ERROR_BACKEND, 383 "Magma backend does not support tensor bases with more than 3 components"); 384 // LCOV_EXCL_STOP 385 if (P1d > 10) 386 // LCOV_EXCL_START 387 return CeedError(ceed, CEED_ERROR_BACKEND, 388 "Magma backend does not support tensor bases with more than 10 nodes in each dimension"); 389 // LCOV_EXCL_STOP 390 if (Q1d > 10) 391 // LCOV_EXCL_START 392 return CeedError(ceed, CEED_ERROR_BACKEND, 393 "Magma backend does not support tensor bases with more than 10 quadrature points in each dimension"); 394 // LCOV_EXCL_STOP 395 396 Ceed_Magma *data; 397 ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr); 398 399 ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply", 400 CeedBasisApply_Magma); CeedChkBackend(ierr); 401 ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", 402 CeedBasisDestroy_Magma); CeedChkBackend(ierr); 403 404 ierr = CeedCalloc(1,&impl); CeedChkBackend(ierr); 405 ierr = CeedBasisSetData(basis, impl); CeedChkBackend(ierr); 406 407 // Copy qref1d to the GPU 408 ierr = magma_malloc((void **)&impl->dqref1d, Q1d*sizeof(qref1d[0])); 409 CeedChkBackend(ierr); 410 magma_setvector(Q1d, sizeof(qref1d[0]), qref1d, 1, impl->dqref1d, 1, 411 data->queue); 412 413 // Copy interp1d to the GPU 414 ierr = magma_malloc((void **)&impl->dinterp1d, Q1d*P1d*sizeof(interp1d[0])); 415 CeedChkBackend(ierr); 416 magma_setvector(Q1d*P1d, sizeof(interp1d[0]), interp1d, 1, impl->dinterp1d, 1, 417 data->queue); 418 419 // Copy grad1d to the GPU 420 ierr = magma_malloc((void **)&impl->dgrad1d, Q1d*P1d*sizeof(grad1d[0])); 421 CeedChkBackend(ierr); 422 magma_setvector(Q1d*P1d, sizeof(grad1d[0]), grad1d, 1, impl->dgrad1d, 1, 423 data->queue); 424 425 // Copy qweight1d to the GPU 426 ierr = magma_malloc((void **)&impl->dqweight1d, Q1d*sizeof(qweight1d[0])); 427 CeedChkBackend(ierr); 428 magma_setvector(Q1d, sizeof(qweight1d[0]), qweight1d, 1, impl->dqweight1d, 1, 429 data->queue); 430 431 return CEED_ERROR_SUCCESS; 432 } 433 434 #ifdef __cplusplus 435 CEED_INTERN "C" 436 #endif 437 int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt ndof, 438 CeedInt nqpts, const CeedScalar *interp, 439 const CeedScalar *grad, const CeedScalar *qref, 440 const CeedScalar *qweight, CeedBasis basis) { 441 int ierr; 442 CeedBasisNonTensor_Magma *impl; 443 Ceed ceed; 444 ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 445 446 Ceed_Magma *data; 447 ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr); 448 449 ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply", 450 CeedBasisApplyNonTensor_Magma); CeedChkBackend(ierr); 451 ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", 452 CeedBasisDestroyNonTensor_Magma); CeedChkBackend(ierr); 453 454 ierr = CeedCalloc(1,&impl); CeedChkBackend(ierr); 455 ierr = CeedBasisSetData(basis, impl); CeedChkBackend(ierr); 456 457 // Copy qref to the GPU 458 ierr = magma_malloc((void **)&impl->dqref, nqpts*sizeof(qref[0])); 459 CeedChkBackend(ierr); 460 magma_setvector(nqpts, sizeof(qref[0]), qref, 1, impl->dqref, 1, data->queue); 461 462 // Copy interp to the GPU 463 ierr = magma_malloc((void **)&impl->dinterp, nqpts*ndof*sizeof(interp[0])); 464 CeedChkBackend(ierr); 465 magma_setvector(nqpts*ndof, sizeof(interp[0]), interp, 1, impl->dinterp, 1, 466 data->queue); 467 468 // Copy grad to the GPU 469 ierr = magma_malloc((void **)&impl->dgrad, nqpts*ndof*dim*sizeof(grad[0])); 470 CeedChkBackend(ierr); 471 magma_setvector(nqpts*ndof*dim, sizeof(grad[0]), grad, 1, impl->dgrad, 1, 472 data->queue); 473 474 // Copy qweight to the GPU 475 ierr = magma_malloc((void **)&impl->dqweight, nqpts*sizeof(qweight[0])); 476 CeedChkBackend(ierr); 477 magma_setvector(nqpts, sizeof(qweight[0]), qweight, 1, impl->dqweight, 1, 478 data->queue); 479 480 return CEED_ERROR_SUCCESS; 481 } 482