// Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
// Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
// All Rights reserved. See files LICENSE and NOTICE for details.
//
// This file is part of CEED, a collection of benchmarks, miniapps, software
// libraries and APIs for efficient high-order finite element and spectral
// element discretizations for exascale applications. For more information and
// source code availability see http://github.com/ceed.
//
// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
// a collaborative effort of two U.S. Department of Energy organizations (Office
// of Science and the National Nuclear Security Administration) responsible for
// the planning and preparation of a capable exascale ecosystem, including
// software, applications, hardware, advanced system engineering and early
// testbed platforms, in support of the nation's exascale computing imperative.
16 17 #include "ceed-magma.h" 18 19 #ifdef __cplusplus 20 CEED_INTERN "C" 21 #endif 22 int CeedBasisApply_Magma(CeedBasis basis, CeedInt nelem, 23 CeedTransposeMode tmode, CeedEvalMode emode, 24 CeedVector U, CeedVector V) { 25 int ierr; 26 Ceed ceed; 27 ierr = CeedBasisGetCeed(basis, &ceed); CeedChk(ierr); 28 CeedInt dim, ncomp, ndof; 29 ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr); 30 ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChk(ierr); 31 ierr = CeedBasisGetNumNodes(basis, &ndof); CeedChk(ierr); 32 33 Ceed_Magma *data; 34 ierr = CeedGetData(ceed, (void *)&data); CeedChk(ierr); 35 36 const CeedScalar *u; 37 CeedScalar *v; 38 if (emode != CEED_EVAL_WEIGHT) { 39 ierr = CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &u); CeedChk(ierr); 40 } else if (emode != CEED_EVAL_WEIGHT) { 41 // LCOV_EXCL_START 42 return CeedError(ceed, 1, 43 "An input vector is required for this CeedEvalMode"); 44 // LCOV_EXCL_STOP 45 } 46 ierr = CeedVectorGetArray(V, CEED_MEM_DEVICE, &v); CeedChk(ierr); 47 48 CeedBasis_Magma *impl; 49 ierr = CeedBasisGetData(basis, (void *)&impl); CeedChk(ierr); 50 51 CeedInt P1d, Q1d; 52 ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 53 ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr); 54 55 CeedDebug("\033[01m[CeedBasisApply_Magma] vsize=%d, comp = %d", 56 ncomp*CeedIntPow(P1d, dim), ncomp); 57 58 if (tmode == CEED_TRANSPOSE) { 59 CeedInt length; 60 ierr = CeedVectorGetLength(V, &length); CeedChk(ierr); 61 magmablas_dlaset(MagmaFull, length, 1, 0., 0., v, length, data->queue); 62 ceed_magma_queue_sync( data->queue ); 63 } 64 switch (emode) { 65 case CEED_EVAL_INTERP: { 66 CeedInt P = P1d, Q = Q1d; 67 if (tmode == CEED_TRANSPOSE) { 68 P = Q1d; Q = P1d; 69 } 70 71 // Define element sizes for dofs/quad 72 CeedInt elquadsize = CeedIntPow(Q1d, dim); 73 CeedInt eldofssize = CeedIntPow(P1d, dim); 74 75 // E-vector ordering -------------- Q-vector ordering 76 // component component 77 // elem elem 78 // node node 
79 80 // --- Define strides for NOTRANSPOSE mode: --- 81 // Input (u) is E-vector, output (v) is Q-vector 82 83 // Element strides 84 CeedInt u_elstride = eldofssize; 85 CeedInt v_elstride = elquadsize; 86 // Component strides 87 CeedInt u_compstride = nelem * eldofssize; 88 CeedInt v_compstride = nelem * elquadsize; 89 90 // --- Swap strides for TRANSPOSE mode: --- 91 if (tmode == CEED_TRANSPOSE) { 92 // Input (u) is Q-vector, output (v) is E-vector 93 // Element strides 94 v_elstride = eldofssize; 95 u_elstride = elquadsize; 96 // Component strides 97 v_compstride = nelem * eldofssize; 98 u_compstride = nelem * elquadsize; 99 } 100 101 ierr = magma_interp(P, Q, dim, ncomp, 102 impl->dinterp1d, tmode, 103 u, u_elstride, u_compstride, 104 v, v_elstride, v_compstride, 105 nelem, data->basis_kernel_mode, data->maxthreads, 106 data->queue); 107 if (ierr != 0) CeedError(ceed, 1, 108 "MAGMA: launch failure detected for magma_interp"); 109 } 110 break; 111 case CEED_EVAL_GRAD: { 112 CeedInt P = P1d, Q = Q1d; 113 // In CEED_NOTRANSPOSE mode: 114 // u is (P^dim x nc), column-major layout (nc = ncomp) 115 // v is (Q^dim x nc x dim), column-major layout (nc = ncomp) 116 // In CEED_TRANSPOSE mode, the sizes of u and v are switched. 
117 if (tmode == CEED_TRANSPOSE) { 118 P = Q1d, Q = P1d; 119 } 120 121 // Define element sizes for dofs/quad 122 CeedInt elquadsize = CeedIntPow(Q1d, dim); 123 CeedInt eldofssize = CeedIntPow(P1d, dim); 124 125 // E-vector ordering -------------- Q-vector ordering 126 // dim 127 // component component 128 // elem elem 129 // node node 130 131 132 // --- Define strides for NOTRANSPOSE mode: --- 133 // Input (u) is E-vector, output (v) is Q-vector 134 135 // Element strides 136 CeedInt u_elstride = eldofssize; 137 CeedInt v_elstride = elquadsize; 138 // Component strides 139 CeedInt u_compstride = nelem * eldofssize; 140 CeedInt v_compstride = nelem * elquadsize; 141 // Dimension strides 142 CeedInt u_dimstride = 0; 143 CeedInt v_dimstride = nelem * elquadsize * ncomp; 144 145 // --- Swap strides for TRANSPOSE mode: --- 146 if (tmode == CEED_TRANSPOSE) { 147 // Input (u) is Q-vector, output (v) is E-vector 148 // Element strides 149 v_elstride = eldofssize; 150 u_elstride = elquadsize; 151 // Component strides 152 v_compstride = nelem * eldofssize; 153 u_compstride = nelem * elquadsize; 154 // Dimension strides 155 v_dimstride = 0; 156 u_dimstride = nelem * elquadsize * ncomp; 157 158 } 159 160 ierr = magma_grad( P, Q, dim, ncomp, 161 impl->dinterp1d, impl->dgrad1d, tmode, 162 u, u_elstride, u_compstride, u_dimstride, 163 v, v_elstride, v_compstride, v_dimstride, 164 nelem, data->basis_kernel_mode, data->maxthreads, 165 data->queue); 166 if (ierr != 0) CeedError(ceed, 1, 167 "MAGMA: launch failure detected for magma_grad"); 168 } 169 break; 170 case CEED_EVAL_WEIGHT: { 171 if (tmode == CEED_TRANSPOSE) 172 // LCOV_EXCL_START 173 return CeedError(ceed, 1, 174 "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE"); 175 // LCOV_EXCL_STOP 176 CeedInt Q = Q1d; 177 int eldofssize = CeedIntPow(Q, dim); 178 ierr = magma_weight(Q, dim, impl->dqweight1d, v, eldofssize, nelem, 179 data->basis_kernel_mode, data->maxthreads, data->queue); 180 if (ierr != 0) CeedError(ceed, 1, 181 
"MAGMA: launch failure detected for magma_weight"); 182 } 183 break; 184 // LCOV_EXCL_START 185 case CEED_EVAL_DIV: 186 return CeedError(ceed, 1, "CEED_EVAL_DIV not supported"); 187 case CEED_EVAL_CURL: 188 return CeedError(ceed, 1, "CEED_EVAL_CURL not supported"); 189 case CEED_EVAL_NONE: 190 return CeedError(ceed, 1, 191 "CEED_EVAL_NONE does not make sense in this context"); 192 // LCOV_EXCL_STOP 193 } 194 195 // must sync to ensure completeness 196 ceed_magma_queue_sync( data->queue ); 197 198 if (emode!=CEED_EVAL_WEIGHT) { 199 ierr = CeedVectorRestoreArrayRead(U, &u); CeedChk(ierr); 200 } 201 ierr = CeedVectorRestoreArray(V, &v); CeedChk(ierr); 202 return 0; 203 } 204 205 #ifdef __cplusplus 206 CEED_INTERN "C" 207 #endif 208 int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt nelem, 209 CeedTransposeMode tmode, CeedEvalMode emode, 210 CeedVector U, CeedVector V) { 211 int ierr; 212 Ceed ceed; 213 ierr = CeedBasisGetCeed(basis, &ceed); CeedChk(ierr); 214 215 Ceed_Magma *data; 216 ierr = CeedGetData(ceed, (void *)&data); CeedChk(ierr); 217 218 CeedInt dim, ncomp, ndof, nqpt; 219 ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr); 220 ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChk(ierr); 221 ierr = CeedBasisGetNumNodes(basis, &ndof); CeedChk(ierr); 222 ierr = CeedBasisGetNumQuadraturePoints(basis, &nqpt); CeedChk(ierr); 223 const CeedScalar *du; 224 CeedScalar *dv; 225 if (emode != CEED_EVAL_WEIGHT) { 226 ierr = CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &du); CeedChk(ierr); 227 } else if (emode != CEED_EVAL_WEIGHT) { 228 // LCOV_EXCL_START 229 return CeedError(ceed, 1, 230 "An input vector is required for this CeedEvalMode"); 231 // LCOV_EXCL_STOP 232 } 233 ierr = CeedVectorGetArray(V, CEED_MEM_DEVICE, &dv); CeedChk(ierr); 234 235 CeedBasisNonTensor_Magma *impl; 236 ierr = CeedBasisGetData(basis, (void *)&impl); CeedChk(ierr); 237 238 CeedDebug("\033[01m[CeedBasisApplyNonTensor_Magma] vsize=%d, comp = %d", 239 ncomp*ndof, ncomp); 240 241 if 
(tmode == CEED_TRANSPOSE) { 242 CeedInt length; 243 ierr = CeedVectorGetLength(V, &length); 244 magmablas_dlaset(MagmaFull, length, 1, 0., 0., dv, length, data->queue); 245 ceed_magma_queue_sync( data->queue ); 246 } 247 switch (emode) { 248 case CEED_EVAL_INTERP: { 249 CeedInt P = ndof, Q = nqpt; 250 if (tmode == CEED_TRANSPOSE) 251 magma_dgemm_nontensor(MagmaNoTrans, MagmaNoTrans, 252 P, nelem*ncomp, Q, 253 1.0, impl->dinterp, P, 254 du, Q, 255 0.0, dv, P, data->queue); 256 else 257 magma_dgemm_nontensor(MagmaTrans, MagmaNoTrans, 258 Q, nelem*ncomp, P, 259 1.0, impl->dinterp, P, 260 du, P, 261 0.0, dv, Q, data->queue); 262 } 263 break; 264 265 case CEED_EVAL_GRAD: { 266 CeedInt P = ndof, Q = nqpt; 267 if (tmode == CEED_TRANSPOSE) { 268 double beta = 0.0; 269 for(int d=0; d<dim; d++) { 270 if (d>0) 271 beta = 1.0; 272 magma_dgemm_nontensor(MagmaNoTrans, MagmaNoTrans, 273 P, nelem*ncomp, Q, 274 1.0, impl->dgrad + d*P*Q, P, 275 du + d*nelem*ncomp*Q, Q, 276 beta, dv, P, data->queue); 277 } 278 } else { 279 for(int d=0; d< dim; d++) 280 magma_dgemm_nontensor(MagmaTrans, MagmaNoTrans, 281 Q, nelem*ncomp, P, 282 1.0, impl->dgrad + d*P*Q, P, 283 du, P, 284 0.0, dv + d*nelem*ncomp*Q, Q, data->queue); 285 } 286 } 287 break; 288 289 case CEED_EVAL_WEIGHT: { 290 if (tmode == CEED_TRANSPOSE) 291 // LCOV_EXCL_START 292 return CeedError(ceed, 1, 293 "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE"); 294 // LCOV_EXCL_STOP 295 296 int elemsPerBlock = 1;//basis->Q1d < 7 ? optElems[basis->Q1d] : 1; 297 int grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem)? 
298 1 : 0 ); 299 magma_weight_nontensor(grid, nqpt, nelem, nqpt, impl->dqweight, dv, 300 data->queue); 301 CeedChk(ierr); 302 } 303 break; 304 305 // LCOV_EXCL_START 306 case CEED_EVAL_DIV: 307 return CeedError(ceed, 1, "CEED_EVAL_DIV not supported"); 308 case CEED_EVAL_CURL: 309 return CeedError(ceed, 1, "CEED_EVAL_CURL not supported"); 310 case CEED_EVAL_NONE: 311 return CeedError(ceed, 1, 312 "CEED_EVAL_NONE does not make sense in this context"); 313 // LCOV_EXCL_STOP 314 } 315 316 // must sync to ensure completeness 317 ceed_magma_queue_sync( data->queue ); 318 319 if (emode!=CEED_EVAL_WEIGHT) { 320 ierr = CeedVectorRestoreArrayRead(U, &du); CeedChk(ierr); 321 } 322 ierr = CeedVectorRestoreArray(V, &dv); CeedChk(ierr); 323 return 0; 324 } 325 326 #ifdef __cplusplus 327 CEED_INTERN "C" 328 #endif 329 int CeedBasisDestroy_Magma(CeedBasis basis) { 330 int ierr; 331 CeedBasis_Magma *impl; 332 ierr = CeedBasisGetData(basis, (void *)&impl); CeedChk(ierr); 333 334 ierr = magma_free(impl->dqref1d); CeedChk(ierr); 335 ierr = magma_free(impl->dinterp1d); CeedChk(ierr); 336 ierr = magma_free(impl->dgrad1d); CeedChk(ierr); 337 ierr = magma_free(impl->dqweight1d); CeedChk(ierr); 338 339 ierr = CeedFree(&impl); CeedChk(ierr); 340 341 return 0; 342 } 343 344 #ifdef __cplusplus 345 CEED_INTERN "C" 346 #endif 347 int CeedBasisDestroyNonTensor_Magma(CeedBasis basis) { 348 int ierr; 349 CeedBasisNonTensor_Magma *impl; 350 ierr = CeedBasisGetData(basis, (void *)&impl); CeedChk(ierr); 351 352 ierr = magma_free(impl->dqref); CeedChk(ierr); 353 ierr = magma_free(impl->dinterp); CeedChk(ierr); 354 ierr = magma_free(impl->dgrad); CeedChk(ierr); 355 ierr = magma_free(impl->dqweight); CeedChk(ierr); 356 357 ierr = CeedFree(&impl); CeedChk(ierr); 358 359 return 0; 360 } 361 362 #ifdef __cplusplus 363 CEED_INTERN "C" 364 #endif 365 int CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P1d, CeedInt Q1d, 366 const CeedScalar *interp1d, 367 const CeedScalar *grad1d, 368 const CeedScalar 
*qref1d, 369 const CeedScalar *qweight1d, CeedBasis basis) { 370 int ierr; 371 CeedBasis_Magma *impl; 372 Ceed ceed; 373 ierr = CeedBasisGetCeed(basis, &ceed); CeedChk(ierr); 374 375 Ceed_Magma *data; 376 ierr = CeedGetData(ceed, (void *)&data); CeedChk(ierr); 377 378 ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply", 379 CeedBasisApply_Magma); CeedChk(ierr); 380 ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", 381 CeedBasisDestroy_Magma); CeedChk(ierr); 382 383 ierr = CeedCalloc(1,&impl); CeedChk(ierr); 384 ierr = CeedBasisSetData(basis, (void *)&impl); CeedChk(ierr); 385 386 // Copy qref1d to the GPU 387 ierr = magma_malloc((void **)&impl->dqref1d, Q1d*sizeof(qref1d[0])); 388 CeedChk(ierr); 389 magma_setvector(Q1d, sizeof(qref1d[0]), qref1d, 1, impl->dqref1d, 1, 390 data->queue); 391 392 // Copy interp1d to the GPU 393 ierr = magma_malloc((void **)&impl->dinterp1d, Q1d*P1d*sizeof(interp1d[0])); 394 CeedChk(ierr); 395 magma_setvector(Q1d*P1d, sizeof(interp1d[0]), interp1d, 1, impl->dinterp1d, 1, 396 data->queue); 397 398 // Copy grad1d to the GPU 399 ierr = magma_malloc((void **)&impl->dgrad1d, Q1d*P1d*sizeof(grad1d[0])); 400 CeedChk(ierr); 401 magma_setvector(Q1d*P1d, sizeof(grad1d[0]), grad1d, 1, impl->dgrad1d, 1, 402 data->queue); 403 404 // Copy qweight1d to the GPU 405 ierr = magma_malloc((void **)&impl->dqweight1d, Q1d*sizeof(qweight1d[0])); 406 CeedChk(ierr); 407 magma_setvector(Q1d, sizeof(qweight1d[0]), qweight1d, 1, impl->dqweight1d, 1, 408 data->queue); 409 410 return 0; 411 } 412 413 #ifdef __cplusplus 414 CEED_INTERN "C" 415 #endif 416 int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt ndof, 417 CeedInt nqpts, const CeedScalar *interp, 418 const CeedScalar *grad, const CeedScalar *qref, 419 const CeedScalar *qweight, CeedBasis basis) { 420 int ierr; 421 CeedBasisNonTensor_Magma *impl; 422 Ceed ceed; 423 ierr = CeedBasisGetCeed(basis, &ceed); CeedChk(ierr); 424 425 Ceed_Magma *data; 426 ierr = 
CeedGetData(ceed, (void *)&data); CeedChk(ierr); 427 428 ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply", 429 CeedBasisApplyNonTensor_Magma); CeedChk(ierr); 430 ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", 431 CeedBasisDestroyNonTensor_Magma); CeedChk(ierr); 432 433 ierr = CeedCalloc(1,&impl); CeedChk(ierr); 434 ierr = CeedBasisSetData(basis, (void *)&impl); CeedChk(ierr); 435 436 // Copy qref to the GPU 437 ierr = magma_malloc((void **)&impl->dqref, nqpts*sizeof(qref[0])); 438 CeedChk(ierr); 439 magma_setvector(nqpts, sizeof(qref[0]), qref, 1, impl->dqref, 1, data->queue); 440 441 // Copy interp to the GPU 442 ierr = magma_malloc((void **)&impl->dinterp, nqpts*ndof*sizeof(interp[0])); 443 CeedChk(ierr); 444 magma_setvector(nqpts*ndof, sizeof(interp[0]), interp, 1, impl->dinterp, 1, 445 data->queue); 446 447 // Copy grad to the GPU 448 ierr = magma_malloc((void **)&impl->dgrad, nqpts*ndof*dim*sizeof(grad[0])); 449 CeedChk(ierr); 450 magma_setvector(nqpts*ndof*dim, sizeof(grad[0]), grad, 1, impl->dgrad, 1, 451 data->queue); 452 453 // Copy qweight to the GPU 454 ierr = magma_malloc((void **)&impl->dqweight, nqpts*sizeof(qweight[0])); 455 CeedChk(ierr); 456 magma_setvector(nqpts, sizeof(qweight[0]), qweight, 1, impl->dqweight, 1, 457 data->queue); 458 459 return 0; 460 } 461