// Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
// Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
// All Rights reserved. See files LICENSE and NOTICE for details.
//
// This file is part of CEED, a collection of benchmarks, miniapps, software
// libraries and APIs for efficient high-order finite element and spectral
// element discretizations for exascale applications. For more information and
// source code availability see http://github.com/ceed.
//
// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
// a collaborative effort of two U.S. Department of Energy organizations (Office
// of Science and the National Nuclear Security Administration) responsible for
// the planning and preparation of a capable exascale ecosystem, including
// software, applications, hardware, advanced system engineering and early
// testbed platforms, in support of the nation's exascale computing imperative.
16 17 #include <ceed/ceed.h> 18 #include <ceed/backend.h> 19 #include "ceed-magma.h" 20 21 #ifdef __cplusplus 22 CEED_INTERN "C" 23 #endif 24 int CeedBasisApply_Magma(CeedBasis basis, CeedInt nelem, 25 CeedTransposeMode tmode, CeedEvalMode emode, 26 CeedVector U, CeedVector V) { 27 int ierr; 28 Ceed ceed; 29 ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 30 CeedInt dim, ncomp, ndof; 31 ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); 32 ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChkBackend(ierr); 33 ierr = CeedBasisGetNumNodes(basis, &ndof); CeedChkBackend(ierr); 34 35 Ceed_Magma *data; 36 ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr); 37 38 const CeedScalar *u; 39 CeedScalar *v; 40 if (emode != CEED_EVAL_WEIGHT) { 41 ierr = CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &u); CeedChkBackend(ierr); 42 } else if (emode != CEED_EVAL_WEIGHT) { 43 // LCOV_EXCL_START 44 return CeedError(ceed, CEED_ERROR_BACKEND, 45 "An input vector is required for this CeedEvalMode"); 46 // LCOV_EXCL_STOP 47 } 48 ierr = CeedVectorGetArray(V, CEED_MEM_DEVICE, &v); CeedChkBackend(ierr); 49 50 CeedBasis_Magma *impl; 51 ierr = CeedBasisGetData(basis, &impl); CeedChkBackend(ierr); 52 53 CeedInt P1d, Q1d; 54 ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr); 55 ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr); 56 57 CeedDebug("\033[01m[CeedBasisApply_Magma] vsize=%d, comp = %d", 58 ncomp*CeedIntPow(P1d, dim), ncomp); 59 60 if (tmode == CEED_TRANSPOSE) { 61 CeedInt length; 62 ierr = CeedVectorGetLength(V, &length); CeedChkBackend(ierr); 63 magmablas_dlaset(MagmaFull, length, 1, 0., 0., v, length, data->queue); 64 ceed_magma_queue_sync( data->queue ); 65 } 66 switch (emode) { 67 case CEED_EVAL_INTERP: { 68 CeedInt P = P1d, Q = Q1d; 69 if (tmode == CEED_TRANSPOSE) { 70 P = Q1d; Q = P1d; 71 } 72 73 // Define element sizes for dofs/quad 74 CeedInt elquadsize = CeedIntPow(Q1d, dim); 75 CeedInt eldofssize = 
CeedIntPow(P1d, dim); 76 77 // E-vector ordering -------------- Q-vector ordering 78 // component component 79 // elem elem 80 // node node 81 82 // --- Define strides for NOTRANSPOSE mode: --- 83 // Input (u) is E-vector, output (v) is Q-vector 84 85 // Element strides 86 CeedInt u_elstride = eldofssize; 87 CeedInt v_elstride = elquadsize; 88 // Component strides 89 CeedInt u_compstride = nelem * eldofssize; 90 CeedInt v_compstride = nelem * elquadsize; 91 92 // --- Swap strides for TRANSPOSE mode: --- 93 if (tmode == CEED_TRANSPOSE) { 94 // Input (u) is Q-vector, output (v) is E-vector 95 // Element strides 96 v_elstride = eldofssize; 97 u_elstride = elquadsize; 98 // Component strides 99 v_compstride = nelem * eldofssize; 100 u_compstride = nelem * elquadsize; 101 } 102 103 ierr = magma_interp(P, Q, dim, ncomp, 104 impl->dinterp1d, tmode, 105 u, u_elstride, u_compstride, 106 v, v_elstride, v_compstride, 107 nelem, data->basis_kernel_mode, data->maxthreads, 108 data->queue); 109 if (ierr != 0) CeedError(ceed, CEED_ERROR_BACKEND, 110 "MAGMA: launch failure detected for magma_interp"); 111 } 112 break; 113 case CEED_EVAL_GRAD: { 114 CeedInt P = P1d, Q = Q1d; 115 // In CEED_NOTRANSPOSE mode: 116 // u is (P^dim x nc), column-major layout (nc = ncomp) 117 // v is (Q^dim x nc x dim), column-major layout (nc = ncomp) 118 // In CEED_TRANSPOSE mode, the sizes of u and v are switched. 
119 if (tmode == CEED_TRANSPOSE) { 120 P = Q1d, Q = P1d; 121 } 122 123 // Define element sizes for dofs/quad 124 CeedInt elquadsize = CeedIntPow(Q1d, dim); 125 CeedInt eldofssize = CeedIntPow(P1d, dim); 126 127 // E-vector ordering -------------- Q-vector ordering 128 // dim 129 // component component 130 // elem elem 131 // node node 132 133 // --- Define strides for NOTRANSPOSE mode: --- 134 // Input (u) is E-vector, output (v) is Q-vector 135 136 // Element strides 137 CeedInt u_elstride = eldofssize; 138 CeedInt v_elstride = elquadsize; 139 // Component strides 140 CeedInt u_compstride = nelem * eldofssize; 141 CeedInt v_compstride = nelem * elquadsize; 142 // Dimension strides 143 CeedInt u_dimstride = 0; 144 CeedInt v_dimstride = nelem * elquadsize * ncomp; 145 146 // --- Swap strides for TRANSPOSE mode: --- 147 if (tmode == CEED_TRANSPOSE) { 148 // Input (u) is Q-vector, output (v) is E-vector 149 // Element strides 150 v_elstride = eldofssize; 151 u_elstride = elquadsize; 152 // Component strides 153 v_compstride = nelem * eldofssize; 154 u_compstride = nelem * elquadsize; 155 // Dimension strides 156 v_dimstride = 0; 157 u_dimstride = nelem * elquadsize * ncomp; 158 159 } 160 161 ierr = magma_grad( P, Q, dim, ncomp, 162 impl->dinterp1d, impl->dgrad1d, tmode, 163 u, u_elstride, u_compstride, u_dimstride, 164 v, v_elstride, v_compstride, v_dimstride, 165 nelem, data->basis_kernel_mode, data->maxthreads, 166 data->queue); 167 if (ierr != 0) CeedError(ceed, CEED_ERROR_BACKEND, 168 "MAGMA: launch failure detected for magma_grad"); 169 } 170 break; 171 case CEED_EVAL_WEIGHT: { 172 if (tmode == CEED_TRANSPOSE) 173 // LCOV_EXCL_START 174 return CeedError(ceed, CEED_ERROR_BACKEND, 175 "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE"); 176 // LCOV_EXCL_STOP 177 CeedInt Q = Q1d; 178 int eldofssize = CeedIntPow(Q, dim); 179 ierr = magma_weight(Q, dim, impl->dqweight1d, v, eldofssize, nelem, 180 data->basis_kernel_mode, data->maxthreads, data->queue); 181 if (ierr 
!= 0) CeedError(ceed, CEED_ERROR_BACKEND, 182 "MAGMA: launch failure detected for magma_weight"); 183 } 184 break; 185 // LCOV_EXCL_START 186 case CEED_EVAL_DIV: 187 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); 188 case CEED_EVAL_CURL: 189 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); 190 case CEED_EVAL_NONE: 191 return CeedError(ceed, CEED_ERROR_BACKEND, 192 "CEED_EVAL_NONE does not make sense in this context"); 193 // LCOV_EXCL_STOP 194 } 195 196 // must sync to ensure completeness 197 ceed_magma_queue_sync( data->queue ); 198 199 if (emode!=CEED_EVAL_WEIGHT) { 200 ierr = CeedVectorRestoreArrayRead(U, &u); CeedChkBackend(ierr); 201 } 202 ierr = CeedVectorRestoreArray(V, &v); CeedChkBackend(ierr); 203 return CEED_ERROR_SUCCESS; 204 } 205 206 #ifdef __cplusplus 207 CEED_INTERN "C" 208 #endif 209 int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt nelem, 210 CeedTransposeMode tmode, CeedEvalMode emode, 211 CeedVector U, CeedVector V) { 212 int ierr; 213 Ceed ceed; 214 ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 215 216 Ceed_Magma *data; 217 ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr); 218 219 CeedInt dim, ncomp, ndof, nqpt; 220 ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); 221 ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChkBackend(ierr); 222 ierr = CeedBasisGetNumNodes(basis, &ndof); CeedChkBackend(ierr); 223 ierr = CeedBasisGetNumQuadraturePoints(basis, &nqpt); CeedChkBackend(ierr); 224 const CeedScalar *du; 225 CeedScalar *dv; 226 if (emode != CEED_EVAL_WEIGHT) { 227 ierr = CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &du); CeedChkBackend(ierr); 228 } else if (emode != CEED_EVAL_WEIGHT) { 229 // LCOV_EXCL_START 230 return CeedError(ceed, CEED_ERROR_BACKEND, 231 "An input vector is required for this CeedEvalMode"); 232 // LCOV_EXCL_STOP 233 } 234 ierr = CeedVectorGetArray(V, CEED_MEM_DEVICE, &dv); CeedChkBackend(ierr); 235 236 
CeedBasisNonTensor_Magma *impl; 237 ierr = CeedBasisGetData(basis, &impl); CeedChkBackend(ierr); 238 239 CeedDebug("\033[01m[CeedBasisApplyNonTensor_Magma] vsize=%d, comp = %d", 240 ncomp*ndof, ncomp); 241 242 if (tmode == CEED_TRANSPOSE) { 243 CeedInt length; 244 ierr = CeedVectorGetLength(V, &length); 245 magmablas_dlaset(MagmaFull, length, 1, 0., 0., dv, length, data->queue); 246 ceed_magma_queue_sync( data->queue ); 247 } 248 switch (emode) { 249 case CEED_EVAL_INTERP: { 250 CeedInt P = ndof, Q = nqpt; 251 if (tmode == CEED_TRANSPOSE) 252 magma_dgemm_nontensor(MagmaNoTrans, MagmaNoTrans, 253 P, nelem*ncomp, Q, 254 1.0, impl->dinterp, P, 255 du, Q, 256 0.0, dv, P, data->queue); 257 else 258 magma_dgemm_nontensor(MagmaTrans, MagmaNoTrans, 259 Q, nelem*ncomp, P, 260 1.0, impl->dinterp, P, 261 du, P, 262 0.0, dv, Q, data->queue); 263 } 264 break; 265 266 case CEED_EVAL_GRAD: { 267 CeedInt P = ndof, Q = nqpt; 268 if (tmode == CEED_TRANSPOSE) { 269 double beta = 0.0; 270 for(int d=0; d<dim; d++) { 271 if (d>0) 272 beta = 1.0; 273 magma_dgemm_nontensor(MagmaNoTrans, MagmaNoTrans, 274 P, nelem*ncomp, Q, 275 1.0, impl->dgrad + d*P*Q, P, 276 du + d*nelem*ncomp*Q, Q, 277 beta, dv, P, data->queue); 278 } 279 } else { 280 for(int d=0; d< dim; d++) 281 magma_dgemm_nontensor(MagmaTrans, MagmaNoTrans, 282 Q, nelem*ncomp, P, 283 1.0, impl->dgrad + d*P*Q, P, 284 du, P, 285 0.0, dv + d*nelem*ncomp*Q, Q, data->queue); 286 } 287 } 288 break; 289 290 case CEED_EVAL_WEIGHT: { 291 if (tmode == CEED_TRANSPOSE) 292 // LCOV_EXCL_START 293 return CeedError(ceed, CEED_ERROR_BACKEND, 294 "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE"); 295 // LCOV_EXCL_STOP 296 297 int elemsPerBlock = 1;//basis->Q1d < 7 ? optElems[basis->Q1d] : 1; 298 int grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem)? 
299 1 : 0 ); 300 magma_weight_nontensor(grid, nqpt, nelem, nqpt, impl->dqweight, dv, 301 data->queue); 302 CeedChkBackend(ierr); 303 } 304 break; 305 306 // LCOV_EXCL_START 307 case CEED_EVAL_DIV: 308 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); 309 case CEED_EVAL_CURL: 310 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); 311 case CEED_EVAL_NONE: 312 return CeedError(ceed, CEED_ERROR_BACKEND, 313 "CEED_EVAL_NONE does not make sense in this context"); 314 // LCOV_EXCL_STOP 315 } 316 317 // must sync to ensure completeness 318 ceed_magma_queue_sync( data->queue ); 319 320 if (emode!=CEED_EVAL_WEIGHT) { 321 ierr = CeedVectorRestoreArrayRead(U, &du); CeedChkBackend(ierr); 322 } 323 ierr = CeedVectorRestoreArray(V, &dv); CeedChkBackend(ierr); 324 return CEED_ERROR_SUCCESS; 325 } 326 327 #ifdef __cplusplus 328 CEED_INTERN "C" 329 #endif 330 int CeedBasisDestroy_Magma(CeedBasis basis) { 331 int ierr; 332 CeedBasis_Magma *impl; 333 ierr = CeedBasisGetData(basis, &impl); CeedChkBackend(ierr); 334 335 ierr = magma_free(impl->dqref1d); CeedChkBackend(ierr); 336 ierr = magma_free(impl->dinterp1d); CeedChkBackend(ierr); 337 ierr = magma_free(impl->dgrad1d); CeedChkBackend(ierr); 338 ierr = magma_free(impl->dqweight1d); CeedChkBackend(ierr); 339 340 ierr = CeedFree(&impl); CeedChkBackend(ierr); 341 342 return CEED_ERROR_SUCCESS; 343 } 344 345 #ifdef __cplusplus 346 CEED_INTERN "C" 347 #endif 348 int CeedBasisDestroyNonTensor_Magma(CeedBasis basis) { 349 int ierr; 350 CeedBasisNonTensor_Magma *impl; 351 ierr = CeedBasisGetData(basis, &impl); CeedChkBackend(ierr); 352 353 ierr = magma_free(impl->dqref); CeedChkBackend(ierr); 354 ierr = magma_free(impl->dinterp); CeedChkBackend(ierr); 355 ierr = magma_free(impl->dgrad); CeedChkBackend(ierr); 356 ierr = magma_free(impl->dqweight); CeedChkBackend(ierr); 357 358 ierr = CeedFree(&impl); CeedChkBackend(ierr); 359 360 return CEED_ERROR_SUCCESS; 361 } 362 363 #ifdef __cplusplus 364 
CEED_INTERN "C" 365 #endif 366 int CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P1d, CeedInt Q1d, 367 const CeedScalar *interp1d, 368 const CeedScalar *grad1d, 369 const CeedScalar *qref1d, 370 const CeedScalar *qweight1d, CeedBasis basis) { 371 int ierr; 372 CeedBasis_Magma *impl; 373 Ceed ceed; 374 ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 375 376 // Check for supported parameters 377 CeedInt ncomp = 0; 378 ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChkBackend(ierr); 379 if (ncomp > 3) 380 // LCOV_EXCL_START 381 return CeedError(ceed, CEED_ERROR_BACKEND, 382 "Magma backend does not support tensor bases with more than 3 components"); 383 // LCOV_EXCL_STOP 384 if (P1d > 10) 385 // LCOV_EXCL_START 386 return CeedError(ceed, CEED_ERROR_BACKEND, 387 "Magma backend does not support tensor bases with more than 10 nodes in each dimension"); 388 // LCOV_EXCL_STOP 389 if (Q1d > 10) 390 // LCOV_EXCL_START 391 return CeedError(ceed, CEED_ERROR_BACKEND, 392 "Magma backend does not support tensor bases with more than 10 quadrature points in each dimension"); 393 // LCOV_EXCL_STOP 394 395 Ceed_Magma *data; 396 ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr); 397 398 ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply", 399 CeedBasisApply_Magma); CeedChkBackend(ierr); 400 ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", 401 CeedBasisDestroy_Magma); CeedChkBackend(ierr); 402 403 ierr = CeedCalloc(1,&impl); CeedChkBackend(ierr); 404 ierr = CeedBasisSetData(basis, impl); CeedChkBackend(ierr); 405 406 // Copy qref1d to the GPU 407 ierr = magma_malloc((void **)&impl->dqref1d, Q1d*sizeof(qref1d[0])); 408 CeedChkBackend(ierr); 409 magma_setvector(Q1d, sizeof(qref1d[0]), qref1d, 1, impl->dqref1d, 1, 410 data->queue); 411 412 // Copy interp1d to the GPU 413 ierr = magma_malloc((void **)&impl->dinterp1d, Q1d*P1d*sizeof(interp1d[0])); 414 CeedChkBackend(ierr); 415 magma_setvector(Q1d*P1d, sizeof(interp1d[0]), interp1d, 1, 
impl->dinterp1d, 1, 416 data->queue); 417 418 // Copy grad1d to the GPU 419 ierr = magma_malloc((void **)&impl->dgrad1d, Q1d*P1d*sizeof(grad1d[0])); 420 CeedChkBackend(ierr); 421 magma_setvector(Q1d*P1d, sizeof(grad1d[0]), grad1d, 1, impl->dgrad1d, 1, 422 data->queue); 423 424 // Copy qweight1d to the GPU 425 ierr = magma_malloc((void **)&impl->dqweight1d, Q1d*sizeof(qweight1d[0])); 426 CeedChkBackend(ierr); 427 magma_setvector(Q1d, sizeof(qweight1d[0]), qweight1d, 1, impl->dqweight1d, 1, 428 data->queue); 429 430 return CEED_ERROR_SUCCESS; 431 } 432 433 #ifdef __cplusplus 434 CEED_INTERN "C" 435 #endif 436 int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt ndof, 437 CeedInt nqpts, const CeedScalar *interp, 438 const CeedScalar *grad, const CeedScalar *qref, 439 const CeedScalar *qweight, CeedBasis basis) { 440 int ierr; 441 CeedBasisNonTensor_Magma *impl; 442 Ceed ceed; 443 ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 444 445 Ceed_Magma *data; 446 ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr); 447 448 ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply", 449 CeedBasisApplyNonTensor_Magma); CeedChkBackend(ierr); 450 ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", 451 CeedBasisDestroyNonTensor_Magma); CeedChkBackend(ierr); 452 453 ierr = CeedCalloc(1,&impl); CeedChkBackend(ierr); 454 ierr = CeedBasisSetData(basis, impl); CeedChkBackend(ierr); 455 456 // Copy qref to the GPU 457 ierr = magma_malloc((void **)&impl->dqref, nqpts*sizeof(qref[0])); 458 CeedChkBackend(ierr); 459 magma_setvector(nqpts, sizeof(qref[0]), qref, 1, impl->dqref, 1, data->queue); 460 461 // Copy interp to the GPU 462 ierr = magma_malloc((void **)&impl->dinterp, nqpts*ndof*sizeof(interp[0])); 463 CeedChkBackend(ierr); 464 magma_setvector(nqpts*ndof, sizeof(interp[0]), interp, 1, impl->dinterp, 1, 465 data->queue); 466 467 // Copy grad to the GPU 468 ierr = magma_malloc((void **)&impl->dgrad, nqpts*ndof*dim*sizeof(grad[0])); 469 
CeedChkBackend(ierr); 470 magma_setvector(nqpts*ndof*dim, sizeof(grad[0]), grad, 1, impl->dgrad, 1, 471 data->queue); 472 473 // Copy qweight to the GPU 474 ierr = magma_malloc((void **)&impl->dqweight, nqpts*sizeof(qweight[0])); 475 CeedChkBackend(ierr); 476 magma_setvector(nqpts, sizeof(qweight[0]), qweight, 1, impl->dqweight, 1, 477 data->queue); 478 479 return CEED_ERROR_SUCCESS; 480 } 481