1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 #include <ceed/ceed.h> 9 #include <ceed/backend.h> 10 #include <ceed/jit-tools.h> 11 #include <string.h> 12 #include "ceed-magma.h" 13 #ifdef CEED_MAGMA_USE_HIP 14 #include "../hip/ceed-hip-common.h" 15 #include "../hip/ceed-hip-compile.h" 16 #else 17 #include "../cuda/ceed-cuda-common.h" 18 #include "../cuda/ceed-cuda-compile.h" 19 #endif 20 21 #ifdef __cplusplus 22 CEED_INTERN "C" 23 #endif 24 int CeedBasisApply_Magma(CeedBasis basis, CeedInt nelem, 25 CeedTransposeMode tmode, CeedEvalMode emode, 26 CeedVector U, CeedVector V) { 27 int ierr; 28 Ceed ceed; 29 ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 30 CeedInt dim, ncomp, ndof; 31 ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); 32 ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChkBackend(ierr); 33 ierr = CeedBasisGetNumNodes(basis, &ndof); CeedChkBackend(ierr); 34 35 Ceed_Magma *data; 36 ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr); 37 38 const CeedScalar *u; 39 CeedScalar *v; 40 if (emode != CEED_EVAL_WEIGHT) { 41 ierr = CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &u); CeedChkBackend(ierr); 42 } else if (emode != CEED_EVAL_WEIGHT) { 43 // LCOV_EXCL_START 44 return CeedError(ceed, CEED_ERROR_BACKEND, 45 "An input vector is required for this CeedEvalMode"); 46 // LCOV_EXCL_STOP 47 } 48 ierr = CeedVectorGetArrayWrite(V, CEED_MEM_DEVICE, &v); CeedChkBackend(ierr); 49 50 CeedBasis_Magma *impl; 51 ierr = CeedBasisGetData(basis, &impl); CeedChkBackend(ierr); 52 53 CeedInt P1d, Q1d; 54 ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr); 55 ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr); 56 57 CeedDebug(ceed, "\033[01m[CeedBasisApply_Magma] vsize=%" CeedInt_FMT 58 ", comp = %" CeedInt_FMT, ncomp*CeedIntPow(P1d, dim), ncomp); 59 60 if (tmode == CEED_TRANSPOSE) { 61 CeedSize length; 62 ierr = CeedVectorGetLength(V, &length); CeedChkBackend(ierr); 63 if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) { 64 magmablas_slaset(MagmaFull, length, 1, 0., 0., (float *) v, length, 65 data->queue); 66 } else { 67 magmablas_dlaset(MagmaFull, length, 1, 0., 0., (double *) v, length, 68 data->queue); 69 } 70 ceed_magma_queue_sync( data->queue ); 71 } 72 73 switch (emode) { 74 case CEED_EVAL_INTERP: { 75 CeedInt P = P1d, Q = Q1d; 76 if (tmode == CEED_TRANSPOSE) { 77 P = Q1d; Q = P1d; 78 } 79 80 // Define element sizes for dofs/quad 81 CeedInt elquadsize = CeedIntPow(Q1d, dim); 82 CeedInt eldofssize = CeedIntPow(P1d, dim); 83 84 // E-vector ordering -------------- Q-vector ordering 85 // component component 86 // elem elem 87 // node node 88 89 // --- Define strides for NOTRANSPOSE mode: --- 90 // Input (u) is E-vector, output (v) is Q-vector 91 92 // Element strides 93 CeedInt u_elstride = eldofssize; 94 CeedInt v_elstride = elquadsize; 95 // Component strides 96 CeedInt u_compstride = nelem * eldofssize; 97 CeedInt v_compstride = nelem * elquadsize; 98 99 // --- Swap strides for TRANSPOSE mode: --- 100 if (tmode == CEED_TRANSPOSE) { 101 // Input (u) is Q-vector, output (v) is E-vector 102 // Element strides 103 v_elstride = eldofssize; 104 u_elstride = elquadsize; 105 // Component strides 106 v_compstride = nelem * eldofssize; 107 u_compstride = nelem * elquadsize; 108 } 109 110 CeedInt nthreads = 1; 111 CeedInt ntcol = 1; 112 CeedInt shmem = 0; 113 CeedInt maxPQ = CeedIntMax(P, Q); 114 115 switch (dim) { 116 case 1: 117 nthreads = maxPQ; 118 ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_1D); 119 shmem += sizeof(CeedScalar) * ntcol * ( ncomp * (1*P + 1*Q) ); 120 shmem += sizeof(CeedScalar) * (P*Q); 121 break; 122 case 2: 123 nthreads = maxPQ; 124 ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_2D); 125 shmem += P*Q *sizeof(CeedScalar); // for sT 126 shmem += ntcol * ( P*maxPQ*sizeof( 127 CeedScalar) ); // for reforming rU we need PxP, and for the intermediate output we need PxQ 128 break; 129 case 3: 130 nthreads = maxPQ*maxPQ; 131 ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_3D); 132 shmem += sizeof(CeedScalar)* (P*Q); // for sT 133 shmem += sizeof(CeedScalar)* ntcol * (CeedIntMax(P*P*maxPQ, 134 P*Q*Q)); // rU needs P^2xP, the intermediate output needs max(P^2xQ,PQ^2) 135 } 136 CeedInt grid = (nelem + ntcol-1) / ntcol; 137 void *args[] = {&impl->dinterp1d, 138 &u, &u_elstride, &u_compstride, 139 &v, &v_elstride, &v_compstride, 140 &nelem 141 }; 142 143 if (tmode == CEED_TRANSPOSE) { 144 ierr = CeedRunKernelDimSharedMagma(ceed, impl->magma_interp_tr, grid, 145 nthreads, ntcol, 1, shmem, 146 args); CeedChkBackend(ierr); 147 } else { 148 ierr = CeedRunKernelDimSharedMagma(ceed, impl->magma_interp, grid, 149 nthreads, ntcol, 1, shmem, 150 args); CeedChkBackend(ierr); 151 } 152 } 153 break; 154 case CEED_EVAL_GRAD: { 155 CeedInt P = P1d, Q = Q1d; 156 // In CEED_NOTRANSPOSE mode: 157 // u is (P^dim x nc), column-major layout (nc = ncomp) 158 // v is (Q^dim x nc x dim), column-major layout (nc = ncomp) 159 // In CEED_TRANSPOSE mode, the sizes of u and v are switched. 160 if (tmode == CEED_TRANSPOSE) { 161 P = Q1d, Q = P1d; 162 } 163 164 // Define element sizes for dofs/quad 165 CeedInt elquadsize = CeedIntPow(Q1d, dim); 166 CeedInt eldofssize = CeedIntPow(P1d, dim); 167 168 // E-vector ordering -------------- Q-vector ordering 169 // dim 170 // component component 171 // elem elem 172 // node node 173 174 // --- Define strides for NOTRANSPOSE mode: --- 175 // Input (u) is E-vector, output (v) is Q-vector 176 177 // Element strides 178 CeedInt u_elstride = eldofssize; 179 CeedInt v_elstride = elquadsize; 180 // Component strides 181 CeedInt u_compstride = nelem * eldofssize; 182 CeedInt v_compstride = nelem * elquadsize; 183 // Dimension strides 184 CeedInt u_dimstride = 0; 185 CeedInt v_dimstride = nelem * elquadsize * ncomp; 186 187 // --- Swap strides for TRANSPOSE mode: --- 188 if (tmode == CEED_TRANSPOSE) { 189 // Input (u) is Q-vector, output (v) is E-vector 190 // Element strides 191 v_elstride = eldofssize; 192 u_elstride = elquadsize; 193 // Component strides 194 v_compstride = nelem * eldofssize; 195 u_compstride = nelem * elquadsize; 196 // Dimension strides 197 v_dimstride = 0; 198 u_dimstride = nelem * elquadsize * ncomp; 199 200 } 201 202 CeedInt nthreads = 1; 203 CeedInt ntcol = 1; 204 CeedInt shmem = 0; 205 CeedInt maxPQ = CeedIntMax(P, Q); 206 207 switch (dim) { 208 case 1: 209 nthreads = maxPQ; 210 ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_1D); 211 shmem += sizeof(CeedScalar) * ntcol * (ncomp * (1*P + 1*Q)); 212 shmem += sizeof(CeedScalar) * (P*Q); 213 break; 214 case 2: 215 nthreads = maxPQ; 216 ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_2D); 217 shmem += sizeof(CeedScalar) * 2*P*Q; // for sTinterp and sTgrad 218 shmem += sizeof(CeedScalar) * ntcol * 219 (P*maxPQ); // for reforming rU we need PxP, and for the intermediate output we need PxQ 220 break; 221 case 3: 222 nthreads = maxPQ * maxPQ; 223 ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_3D); 224 shmem += sizeof(CeedScalar) * 2*P*Q; // for sTinterp and sTgrad 225 shmem += sizeof(CeedScalar) * ntcol * CeedIntMax(P*P*P, 226 (P*P*Q) + 227 (P*Q*Q)); // rU needs P^2xP, the intermediate outputs need (P^2.Q + P.Q^2) 228 } 229 CeedInt grid = (nelem + ntcol-1) / ntcol; 230 void *args[] = {&impl->dinterp1d, &impl->dgrad1d, 231 &u, &u_elstride, &u_compstride, &u_dimstride, 232 &v, &v_elstride, &v_compstride, &v_dimstride, 233 &nelem 234 }; 235 236 if (tmode == CEED_TRANSPOSE) { 237 ierr = CeedRunKernelDimSharedMagma(ceed, impl->magma_grad_tr, grid, 238 nthreads, ntcol, 1, shmem, 239 args); CeedChkBackend(ierr); 240 } else { 241 ierr = CeedRunKernelDimSharedMagma(ceed, impl->magma_grad, grid, 242 nthreads, ntcol, 1, shmem, 243 args); CeedChkBackend(ierr); 244 } 245 } 246 break; 247 case CEED_EVAL_WEIGHT: { 248 if (tmode == CEED_TRANSPOSE) 249 // LCOV_EXCL_START 250 return CeedError(ceed, CEED_ERROR_BACKEND, 251 "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE"); 252 // LCOV_EXCL_STOP 253 CeedInt Q = Q1d; 254 CeedInt eldofssize = CeedIntPow(Q, dim); 255 CeedInt nthreads = 1; 256 CeedInt ntcol = 1; 257 CeedInt shmem = 0; 258 259 switch (dim) { 260 case 1: 261 nthreads = Q; 262 ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_1D); 263 shmem += sizeof(CeedScalar) * Q; // for dqweight1d 264 shmem += sizeof(CeedScalar) * ntcol * Q; // for output 265 break; 266 case 2: 267 nthreads = Q; 268 ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_2D); 269 shmem += sizeof(CeedScalar) * Q; // for dqweight1d 270 break; 271 case 3: 272 nthreads = Q * Q; 273 ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_3D); 274 shmem += sizeof(CeedScalar) * Q; // for dqweight1d 275 } 276 CeedInt grid = (nelem + ntcol-1) / ntcol; 277 void *args[] = {&impl->dqweight1d, &v, &eldofssize, &nelem}; 278 279 ierr = CeedRunKernelDimSharedMagma(ceed, impl->magma_weight, grid, 280 nthreads, ntcol, 1, shmem, 281 args); CeedChkBackend(ierr); 282 } 283 break; 284 // LCOV_EXCL_START 285 case CEED_EVAL_DIV: 286 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); 287 case CEED_EVAL_CURL: 288 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); 289 case CEED_EVAL_NONE: 290 return CeedError(ceed, CEED_ERROR_BACKEND, 291 "CEED_EVAL_NONE does not make sense in this context"); 292 // LCOV_EXCL_STOP 293 } 294 295 // must sync to ensure completeness 296 ceed_magma_queue_sync( data->queue ); 297 298 if (emode!=CEED_EVAL_WEIGHT) { 299 ierr = CeedVectorRestoreArrayRead(U, &u); CeedChkBackend(ierr); 300 } 301 ierr = CeedVectorRestoreArray(V, &v); CeedChkBackend(ierr); 302 return CEED_ERROR_SUCCESS; 303 } 304 305 #ifdef __cplusplus 306 CEED_INTERN "C" 307 #endif 308 int CeedBasisApplyNonTensor_f64_Magma(CeedBasis basis, CeedInt nelem, 309 CeedTransposeMode tmode, CeedEvalMode emode, 310 CeedVector U, CeedVector V) { 311 int ierr; 312 Ceed ceed; 313 ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 314 315 Ceed_Magma *data; 316 ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr); 317 318 CeedInt dim, ncomp, ndof, nqpt; 319 ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); 320 ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChkBackend(ierr); 321 ierr = CeedBasisGetNumNodes(basis, &ndof); CeedChkBackend(ierr); 322 ierr = CeedBasisGetNumQuadraturePoints(basis, &nqpt); CeedChkBackend(ierr); 323 const CeedScalar *du; 324 CeedScalar *dv; 325 if (emode != CEED_EVAL_WEIGHT) { 326 ierr = CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &du); CeedChkBackend(ierr); 327 } else if (emode != CEED_EVAL_WEIGHT) { 328 // LCOV_EXCL_START 329 return CeedError(ceed, CEED_ERROR_BACKEND, 330 "An input vector is required for this CeedEvalMode"); 331 // LCOV_EXCL_STOP 332 } 333 ierr = CeedVectorGetArrayWrite(V, CEED_MEM_DEVICE, &dv); CeedChkBackend(ierr); 334 335 CeedBasisNonTensor_Magma *impl; 336 ierr = CeedBasisGetData(basis, &impl); CeedChkBackend(ierr); 337 338 CeedDebug(ceed, "\033[01m[CeedBasisApplyNonTensor_Magma] vsize=%" CeedInt_FMT 339 ", comp = %" CeedInt_FMT, ncomp*ndof, ncomp); 340 341 if (tmode == CEED_TRANSPOSE) { 342 CeedSize length; 343 ierr = CeedVectorGetLength(V, &length); 344 if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) { 345 magmablas_slaset(MagmaFull, length, 1, 0., 0., (float *) dv, length, 346 data->queue); 347 } else { 348 magmablas_dlaset(MagmaFull, length, 1, 0., 0., (double *) dv, length, 349 data->queue); 350 } 351 ceed_magma_queue_sync( data->queue ); 352 } 353 354 switch (emode) { 355 case CEED_EVAL_INTERP: { 356 CeedInt P = ndof, Q = nqpt; 357 if (tmode == CEED_TRANSPOSE) 358 magma_dgemm_nontensor(MagmaNoTrans, MagmaNoTrans, 359 P, nelem*ncomp, Q, 360 1.0, (double *)impl->dinterp, P, 361 (double *)du, Q, 362 0.0, (double *)dv, P, data->queue); 363 else 364 magma_dgemm_nontensor(MagmaTrans, MagmaNoTrans, 365 Q, nelem*ncomp, P, 366 1.0, (double *)impl->dinterp, P, 367 (double *)du, P, 368 0.0, (double *)dv, Q, data->queue); 369 } 370 break; 371 372 case CEED_EVAL_GRAD: { 373 CeedInt P = ndof, Q = nqpt; 374 if (tmode == CEED_TRANSPOSE) { 375 CeedScalar beta = 0.0; 376 for(int d=0; d<dim; d++) { 377 if (d>0) 378 beta = 1.0; 379 magma_dgemm_nontensor(MagmaNoTrans, MagmaNoTrans, 380 P, nelem*ncomp, Q, 381 1.0, (double *)(impl->dgrad + d*P*Q), P, 382 (double *)(du + d*nelem*ncomp*Q), Q, 383 beta, (double *)dv, P, data->queue); 384 } 385 } else { 386 for(int d=0; d< dim; d++) 387 magma_dgemm_nontensor(MagmaTrans, MagmaNoTrans, 388 Q, nelem*ncomp, P, 389 1.0, (double *)(impl->dgrad + d*P*Q), P, 390 (double *)du, P, 391 0.0, (double *)(dv + d*nelem*ncomp*Q), Q, data->queue); 392 } 393 } 394 break; 395 396 case CEED_EVAL_WEIGHT: { 397 if (tmode == CEED_TRANSPOSE) 398 // LCOV_EXCL_START 399 return CeedError(ceed, CEED_ERROR_BACKEND, 400 "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE"); 401 // LCOV_EXCL_STOP 402 403 int elemsPerBlock = 1;//basis->Q1d < 7 ? optElems[basis->Q1d] : 1; 404 int grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem)? 405 1 : 0 ); 406 magma_weight_nontensor(grid, nqpt, nelem, nqpt, impl->dqweight, dv, 407 data->queue); 408 CeedChkBackend(ierr); 409 } 410 break; 411 412 // LCOV_EXCL_START 413 case CEED_EVAL_DIV: 414 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); 415 case CEED_EVAL_CURL: 416 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); 417 case CEED_EVAL_NONE: 418 return CeedError(ceed, CEED_ERROR_BACKEND, 419 "CEED_EVAL_NONE does not make sense in this context"); 420 // LCOV_EXCL_STOP 421 } 422 423 // must sync to ensure completeness 424 ceed_magma_queue_sync( data->queue ); 425 426 if (emode!=CEED_EVAL_WEIGHT) { 427 ierr = CeedVectorRestoreArrayRead(U, &du); CeedChkBackend(ierr); 428 } 429 ierr = CeedVectorRestoreArray(V, &dv); CeedChkBackend(ierr); 430 return CEED_ERROR_SUCCESS; 431 } 432 433 int CeedBasisApplyNonTensor_f32_Magma(CeedBasis basis, CeedInt nelem, 434 CeedTransposeMode tmode, CeedEvalMode emode, 435 CeedVector U, CeedVector V) { 436 int ierr; 437 Ceed ceed; 438 ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 439 440 Ceed_Magma *data; 441 ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr); 442 443 CeedInt dim, ncomp, ndof, nqpt; 444 ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); 445 ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChkBackend(ierr); 446 ierr = CeedBasisGetNumNodes(basis, &ndof); CeedChkBackend(ierr); 447 ierr = CeedBasisGetNumQuadraturePoints(basis, &nqpt); CeedChkBackend(ierr); 448 const CeedScalar *du; 449 CeedScalar *dv; 450 if (emode != CEED_EVAL_WEIGHT) { 451 ierr = CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &du); CeedChkBackend(ierr); 452 } else if (emode != CEED_EVAL_WEIGHT) { 453 // LCOV_EXCL_START 454 return CeedError(ceed, CEED_ERROR_BACKEND, 455 "An input vector is required for this CeedEvalMode"); 456 // LCOV_EXCL_STOP 457 } 458 ierr = CeedVectorGetArrayWrite(V, CEED_MEM_DEVICE, &dv); CeedChkBackend(ierr); 459 460 CeedBasisNonTensor_Magma *impl; 461 ierr = CeedBasisGetData(basis, &impl); CeedChkBackend(ierr); 462 463 CeedDebug(ceed, "\033[01m[CeedBasisApplyNonTensor_Magma] vsize=%" CeedInt_FMT 464 ", comp = %" CeedInt_FMT, ncomp*ndof, ncomp); 465 466 if (tmode == CEED_TRANSPOSE) { 467 CeedSize length; 468 ierr = CeedVectorGetLength(V, &length); 469 if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) { 470 magmablas_slaset(MagmaFull, length, 1, 0., 0., (float *) dv, length, 471 data->queue); 472 } else { 473 magmablas_dlaset(MagmaFull, length, 1, 0., 0., (double *) dv, length, 474 data->queue); 475 } 476 ceed_magma_queue_sync( data->queue ); 477 } 478 479 switch (emode) { 480 case CEED_EVAL_INTERP: { 481 CeedInt P = ndof, Q = nqpt; 482 if (tmode == CEED_TRANSPOSE) 483 magma_sgemm_nontensor(MagmaNoTrans, MagmaNoTrans, 484 P, nelem*ncomp, Q, 485 1.0, (float *)impl->dinterp, P, 486 (float *)du, Q, 487 0.0, (float *)dv, P, data->queue); 488 else 489 magma_sgemm_nontensor(MagmaTrans, MagmaNoTrans, 490 Q, nelem*ncomp, P, 491 1.0, (float *)impl->dinterp, P, 492 (float *)du, P, 493 0.0, (float *)dv, Q, data->queue); 494 } 495 break; 496 497 case CEED_EVAL_GRAD: { 498 CeedInt P = ndof, Q = nqpt; 499 if (tmode == CEED_TRANSPOSE) { 500 CeedScalar beta = 0.0; 501 for(int d=0; d<dim; d++) { 502 if (d>0) 503 beta = 1.0; 504 magma_sgemm_nontensor(MagmaNoTrans, MagmaNoTrans, 505 P, nelem*ncomp, Q, 506 1.0, (float *)(impl->dgrad + d*P*Q), P, 507 (float *)(du + d*nelem*ncomp*Q), Q, 508 beta, (float *)dv, P, data->queue); 509 } 510 } else { 511 for(int d=0; d< dim; d++) 512 magma_sgemm_nontensor(MagmaTrans, MagmaNoTrans, 513 Q, nelem*ncomp, P, 514 1.0, (float *)(impl->dgrad + d*P*Q), P, 515 (float *)du, P, 516 0.0, (float *)(dv + d*nelem*ncomp*Q), Q, data->queue); 517 } 518 } 519 break; 520 521 case CEED_EVAL_WEIGHT: { 522 if (tmode == CEED_TRANSPOSE) 523 // LCOV_EXCL_START 524 return CeedError(ceed, CEED_ERROR_BACKEND, 525 "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE"); 526 // LCOV_EXCL_STOP 527 528 int elemsPerBlock = 1;//basis->Q1d < 7 ? optElems[basis->Q1d] : 1; 529 int grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem)? 530 1 : 0 ); 531 magma_weight_nontensor(grid, nqpt, nelem, nqpt, impl->dqweight, dv, 532 data->queue); 533 CeedChkBackend(ierr); 534 } 535 break; 536 537 // LCOV_EXCL_START 538 case CEED_EVAL_DIV: 539 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); 540 case CEED_EVAL_CURL: 541 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); 542 case CEED_EVAL_NONE: 543 return CeedError(ceed, CEED_ERROR_BACKEND, 544 "CEED_EVAL_NONE does not make sense in this context"); 545 // LCOV_EXCL_STOP 546 } 547 548 // must sync to ensure completeness 549 ceed_magma_queue_sync( data->queue ); 550 551 if (emode!=CEED_EVAL_WEIGHT) { 552 ierr = CeedVectorRestoreArrayRead(U, &du); CeedChkBackend(ierr); 553 } 554 ierr = CeedVectorRestoreArray(V, &dv); CeedChkBackend(ierr); 555 return CEED_ERROR_SUCCESS; 556 } 557 558 #ifdef __cplusplus 559 CEED_INTERN "C" 560 #endif 561 int CeedBasisDestroy_Magma(CeedBasis basis) { 562 int ierr; 563 CeedBasis_Magma *impl; 564 ierr = CeedBasisGetData(basis, &impl); CeedChkBackend(ierr); 565 566 ierr = magma_free(impl->dqref1d); CeedChkBackend(ierr); 567 ierr = magma_free(impl->dinterp1d); CeedChkBackend(ierr); 568 ierr = magma_free(impl->dgrad1d); CeedChkBackend(ierr); 569 ierr = magma_free(impl->dqweight1d); CeedChkBackend(ierr); 570 Ceed ceed; 571 ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 572 #ifdef CEED_MAGMA_USE_HIP 573 ierr = hipModuleUnload(impl->module); CeedChk_Hip(ceed, ierr); 574 #else 575 ierr = cuModuleUnload(impl->module); CeedChk_Cu(ceed, ierr); 576 #endif 577 578 ierr = CeedFree(&impl); CeedChkBackend(ierr); 579 580 return CEED_ERROR_SUCCESS; 581 } 582 583 #ifdef __cplusplus 584 CEED_INTERN "C" 585 #endif 586 int CeedBasisDestroyNonTensor_Magma(CeedBasis basis) { 587 int ierr; 588 CeedBasisNonTensor_Magma *impl; 589 ierr = CeedBasisGetData(basis, &impl); CeedChkBackend(ierr); 590 591 ierr = magma_free(impl->dqref); CeedChkBackend(ierr); 592 ierr = magma_free(impl->dinterp); CeedChkBackend(ierr); 593 ierr = magma_free(impl->dgrad); CeedChkBackend(ierr); 594 ierr = magma_free(impl->dqweight); CeedChkBackend(ierr); 595 596 ierr = CeedFree(&impl); CeedChkBackend(ierr); 597 598 return CEED_ERROR_SUCCESS; 599 } 600 601 #ifdef __cplusplus 602 CEED_INTERN "C" 603 #endif 604 int CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P1d, CeedInt Q1d, 605 const CeedScalar *interp1d, 606 const CeedScalar *grad1d, 607 const CeedScalar *qref1d, 608 const CeedScalar *qweight1d, CeedBasis basis) { 609 int ierr; 610 CeedBasis_Magma *impl; 611 ierr = CeedCalloc(1,&impl); CeedChkBackend(ierr); 612 Ceed ceed; 613 ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 614 615 // Check for supported parameters 616 CeedInt ncomp = 0; 617 ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChkBackend(ierr); 618 Ceed_Magma *data; 619 ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr); 620 621 // Compile kernels 622 char *magma_common_path; 623 char *interp_path, *grad_path, *weight_path; 624 char *basis_kernel_source; 625 ierr = CeedGetJitAbsolutePath(ceed, 626 "ceed/jit-source/magma/magma_common_device.h", 627 &magma_common_path); CeedChkBackend(ierr); 628 CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source -----\n"); 629 ierr = CeedLoadSourceToBuffer(ceed, magma_common_path, 630 &basis_kernel_source); 631 CeedChkBackend(ierr); 632 char *interp_name_base = "ceed/jit-source/magma/interp"; 633 CeedInt interp_name_len = strlen(interp_name_base) + 6; 634 char interp_name[interp_name_len]; 635 snprintf(interp_name, interp_name_len, "%s-%" CeedInt_FMT "d.h", 636 interp_name_base, dim); 637 ierr = CeedGetJitAbsolutePath(ceed, interp_name, &interp_path); 638 CeedChkBackend(ierr); 639 ierr = CeedLoadSourceToInitializedBuffer(ceed, interp_path, 640 &basis_kernel_source); 641 CeedChkBackend(ierr); 642 char *grad_name_base = "ceed/jit-source/magma/grad"; 643 CeedInt grad_name_len = strlen(grad_name_base) + 6; 644 char grad_name[grad_name_len]; 645 snprintf(grad_name, grad_name_len, "%s-%" CeedInt_FMT "d.h", grad_name_base, 646 dim); 647 ierr = CeedGetJitAbsolutePath(ceed, grad_name, &grad_path); 648 CeedChkBackend(ierr); 649 ierr = CeedLoadSourceToInitializedBuffer(ceed, grad_path, 650 &basis_kernel_source); 651 CeedChkBackend(ierr); 652 char *weight_name_base = "ceed/jit-source/magma/weight"; 653 CeedInt weight_name_len = strlen(weight_name_base) + 6; 654 char weight_name[weight_name_len]; 655 snprintf(weight_name, weight_name_len, "%s-%" CeedInt_FMT "d.h", 656 weight_name_base, dim); 657 ierr = CeedGetJitAbsolutePath(ceed, weight_name, &weight_path); 658 CeedChkBackend(ierr); 659 ierr = CeedLoadSourceToInitializedBuffer(ceed, weight_path, 660 &basis_kernel_source); 661 CeedChkBackend(ierr); 662 CeedDebug256(ceed, 2, 663 "----- Loading Basis Kernel Source Complete! -----\n"); 664 // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip 665 // data 666 Ceed delegate; 667 ierr = CeedGetDelegate(ceed, &delegate); CeedChkBackend(ierr); 668 ierr = CeedCompileMagma(delegate, basis_kernel_source, &impl->module, 5, 669 "DIM", dim, 670 "NCOMP", ncomp, 671 "P", P1d, 672 "Q", Q1d, 673 "MAXPQ", CeedIntMax(P1d, Q1d)); 674 CeedChkBackend(ierr); 675 676 // Kernel setup 677 switch (dim) { 678 case 1: 679 ierr = CeedGetKernelMagma(ceed, impl->module, "magma_interpn_1d_kernel", 680 &impl->magma_interp); 681 CeedChkBackend(ierr); 682 ierr = CeedGetKernelMagma(ceed, impl->module, "magma_interpt_1d_kernel", 683 &impl->magma_interp_tr); 684 CeedChkBackend(ierr); 685 ierr = CeedGetKernelMagma(ceed, impl->module, "magma_gradn_1d_kernel", 686 &impl->magma_grad); 687 CeedChkBackend(ierr); 688 ierr = CeedGetKernelMagma(ceed, impl->module, "magma_gradt_1d_kernel", 689 &impl->magma_grad_tr); 690 CeedChkBackend(ierr); 691 ierr = CeedGetKernelMagma(ceed, impl->module, "magma_weight_1d_kernel", 692 &impl->magma_weight); 693 CeedChkBackend(ierr); 694 break; 695 case 2: 696 ierr = CeedGetKernelMagma(ceed, impl->module, "magma_interpn_2d_kernel", 697 &impl->magma_interp); 698 CeedChkBackend(ierr); 699 ierr = CeedGetKernelMagma(ceed, impl->module, "magma_interpt_2d_kernel", 700 &impl->magma_interp_tr); 701 CeedChkBackend(ierr); 702 ierr = CeedGetKernelMagma(ceed, impl->module, "magma_gradn_2d_kernel", 703 &impl->magma_grad); 704 CeedChkBackend(ierr); 705 ierr = CeedGetKernelMagma(ceed, impl->module, "magma_gradt_2d_kernel", 706 &impl->magma_grad_tr); 707 CeedChkBackend(ierr); 708 ierr = CeedGetKernelMagma(ceed, impl->module, "magma_weight_2d_kernel", 709 &impl->magma_weight); 710 CeedChkBackend(ierr); 711 break; 712 case 3: 713 ierr = CeedGetKernelMagma(ceed, impl->module, "magma_interpn_3d_kernel", 714 &impl->magma_interp); 715 CeedChkBackend(ierr); 716 ierr = CeedGetKernelMagma(ceed, impl->module, "magma_interpt_3d_kernel", 717 &impl->magma_interp_tr); 718 CeedChkBackend(ierr); 719 ierr = CeedGetKernelMagma(ceed, impl->module, "magma_gradn_3d_kernel", 720 &impl->magma_grad); 721 CeedChkBackend(ierr); 722 ierr = CeedGetKernelMagma(ceed, impl->module, "magma_gradt_3d_kernel", 723 &impl->magma_grad_tr); 724 CeedChkBackend(ierr); 725 ierr = CeedGetKernelMagma(ceed, impl->module, "magma_weight_3d_kernel", 726 &impl->magma_weight); 727 CeedChkBackend(ierr); 728 } 729 730 ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply", 731 CeedBasisApply_Magma); CeedChkBackend(ierr); 732 ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", 733 CeedBasisDestroy_Magma); CeedChkBackend(ierr); 734 735 // Copy qref1d to the GPU 736 ierr = magma_malloc((void **)&impl->dqref1d, Q1d*sizeof(qref1d[0])); 737 CeedChkBackend(ierr); 738 magma_setvector(Q1d, sizeof(qref1d[0]), qref1d, 1, impl->dqref1d, 1, 739 data->queue); 740 741 // Copy interp1d to the GPU 742 ierr = magma_malloc((void **)&impl->dinterp1d, Q1d*P1d*sizeof(interp1d[0])); 743 CeedChkBackend(ierr); 744 magma_setvector(Q1d*P1d, sizeof(interp1d[0]), interp1d, 1, impl->dinterp1d, 1, 745 data->queue); 746 747 // Copy grad1d to the GPU 748 ierr = magma_malloc((void **)&impl->dgrad1d, Q1d*P1d*sizeof(grad1d[0])); 749 CeedChkBackend(ierr); 750 magma_setvector(Q1d*P1d, sizeof(grad1d[0]), grad1d, 1, impl->dgrad1d, 1, 751 data->queue); 752 753 // Copy qweight1d to the GPU 754 ierr = magma_malloc((void **)&impl->dqweight1d, Q1d*sizeof(qweight1d[0])); 755 CeedChkBackend(ierr); 756 magma_setvector(Q1d, sizeof(qweight1d[0]), qweight1d, 1, impl->dqweight1d, 1, 757 data->queue); 758 759 ierr = CeedBasisSetData(basis, impl); CeedChkBackend(ierr); 760 761 return CEED_ERROR_SUCCESS; 762 } 763 764 #ifdef __cplusplus 765 CEED_INTERN "C" 766 #endif 767 int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt ndof, 768 CeedInt nqpts, const CeedScalar *interp, 769 const CeedScalar *grad, const CeedScalar *qref, 770 const CeedScalar *qweight, CeedBasis basis) { 771 int ierr; 772 CeedBasisNonTensor_Magma *impl; 773 Ceed ceed; 774 ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 775 776 Ceed_Magma *data; 777 ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr); 778 779 if (CEED_SCALAR_TYPE == CEED_SCALAR_FP64) { 780 ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply", 781 CeedBasisApplyNonTensor_f64_Magma); 782 CeedChkBackend(ierr); 783 } else { 784 ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply", 785 CeedBasisApplyNonTensor_f32_Magma); 786 CeedChkBackend(ierr); 787 } 788 ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", 789 CeedBasisDestroyNonTensor_Magma); CeedChkBackend(ierr); 790 791 ierr = CeedCalloc(1,&impl); CeedChkBackend(ierr); 792 ierr = CeedBasisSetData(basis, impl); CeedChkBackend(ierr); 793 794 // Copy qref to the GPU 795 ierr = magma_malloc((void **)&impl->dqref, nqpts*sizeof(qref[0])); 796 CeedChkBackend(ierr); 797 magma_setvector(nqpts, sizeof(qref[0]), qref, 1, impl->dqref, 1, data->queue); 798 799 // Copy interp to the GPU 800 ierr = magma_malloc((void **)&impl->dinterp, nqpts*ndof*sizeof(interp[0])); 801 CeedChkBackend(ierr); 802 magma_setvector(nqpts*ndof, sizeof(interp[0]), interp, 1, impl->dinterp, 1, 803 data->queue); 804 805 // Copy grad to the GPU 806 ierr = magma_malloc((void **)&impl->dgrad, nqpts*ndof*dim*sizeof(grad[0])); 807 CeedChkBackend(ierr); 808 magma_setvector(nqpts*ndof*dim, sizeof(grad[0]), grad, 1, impl->dgrad, 1, 809 data->queue); 810 811 // Copy qweight to the GPU 812 ierr = magma_malloc((void **)&impl->dqweight, nqpts*sizeof(qweight[0])); 813 CeedChkBackend(ierr); 814 magma_setvector(nqpts, sizeof(qweight[0]), qweight, 1, impl->dqweight, 1, 815 data->queue); 816 817 return CEED_ERROR_SUCCESS; 818 } 819