// Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
// Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
// All Rights reserved. See files LICENSE and NOTICE for details.
//
// This file is part of CEED, a collection of benchmarks, miniapps, software
// libraries and APIs for efficient high-order finite element and spectral
// element discretizations for exascale applications. For more information and
// source code availability see http://github.com/ceed.
//
// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
// a collaborative effort of two U.S. Department of Energy organizations (Office
// of Science and the National Nuclear Security Administration) responsible for
// the planning and preparation of a capable exascale ecosystem, including
// software, applications, hardware, advanced system engineering and early
// testbed platforms, in support of the nation's exascale computing imperative.

#include "ceed-magma.h"

// Apply a tensor-product basis: each CeedEvalMode is implemented as a batched
// 1D contraction over all elements on the device
#ifdef __cplusplus
CEED_INTERN "C"
#endif
int CeedBasisApply_Magma(CeedBasis basis, CeedInt nelem,
                         CeedTransposeMode tmode, CeedEvalMode emode,
                         CeedVector U, CeedVector V) {
  int ierr;
  Ceed ceed;
  ierr = CeedBasisGetCeed(basis, &ceed); CeedChk(ierr);
  CeedInt dim, ncomp, ndof, nqpt;
  ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr);
  ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChk(ierr);
  ierr = CeedBasisGetNumNodes(basis, &ndof); CeedChk(ierr);
  ierr = CeedBasisGetNumQuadraturePoints(basis, &nqpt); CeedChk(ierr);
  const CeedScalar *u;
  CeedScalar *v;
  // Read the input only when one is provided; CEED_EVAL_WEIGHT takes no input
  if (U) {
    ierr = CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &u); CeedChk(ierr);
  } else if (emode != CEED_EVAL_WEIGHT) {
    // LCOV_EXCL_START
    return CeedError(ceed, 1,
                     "An input vector is required for this CeedEvalMode");
    // LCOV_EXCL_STOP
  }
  ierr = CeedVectorGetArray(V, CEED_MEM_DEVICE, &v); CeedChk(ierr);

  CeedBasis_Magma *impl;
  ierr = CeedBasisGetData(basis, (void *)&impl); CeedChk(ierr);

  CeedInt P1d, Q1d;
  ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
  ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr);

  CeedDebug("\033[01m[CeedBasisApply_Magma] vsize=%d, comp = %d",
            ncomp*CeedIntPow(P1d, dim), ncomp);

  // Zero the output first; in transpose mode contributions are accumulated
  if (tmode == CEED_TRANSPOSE) {
    CeedInt length;
    ierr = CeedVectorGetLength(V, &length); CeedChk(ierr);
    magmablas_dlaset(MagmaFull, length, 1, 0., 0., v, length);
  }
  switch (emode) {
  case CEED_EVAL_INTERP: {
    CeedInt P = P1d, Q = Q1d;
    if (tmode == CEED_TRANSPOSE) {
      P = Q1d; Q = P1d;
    }

    // Define element sizes for dofs/quad
    CeedInt elquadsize = CeedIntPow(Q1d, dim);
    CeedInt eldofssize = CeedIntPow(P1d, dim);

    // E-vector ordering -------------- Q-vector ordering
    //  component                        component
    //    elem                             elem
    //      node                             node

    // --- Define strides for NOTRANSPOSE mode: ---
    // Input (u) is E-vector, output (v) is Q-vector

    // Element strides
    CeedInt u_elstride = eldofssize;
    CeedInt v_elstride = elquadsize;
    // Component strides
    CeedInt u_compstride = nelem * eldofssize;
    CeedInt v_compstride = nelem * elquadsize;

    // --- Swap strides for TRANSPOSE mode: ---
    if (tmode == CEED_TRANSPOSE) {
      // Input (u) is Q-vector, output (v) is E-vector
      // Element strides
      v_elstride = eldofssize;
      u_elstride = elquadsize;
      // Component strides
      v_compstride = nelem * eldofssize;
      u_compstride = nelem * elquadsize;
    }
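    // Worked example of the strides (illustrative values only): with dim = 3,
    // P1d = 2, Q1d = 4, and nelem = 8 in NOTRANSPOSE mode, eldofssize = 2^3 = 8
    // and elquadsize = 4^3 = 64, so u_elstride = 8, u_compstride = 8*8 = 64,
    // v_elstride = 64, and v_compstride = 8*64 = 512.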
    // Loop through components and apply batch over elements
    magmablas_dbasis_apply_batched_eval_interp(P, Q, dim, ncomp,
                                               impl->dinterp1d, tmode,
                                               u, u_elstride, u_compstride,
                                               v, v_elstride, v_compstride,
                                               nelem);
  }
  break;
  case CEED_EVAL_GRAD: {
    CeedInt P = P1d, Q = Q1d;
    // In CEED_NOTRANSPOSE mode:
    //   u is (P^dim x nc), column-major layout (nc = ncomp)
    //   v is (Q^dim x nc x dim), column-major layout (nc = ncomp)
    // In CEED_TRANSPOSE mode, the sizes of u and v are switched.
    if (tmode == CEED_TRANSPOSE) {
      P = Q1d; Q = P1d;
    }

    // Define element sizes for dofs/quad
    CeedInt elquadsize = CeedIntPow(Q1d, dim);
    CeedInt eldofssize = CeedIntPow(P1d, dim);

    // E-vector ordering -------------- Q-vector ordering
    //                                   dim
    //  component                        component
    //    elem                             elem
    //      node                             node

    // --- Define strides for NOTRANSPOSE mode: ---
    // Input (u) is E-vector, output (v) is Q-vector

    // Element strides
    CeedInt u_elstride = eldofssize;
    CeedInt v_elstride = elquadsize;
    // Component strides
    CeedInt u_compstride = nelem * eldofssize;
    CeedInt v_compstride = nelem * elquadsize;
    // Dimension strides
    CeedInt u_dimstride = 0;
    CeedInt v_dimstride = nelem * elquadsize * ncomp;

    // --- Swap strides for TRANSPOSE mode: ---
    if (tmode == CEED_TRANSPOSE) {
      // Input (u) is Q-vector, output (v) is E-vector
      // Element strides
      v_elstride = eldofssize;
      u_elstride = elquadsize;
      // Component strides
      v_compstride = nelem * eldofssize;
      u_compstride = nelem * elquadsize;
      // Dimension strides
      v_dimstride = 0;
      u_dimstride = nelem * elquadsize * ncomp;
    }

    // Loop through grad dimensions only; batch call over elements and components
    for (CeedInt dim_ctr = 0; dim_ctr < dim; dim_ctr++)
      magmablas_dbasis_apply_batched_eval_grad(P, Q, dim, ncomp, nqpt,
          impl->dinterp1d, impl->dgrad1d, tmode,
          u + dim_ctr * u_dimstride, u_elstride, u_compstride, u_dimstride,
          v + dim_ctr * v_dimstride, v_elstride, v_compstride, v_dimstride,
          dim_ctr, nelem);
  }
  break;
  case CEED_EVAL_WEIGHT: {
    if (tmode == CEED_TRANSPOSE)
      // LCOV_EXCL_START
      return CeedError(ceed, 1,
                       "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE");
    // LCOV_EXCL_STOP
    CeedInt Q = Q1d;
    CeedInt elquadsize = CeedIntPow(Q, dim);
    magmablas_dbasis_apply_batched_eval_weight(Q, dim, impl->dqweight1d,
                                               v, elquadsize,
                                               nelem);
  }
  break;
  // LCOV_EXCL_START
  case CEED_EVAL_DIV:
    return CeedError(ceed, 1, "CEED_EVAL_DIV not supported");
  case CEED_EVAL_CURL:
    return CeedError(ceed, 1, "CEED_EVAL_CURL not supported");
  case CEED_EVAL_NONE:
    return CeedError(ceed, 1,
                     "CEED_EVAL_NONE does not make sense in this context");
  // LCOV_EXCL_STOP
  }

  if (emode != CEED_EVAL_WEIGHT) {
    ierr = CeedVectorRestoreArrayRead(U, &u); CeedChk(ierr);
  }
  ierr = CeedVectorRestoreArray(V, &v); CeedChk(ierr);
  return 0;
}
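
// Non-tensor counterpart of CeedBasisApply_Magma: interp and grad are stored
// as dense device matrices, so each CeedEvalMode below reduces to magma_dgemm
// products over all elements and components at once.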
#ifdef __cplusplus
CEED_INTERN "C"
#endif
int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt nelem,
                                  CeedTransposeMode tmode, CeedEvalMode emode,
                                  CeedVector U, CeedVector V) {
  int ierr;
  Ceed ceed;
  ierr = CeedBasisGetCeed(basis, &ceed); CeedChk(ierr);
  CeedInt dim, ncomp, ndof, nqpt;
  ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr);
  ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChk(ierr);
  ierr = CeedBasisGetNumNodes(basis, &ndof); CeedChk(ierr);
  ierr = CeedBasisGetNumQuadraturePoints(basis, &nqpt); CeedChk(ierr);
  const CeedScalar *du;
  CeedScalar *dv;
  // Read the input only when one is provided; CEED_EVAL_WEIGHT takes no input
  if (U) {
    ierr = CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &du); CeedChk(ierr);
  } else if (emode != CEED_EVAL_WEIGHT) {
    // LCOV_EXCL_START
    return CeedError(ceed, 1,
                     "An input vector is required for this CeedEvalMode");
    // LCOV_EXCL_STOP
  }
  ierr = CeedVectorGetArray(V, CEED_MEM_DEVICE, &dv); CeedChk(ierr);

  CeedBasisNonTensor_Magma *impl;
  ierr = CeedBasisGetData(basis, (void *)&impl); CeedChk(ierr);

  CeedDebug("\033[01m[CeedBasisApplyNonTensor_Magma] vsize=%d, comp = %d",
            ncomp*ndof, ncomp);

  // Zero the output first; in transpose mode contributions are accumulated
  if (tmode == CEED_TRANSPOSE) {
    CeedInt length;
    ierr = CeedVectorGetLength(V, &length); CeedChk(ierr);
    magmablas_dlaset(MagmaFull, length, 1, 0., 0., dv, length);
  }
  switch (emode) {
  case CEED_EVAL_INTERP: {
    CeedInt P = ndof, Q = nqpt;
    // impl->dinterp is P x Q column-major; each (element, component) pair is
    // one column of du/dv
    if (tmode == CEED_TRANSPOSE)
      magma_dgemm(MagmaNoTrans, MagmaNoTrans,
                  P, nelem*ncomp, Q,
                  1.0, impl->dinterp, P,
                  du, Q,
                  0.0, dv, P);
    else
      magma_dgemm(MagmaTrans, MagmaNoTrans,
                  Q, nelem*ncomp, P,
                  1.0, impl->dinterp, P,
                  du, P,
                  0.0, dv, Q);
  }
  break;
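
  // For CEED_EVAL_GRAD, impl->dgrad stores dim consecutive P x Q blocks, one
  // per reference direction. In TRANSPOSE mode all dim contributions add into
  // the same dv, so the first product writes (beta = 0) and the remaining
  // ones accumulate (beta = 1).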
  case CEED_EVAL_GRAD: {
    CeedInt P = ndof, Q = nqpt;
    if (tmode == CEED_TRANSPOSE) {
      double beta = 0.0;
      for (int d = 0; d < dim; d++) {
        if (d > 0)
          beta = 1.0;
        magma_dgemm(MagmaNoTrans, MagmaNoTrans,
                    P, nelem*ncomp, Q,
                    1.0, impl->dgrad + d*P*Q, P,
                    du + d*nelem*ncomp*Q, Q,
                    beta, dv, P);
      }
    } else {
      for (int d = 0; d < dim; d++)
        magma_dgemm(MagmaTrans, MagmaNoTrans,
                    Q, nelem*ncomp, P,
                    1.0, impl->dgrad + d*P*Q, P,
                    du, P,
                    0.0, dv + d*nelem*ncomp*Q, Q);
    }
  }
  break;

  case CEED_EVAL_WEIGHT: {
    if (tmode == CEED_TRANSPOSE)
      // LCOV_EXCL_START
      return CeedError(ceed, 1,
                       "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE");
    // LCOV_EXCL_STOP

    int elemsPerBlock = 1; //basis->Q1d < 7 ? optElems[basis->Q1d] : 1;
    // Round the grid size up so every element is covered
    int grid = (nelem + elemsPerBlock - 1) / elemsPerBlock;
    magma_weight(grid, nqpt, nelem, nqpt, impl->dqweight, dv);
  }
  break;

  // LCOV_EXCL_START
  case CEED_EVAL_DIV:
    return CeedError(ceed, 1, "CEED_EVAL_DIV not supported");
  case CEED_EVAL_CURL:
    return CeedError(ceed, 1, "CEED_EVAL_CURL not supported");
  case CEED_EVAL_NONE:
    return CeedError(ceed, 1,
                     "CEED_EVAL_NONE does not make sense in this context");
  // LCOV_EXCL_STOP
  }

  if (emode != CEED_EVAL_WEIGHT) {
    ierr = CeedVectorRestoreArrayRead(U, &du); CeedChk(ierr);
  }
  ierr = CeedVectorRestoreArray(V, &dv); CeedChk(ierr);
  return 0;
}

// Free the device arrays held by a tensor basis
#ifdef __cplusplus
CEED_INTERN "C"
#endif
int CeedBasisDestroy_Magma(CeedBasis basis) {
  int ierr;
  CeedBasis_Magma *impl;
  ierr = CeedBasisGetData(basis, (void *)&impl); CeedChk(ierr);

  ierr = magma_free(impl->dqref1d); CeedChk(ierr);
  ierr = magma_free(impl->dinterp1d); CeedChk(ierr);
  ierr = magma_free(impl->dgrad1d); CeedChk(ierr);
  ierr = magma_free(impl->dqweight1d); CeedChk(ierr);

  ierr = CeedFree(&impl); CeedChk(ierr);

  return 0;
}

// Free the device arrays held by a non-tensor basis
#ifdef __cplusplus
CEED_INTERN "C"
#endif
int CeedBasisDestroyNonTensor_Magma(CeedBasis basis) {
  int ierr;
  CeedBasisNonTensor_Magma *impl;
  ierr = CeedBasisGetData(basis, (void *)&impl); CeedChk(ierr);

  ierr = magma_free(impl->dqref); CeedChk(ierr);
  ierr = magma_free(impl->dinterp); CeedChk(ierr);
  ierr = magma_free(impl->dgrad); CeedChk(ierr);
  ierr = magma_free(impl->dqweight); CeedChk(ierr);

  ierr = CeedFree(&impl); CeedChk(ierr);

  return 0;
}

// Create a tensor-product basis: register the MAGMA Apply/Destroy callbacks
// and copy the 1D operators to the device
#ifdef __cplusplus
CEED_INTERN "C"
#endif
int CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P1d, CeedInt Q1d,
                                  const CeedScalar *interp1d,
                                  const CeedScalar *grad1d,
                                  const CeedScalar *qref1d,
                                  const CeedScalar *qweight1d, CeedBasis basis) {
  int ierr;
  CeedBasis_Magma *impl;
  Ceed ceed;
  ierr = CeedBasisGetCeed(basis, &ceed); CeedChk(ierr);

  ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply",
                                CeedBasisApply_Magma); CeedChk(ierr);
  ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy",
                                CeedBasisDestroy_Magma); CeedChk(ierr);

  ierr = CeedCalloc(1, &impl); CeedChk(ierr);
  ierr = CeedBasisSetData(basis, (void *)&impl); CeedChk(ierr);

  // Copy qref1d to the GPU
  ierr = magma_malloc((void **)&impl->dqref1d, Q1d*sizeof(qref1d[0]));
  CeedChk(ierr);
  magma_setvector(Q1d, sizeof(qref1d[0]), qref1d, 1, impl->dqref1d, 1);

  // Copy interp1d to the GPU
  ierr = magma_malloc((void **)&impl->dinterp1d, Q1d*P1d*sizeof(interp1d[0]));
  CeedChk(ierr);
  magma_setvector(Q1d*P1d, sizeof(interp1d[0]), interp1d, 1, impl->dinterp1d, 1);

  // Copy grad1d to the GPU
  ierr = magma_malloc((void **)&impl->dgrad1d, Q1d*P1d*sizeof(grad1d[0]));
  CeedChk(ierr);
  magma_setvector(Q1d*P1d, sizeof(grad1d[0]), grad1d, 1, impl->dgrad1d, 1);

  // Copy qweight1d to the GPU
  ierr = magma_malloc((void **)&impl->dqweight1d, Q1d*sizeof(qweight1d[0]));
  CeedChk(ierr);
  magma_setvector(Q1d, sizeof(qweight1d[0]), qweight1d, 1, impl->dqweight1d, 1);

  return 0;
}
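
// Create a non-tensor H1 basis: register the non-tensor Apply/Destroy
// callbacks and copy the dense basis operators to the device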
#ifdef __cplusplus
CEED_INTERN "C"
#endif
int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt ndof,
                            CeedInt nqpts, const CeedScalar *interp,
                            const CeedScalar *grad, const CeedScalar *qref,
                            const CeedScalar *qweight, CeedBasis basis) {
  int ierr;
  CeedBasisNonTensor_Magma *impl;
  Ceed ceed;
  ierr = CeedBasisGetCeed(basis, &ceed); CeedChk(ierr);

  ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply",
                                CeedBasisApplyNonTensor_Magma); CeedChk(ierr);
  ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy",
                                CeedBasisDestroyNonTensor_Magma); CeedChk(ierr);

  ierr = CeedCalloc(1, &impl); CeedChk(ierr);
  ierr = CeedBasisSetData(basis, (void *)&impl); CeedChk(ierr);

  // Copy qref to the GPU
  ierr = magma_malloc((void **)&impl->dqref, nqpts*sizeof(qref[0]));
  CeedChk(ierr);
  magma_setvector(nqpts, sizeof(qref[0]), qref, 1, impl->dqref, 1);

  // Copy interp to the GPU
  ierr = magma_malloc((void **)&impl->dinterp, nqpts*ndof*sizeof(interp[0]));
  CeedChk(ierr);
  magma_setvector(nqpts*ndof, sizeof(interp[0]), interp, 1, impl->dinterp, 1);

  // Copy grad to the GPU
  ierr = magma_malloc((void **)&impl->dgrad, nqpts*ndof*dim*sizeof(grad[0]));
  CeedChk(ierr);
  magma_setvector(nqpts*ndof*dim, sizeof(grad[0]), grad, 1, impl->dgrad, 1);

  // Copy qweight to the GPU
  ierr = magma_malloc((void **)&impl->dqweight, nqpts*sizeof(qweight[0]));
  CeedChk(ierr);
  magma_setvector(nqpts, sizeof(qweight[0]), qweight, 1, impl->dqweight, 1);

  return 0;
}
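
// A minimal usage sketch (illustrative only; uses the public libCEED API and
// is not part of this backend): with a Ceed initialized on the MAGMA backend,
// the functions above are reached through CeedBasisApply rather than called
// directly, e.g.
//
//   CeedBasis basis;
//   CeedBasisCreateTensorH1Lagrange(ceed, dim, ncomp, P1d, Q1d,
//                                   CEED_GAUSS, &basis);
//   CeedBasisApply(basis, nelem, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, u, v);
//   CeedBasisDestroy(&basis);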