1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 3 // All Rights reserved. See files LICENSE and NOTICE for details. 4 // 5 // This file is part of CEED, a collection of benchmarks, miniapps, software 6 // libraries and APIs for efficient high-order finite element and spectral 7 // element discretizations for exascale applications. For more information and 8 // source code availability see http://github.com/ceed. 9 // 10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11 // a collaborative effort of two U.S. Department of Energy organizations (Office 12 // of Science and the National Nuclear Security Administration) responsible for 13 // the planning and preparation of a capable exascale ecosystem, including 14 // software, applications, hardware, advanced system engineering and early 15 // testbed platforms, in support of the nation's exascale computing imperative. 16 17 #include <ceed/ceed.h> 18 #include <ceed/backend.h> 19 #include <assert.h> 20 #include <cuda.h> 21 #include <cuda_runtime.h> 22 #include <stdbool.h> 23 #include <string.h> 24 #include "ceed-cuda-ref.h" 25 #include "../cuda/ceed-cuda-compile.h" 26 27 //------------------------------------------------------------------------------ 28 // Destroy operator 29 //------------------------------------------------------------------------------ 30 static int CeedOperatorDestroy_Cuda(CeedOperator op) { 31 int ierr; 32 CeedOperator_Cuda *impl; 33 ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr); 34 35 // Apply data 36 for (CeedInt i = 0; i < impl->numein + impl->numeout; i++) { 37 ierr = CeedVectorDestroy(&impl->evecs[i]); CeedChkBackend(ierr); 38 } 39 ierr = CeedFree(&impl->evecs); CeedChkBackend(ierr); 40 41 for (CeedInt i = 0; i < impl->numein; i++) { 42 ierr = CeedVectorDestroy(&impl->qvecsin[i]); CeedChkBackend(ierr); 43 } 44 ierr = CeedFree(&impl->qvecsin); CeedChkBackend(ierr); 45 46 for (CeedInt i = 0; i < impl->numeout; i++) { 47 ierr = CeedVectorDestroy(&impl->qvecsout[i]); CeedChkBackend(ierr); 48 } 49 ierr = CeedFree(&impl->qvecsout); CeedChkBackend(ierr); 50 51 // QFunction assembly data 52 for (CeedInt i=0; i<impl->qfnumactivein; i++) { 53 ierr = CeedVectorDestroy(&impl->qfactivein[i]); CeedChkBackend(ierr); 54 } 55 ierr = CeedFree(&impl->qfactivein); CeedChkBackend(ierr); 56 57 // Diag data 58 if (impl->diag) { 59 Ceed ceed; 60 ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 61 CeedChk_Cu(ceed, cuModuleUnload(impl->diag->module)); 62 ierr = CeedFree(&impl->diag->h_emodein); CeedChkBackend(ierr); 63 ierr = CeedFree(&impl->diag->h_emodeout); CeedChkBackend(ierr); 64 ierr = cudaFree(impl->diag->d_emodein); CeedChk_Cu(ceed, ierr); 65 ierr = cudaFree(impl->diag->d_emodeout); CeedChk_Cu(ceed, ierr); 66 ierr = cudaFree(impl->diag->d_identity); CeedChk_Cu(ceed, ierr); 67 ierr = cudaFree(impl->diag->d_interpin); CeedChk_Cu(ceed, ierr); 68 ierr = cudaFree(impl->diag->d_interpout); CeedChk_Cu(ceed, ierr); 69 ierr = cudaFree(impl->diag->d_gradin); CeedChk_Cu(ceed, ierr); 70 ierr = cudaFree(impl->diag->d_gradout); CeedChk_Cu(ceed, ierr); 71 ierr = CeedElemRestrictionDestroy(&impl->diag->pbdiagrstr); 72 CeedChkBackend(ierr); 73 ierr = CeedVectorDestroy(&impl->diag->elemdiag); CeedChkBackend(ierr); 74 ierr = CeedVectorDestroy(&impl->diag->pbelemdiag); CeedChkBackend(ierr); 75 } 76 ierr = CeedFree(&impl->diag); CeedChkBackend(ierr); 77 78 if (impl->asmb) { 79 Ceed ceed; 80 ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 81 CeedChk_Cu(ceed, cuModuleUnload(impl->asmb->module)); 82 ierr = cudaFree(impl->asmb->d_B_in); CeedChk_Cu(ceed, ierr); 83 ierr = cudaFree(impl->asmb->d_B_out); CeedChk_Cu(ceed, ierr); 84 } 85 ierr = CeedFree(&impl->asmb); CeedChkBackend(ierr); 86 87 ierr = CeedFree(&impl); CeedChkBackend(ierr); 88 return CEED_ERROR_SUCCESS; 89 } 90 91 //------------------------------------------------------------------------------ 92 // Setup infields or outfields 93 //------------------------------------------------------------------------------ 94 static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, 95 bool isinput, CeedVector *evecs, 96 CeedVector *qvecs, CeedInt starte, 97 CeedInt numfields, CeedInt Q, 98 CeedInt numelements) { 99 CeedInt dim, ierr, size; 100 Ceed ceed; 101 ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 102 CeedBasis basis; 103 CeedElemRestriction Erestrict; 104 CeedOperatorField *opfields; 105 CeedQFunctionField *qffields; 106 CeedVector fieldvec; 107 bool strided; 108 bool skiprestrict; 109 110 if (isinput) { 111 ierr = CeedOperatorGetFields(op, NULL, &opfields, NULL, NULL); 112 CeedChkBackend(ierr); 113 ierr = CeedQFunctionGetFields(qf, NULL, &qffields, NULL, NULL); 114 CeedChkBackend(ierr); 115 } else { 116 ierr = CeedOperatorGetFields(op, NULL, NULL, NULL, &opfields); 117 CeedChkBackend(ierr); 118 ierr = CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qffields); 119 CeedChkBackend(ierr); 120 } 121 122 // Loop over fields 123 for (CeedInt i = 0; i < numfields; i++) { 124 CeedEvalMode emode; 125 ierr = CeedQFunctionFieldGetEvalMode(qffields[i], &emode); CeedChkBackend(ierr); 126 127 strided = false; 128 skiprestrict = false; 129 if (emode != CEED_EVAL_WEIGHT) { 130 ierr = CeedOperatorFieldGetElemRestriction(opfields[i], &Erestrict); 131 CeedChkBackend(ierr); 132 133 // Check whether this field can skip the element restriction: 134 // must be passive input, with emode NONE, and have a strided restriction with 135 // CEED_STRIDES_BACKEND. 136 137 // First, check whether the field is input or output: 138 if (isinput) { 139 // Check for passive input: 140 ierr = CeedOperatorFieldGetVector(opfields[i], &fieldvec); CeedChkBackend(ierr); 141 if (fieldvec != CEED_VECTOR_ACTIVE) { 142 // Check emode 143 if (emode == CEED_EVAL_NONE) { 144 // Check for strided restriction 145 ierr = CeedElemRestrictionIsStrided(Erestrict, &strided); 146 CeedChkBackend(ierr); 147 if (strided) { 148 // Check if vector is already in preferred backend ordering 149 ierr = CeedElemRestrictionHasBackendStrides(Erestrict, 150 &skiprestrict); CeedChkBackend(ierr); 151 } 152 } 153 } 154 } 155 if (skiprestrict) { 156 // We do not need an E-Vector, but will use the input field vector's data 157 // directly in the operator application. 158 evecs[i + starte] = NULL; 159 } else { 160 ierr = CeedElemRestrictionCreateVector(Erestrict, NULL, 161 &evecs[i + starte]); 162 CeedChkBackend(ierr); 163 } 164 } 165 166 switch (emode) { 167 case CEED_EVAL_NONE: 168 ierr = CeedQFunctionFieldGetSize(qffields[i], &size); CeedChkBackend(ierr); 169 ierr = CeedVectorCreate(ceed, numelements * Q * size, &qvecs[i]); 170 CeedChkBackend(ierr); 171 break; 172 case CEED_EVAL_INTERP: 173 ierr = CeedQFunctionFieldGetSize(qffields[i], &size); CeedChkBackend(ierr); 174 ierr = CeedVectorCreate(ceed, numelements * Q * size, &qvecs[i]); 175 CeedChkBackend(ierr); 176 break; 177 case CEED_EVAL_GRAD: 178 ierr = CeedOperatorFieldGetBasis(opfields[i], &basis); CeedChkBackend(ierr); 179 ierr = CeedQFunctionFieldGetSize(qffields[i], &size); CeedChkBackend(ierr); 180 ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); 181 ierr = CeedVectorCreate(ceed, numelements * Q * size, &qvecs[i]); 182 CeedChkBackend(ierr); 183 break; 184 case CEED_EVAL_WEIGHT: // Only on input fields 185 ierr = CeedOperatorFieldGetBasis(opfields[i], &basis); CeedChkBackend(ierr); 186 ierr = CeedVectorCreate(ceed, numelements * Q, &qvecs[i]); CeedChkBackend(ierr); 187 ierr = CeedBasisApply(basis, numelements, CEED_NOTRANSPOSE, 188 CEED_EVAL_WEIGHT, NULL, qvecs[i]); CeedChkBackend(ierr); 189 break; 190 case CEED_EVAL_DIV: 191 break; // TODO: Not implemented 192 case CEED_EVAL_CURL: 193 break; // TODO: Not implemented 194 } 195 } 196 return CEED_ERROR_SUCCESS; 197 } 198 199 //------------------------------------------------------------------------------ 200 // CeedOperator needs to connect all the named fields (be they active or passive) 201 // to the named inputs and outputs of its CeedQFunction. 202 //------------------------------------------------------------------------------ 203 static int CeedOperatorSetup_Cuda(CeedOperator op) { 204 int ierr; 205 bool setupdone; 206 ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChkBackend(ierr); 207 if (setupdone) 208 return CEED_ERROR_SUCCESS; 209 Ceed ceed; 210 ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 211 CeedOperator_Cuda *impl; 212 ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr); 213 CeedQFunction qf; 214 ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr); 215 CeedInt Q, numelements, numinputfields, numoutputfields; 216 ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr); 217 ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr); 218 CeedChkBackend(ierr); 219 CeedOperatorField *opinputfields, *opoutputfields; 220 ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields, 221 &numoutputfields, &opoutputfields); 222 CeedChkBackend(ierr); 223 CeedQFunctionField *qfinputfields, *qfoutputfields; 224 ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields); 225 CeedChkBackend(ierr); 226 227 // Allocate 228 ierr = CeedCalloc(numinputfields + numoutputfields, &impl->evecs); 229 CeedChkBackend(ierr); 230 231 ierr = CeedCalloc(CEED_FIELD_MAX, &impl->qvecsin); CeedChkBackend(ierr); 232 ierr = CeedCalloc(CEED_FIELD_MAX, &impl->qvecsout); CeedChkBackend(ierr); 233 234 impl->numein = numinputfields; impl->numeout = numoutputfields; 235 236 // Set up infield and outfield evecs and qvecs 237 // Infields 238 ierr = CeedOperatorSetupFields_Cuda(qf, op, true, 239 impl->evecs, impl->qvecsin, 0, 240 numinputfields, Q, numelements); 241 CeedChkBackend(ierr); 242 243 // Outfields 244 ierr = CeedOperatorSetupFields_Cuda(qf, op, false, 245 impl->evecs, impl->qvecsout, 246 numinputfields, numoutputfields, Q, 247 numelements); CeedChkBackend(ierr); 248 249 ierr = CeedOperatorSetSetupDone(op); CeedChkBackend(ierr); 250 return CEED_ERROR_SUCCESS; 251 } 252 253 //------------------------------------------------------------------------------ 254 // Setup Operator Inputs 255 //------------------------------------------------------------------------------ 256 static inline int CeedOperatorSetupInputs_Cuda(CeedInt numinputfields, 257 CeedQFunctionField *qfinputfields, CeedOperatorField *opinputfields, 258 CeedVector invec, const bool skipactive, CeedScalar *edata[2*CEED_FIELD_MAX], 259 CeedOperator_Cuda *impl, CeedRequest *request) { 260 CeedInt ierr; 261 CeedEvalMode emode; 262 CeedVector vec; 263 CeedElemRestriction Erestrict; 264 265 for (CeedInt i = 0; i < numinputfields; i++) { 266 // Get input vector 267 ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr); 268 if (vec == CEED_VECTOR_ACTIVE) { 269 if (skipactive) 270 continue; 271 else 272 vec = invec; 273 } 274 275 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 276 CeedChkBackend(ierr); 277 if (emode == CEED_EVAL_WEIGHT) { // Skip 278 } else { 279 // Get input vector 280 ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr); 281 // Get input element restriction 282 ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 283 CeedChkBackend(ierr); 284 if (vec == CEED_VECTOR_ACTIVE) 285 vec = invec; 286 // Restrict, if necessary 287 if (!impl->evecs[i]) { 288 // No restriction for this field; read data directly from vec. 289 ierr = CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, 290 (const CeedScalar **) &edata[i]); 291 CeedChkBackend(ierr); 292 } else { 293 ierr = CeedElemRestrictionApply(Erestrict, CEED_NOTRANSPOSE, vec, 294 impl->evecs[i], request); CeedChkBackend(ierr); 295 // Get evec 296 ierr = CeedVectorGetArrayRead(impl->evecs[i], CEED_MEM_DEVICE, 297 (const CeedScalar **) &edata[i]); 298 CeedChkBackend(ierr); 299 } 300 } 301 } 302 return CEED_ERROR_SUCCESS; 303 } 304 305 //------------------------------------------------------------------------------ 306 // Input Basis Action 307 //------------------------------------------------------------------------------ 308 static inline int CeedOperatorInputBasis_Cuda(CeedInt numelements, 309 CeedQFunctionField *qfinputfields, CeedOperatorField *opinputfields, 310 CeedInt numinputfields, const bool skipactive, 311 CeedScalar *edata[2*CEED_FIELD_MAX],CeedOperator_Cuda *impl) { 312 CeedInt ierr; 313 CeedInt elemsize, size; 314 CeedElemRestriction Erestrict; 315 CeedEvalMode emode; 316 CeedBasis basis; 317 318 for (CeedInt i=0; i<numinputfields; i++) { 319 // Skip active input 320 if (skipactive) { 321 CeedVector vec; 322 ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr); 323 if (vec == CEED_VECTOR_ACTIVE) 324 continue; 325 } 326 // Get elemsize, emode, size 327 ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 328 CeedChkBackend(ierr); 329 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 330 CeedChkBackend(ierr); 331 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 332 CeedChkBackend(ierr); 333 ierr = CeedQFunctionFieldGetSize(qfinputfields[i], &size); CeedChkBackend(ierr); 334 // Basis action 335 switch (emode) { 336 case CEED_EVAL_NONE: 337 ierr = CeedVectorSetArray(impl->qvecsin[i], CEED_MEM_DEVICE, 338 CEED_USE_POINTER, edata[i]); CeedChkBackend(ierr); 339 break; 340 case CEED_EVAL_INTERP: 341 ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); 342 CeedChkBackend(ierr); 343 ierr = CeedBasisApply(basis, numelements, CEED_NOTRANSPOSE, 344 CEED_EVAL_INTERP, impl->evecs[i], 345 impl->qvecsin[i]); CeedChkBackend(ierr); 346 break; 347 case CEED_EVAL_GRAD: 348 ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); 349 CeedChkBackend(ierr); 350 ierr = CeedBasisApply(basis, numelements, CEED_NOTRANSPOSE, 351 CEED_EVAL_GRAD, impl->evecs[i], 352 impl->qvecsin[i]); CeedChkBackend(ierr); 353 break; 354 case CEED_EVAL_WEIGHT: 355 break; // No action 356 case CEED_EVAL_DIV: 357 break; // TODO: Not implemented 358 case CEED_EVAL_CURL: 359 break; // TODO: Not implemented 360 } 361 } 362 return CEED_ERROR_SUCCESS; 363 } 364 365 //------------------------------------------------------------------------------ 366 // Restore Input Vectors 367 //------------------------------------------------------------------------------ 368 static inline int CeedOperatorRestoreInputs_Cuda(CeedInt numinputfields, 369 CeedQFunctionField *qfinputfields, CeedOperatorField *opinputfields, 370 const bool skipactive, CeedScalar *edata[2*CEED_FIELD_MAX], 371 CeedOperator_Cuda *impl) { 372 CeedInt ierr; 373 CeedEvalMode emode; 374 CeedVector vec; 375 376 for (CeedInt i = 0; i < numinputfields; i++) { 377 // Skip active input 378 if (skipactive) { 379 ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr); 380 if (vec == CEED_VECTOR_ACTIVE) 381 continue; 382 } 383 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 384 CeedChkBackend(ierr); 385 if (emode == CEED_EVAL_WEIGHT) { // Skip 386 } else { 387 if (!impl->evecs[i]) { // This was a skiprestrict case 388 ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr); 389 ierr = CeedVectorRestoreArrayRead(vec, 390 (const CeedScalar **)&edata[i]); 391 CeedChkBackend(ierr); 392 } else { 393 ierr = CeedVectorRestoreArrayRead(impl->evecs[i], 394 (const CeedScalar **) &edata[i]); 395 CeedChkBackend(ierr); 396 } 397 } 398 } 399 return CEED_ERROR_SUCCESS; 400 } 401 402 //------------------------------------------------------------------------------ 403 // Apply and add to output 404 //------------------------------------------------------------------------------ 405 static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector invec, 406 CeedVector outvec, CeedRequest *request) { 407 int ierr; 408 CeedOperator_Cuda *impl; 409 ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr); 410 CeedQFunction qf; 411 ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr); 412 CeedInt Q, numelements, elemsize, numinputfields, numoutputfields, size; 413 ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr); 414 ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr); 415 CeedChkBackend(ierr); 416 CeedOperatorField *opinputfields, *opoutputfields; 417 ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields, 418 &numoutputfields, &opoutputfields); 419 CeedChkBackend(ierr); 420 CeedQFunctionField *qfinputfields, *qfoutputfields; 421 ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields); 422 CeedChkBackend(ierr); 423 CeedEvalMode emode; 424 CeedVector vec; 425 CeedBasis basis; 426 CeedElemRestriction Erestrict; 427 CeedScalar *edata[2*CEED_FIELD_MAX] = {0}; 428 429 // Setup 430 ierr = CeedOperatorSetup_Cuda(op); CeedChkBackend(ierr); 431 432 // Input Evecs and Restriction 433 ierr = CeedOperatorSetupInputs_Cuda(numinputfields, qfinputfields, 434 opinputfields, invec, false, edata, 435 impl, request); CeedChkBackend(ierr); 436 437 // Input basis apply if needed 438 ierr = CeedOperatorInputBasis_Cuda(numelements, qfinputfields, opinputfields, 439 numinputfields, false, edata, impl); 440 CeedChkBackend(ierr); 441 442 // Output pointers, as necessary 443 for (CeedInt i = 0; i < numoutputfields; i++) { 444 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 445 CeedChkBackend(ierr); 446 if (emode == CEED_EVAL_NONE) { 447 // Set the output Q-Vector to use the E-Vector data directly. 448 ierr = CeedVectorGetArrayWrite(impl->evecs[i + impl->numein], CEED_MEM_DEVICE, 449 &edata[i + numinputfields]); CeedChkBackend(ierr); 450 ierr = CeedVectorSetArray(impl->qvecsout[i], CEED_MEM_DEVICE, 451 CEED_USE_POINTER, edata[i + numinputfields]); 452 CeedChkBackend(ierr); 453 } 454 } 455 456 // Q function 457 ierr = CeedQFunctionApply(qf, numelements * Q, impl->qvecsin, impl->qvecsout); 458 CeedChkBackend(ierr); 459 460 // Output basis apply if needed 461 for (CeedInt i = 0; i < numoutputfields; i++) { 462 // Get elemsize, emode, size 463 ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 464 CeedChkBackend(ierr); 465 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 466 CeedChkBackend(ierr); 467 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 468 CeedChkBackend(ierr); 469 ierr = CeedQFunctionFieldGetSize(qfoutputfields[i], &size); 470 CeedChkBackend(ierr); 471 // Basis action 472 switch (emode) { 473 case CEED_EVAL_NONE: 474 break; 475 case CEED_EVAL_INTERP: 476 ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); 477 CeedChkBackend(ierr); 478 ierr = CeedBasisApply(basis, numelements, CEED_TRANSPOSE, 479 CEED_EVAL_INTERP, impl->qvecsout[i], 480 impl->evecs[i + impl->numein]); CeedChkBackend(ierr); 481 break; 482 case CEED_EVAL_GRAD: 483 ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); 484 CeedChkBackend(ierr); 485 ierr = CeedBasisApply(basis, numelements, CEED_TRANSPOSE, 486 CEED_EVAL_GRAD, impl->qvecsout[i], 487 impl->evecs[i + impl->numein]); CeedChkBackend(ierr); 488 break; 489 // LCOV_EXCL_START 490 case CEED_EVAL_WEIGHT: { 491 Ceed ceed; 492 ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 493 return CeedError(ceed, CEED_ERROR_BACKEND, 494 "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 495 break; // Should not occur 496 } 497 case CEED_EVAL_DIV: 498 break; // TODO: Not implemented 499 case CEED_EVAL_CURL: 500 break; // TODO: Not implemented 501 // LCOV_EXCL_STOP 502 } 503 } 504 505 // Output restriction 506 for (CeedInt i = 0; i < numoutputfields; i++) { 507 // Restore evec 508 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 509 CeedChkBackend(ierr); 510 if (emode == CEED_EVAL_NONE) { 511 ierr = CeedVectorRestoreArray(impl->evecs[i+impl->numein], 512 &edata[i + numinputfields]); 513 CeedChkBackend(ierr); 514 } 515 // Get output vector 516 ierr = CeedOperatorFieldGetVector(opoutputfields[i], &vec); 517 CeedChkBackend(ierr); 518 // Restrict 519 ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 520 CeedChkBackend(ierr); 521 // Active 522 if (vec == CEED_VECTOR_ACTIVE) 523 vec = outvec; 524 525 ierr = CeedElemRestrictionApply(Erestrict, CEED_TRANSPOSE, 526 impl->evecs[i + impl->numein], vec, 527 request); CeedChkBackend(ierr); 528 } 529 530 // Restore input arrays 531 ierr = CeedOperatorRestoreInputs_Cuda(numinputfields, qfinputfields, 532 opinputfields, false, edata, impl); 533 CeedChkBackend(ierr); 534 return CEED_ERROR_SUCCESS; 535 } 536 537 //------------------------------------------------------------------------------ 538 // Core code for assembling linear QFunction 539 //------------------------------------------------------------------------------ 540 static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op, 541 bool build_objects, CeedVector *assembled, CeedElemRestriction *rstr, 542 CeedRequest *request) { 543 int ierr; 544 CeedOperator_Cuda *impl; 545 ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr); 546 CeedQFunction qf; 547 ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr); 548 CeedInt Q, numelements, numinputfields, numoutputfields, size; 549 ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr); 550 ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr); 551 CeedChkBackend(ierr); 552 CeedOperatorField *opinputfields, *opoutputfields; 553 ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields, 554 &numoutputfields, &opoutputfields); 555 CeedChkBackend(ierr); 556 CeedQFunctionField *qfinputfields, *qfoutputfields; 557 ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields); 558 CeedChkBackend(ierr); 559 CeedVector vec; 560 CeedInt numactivein = impl->qfnumactivein, numactiveout = impl->qfnumactiveout; 561 CeedVector *activein = impl->qfactivein; 562 CeedScalar *a, *tmp; 563 Ceed ceed, ceedparent; 564 ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 565 ierr = CeedGetOperatorFallbackParentCeed(ceed, &ceedparent); 566 CeedChkBackend(ierr); 567 ceedparent = ceedparent ? ceedparent : ceed; 568 CeedScalar *edata[2*CEED_FIELD_MAX]; 569 570 // Setup 571 ierr = CeedOperatorSetup_Cuda(op); CeedChkBackend(ierr); 572 573 // Check for identity 574 bool identityqf; 575 ierr = CeedQFunctionIsIdentity(qf, &identityqf); CeedChkBackend(ierr); 576 if (identityqf) 577 // LCOV_EXCL_START 578 return CeedError(ceed, CEED_ERROR_BACKEND, 579 "Assembling identity QFunctions not supported"); 580 // LCOV_EXCL_STOP 581 582 // Input Evecs and Restriction 583 ierr = CeedOperatorSetupInputs_Cuda(numinputfields, qfinputfields, 584 opinputfields, NULL, true, edata, 585 impl, request); CeedChkBackend(ierr); 586 587 // Count number of active input fields 588 if (!numactivein) { 589 for (CeedInt i=0; i<numinputfields; i++) { 590 // Get input vector 591 ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr); 592 // Check if active input 593 if (vec == CEED_VECTOR_ACTIVE) { 594 ierr = CeedQFunctionFieldGetSize(qfinputfields[i], &size); CeedChkBackend(ierr); 595 ierr = CeedVectorSetValue(impl->qvecsin[i], 0.0); CeedChkBackend(ierr); 596 ierr = CeedVectorGetArray(impl->qvecsin[i], CEED_MEM_DEVICE, &tmp); 597 CeedChkBackend(ierr); 598 ierr = CeedRealloc(numactivein + size, &activein); CeedChkBackend(ierr); 599 for (CeedInt field = 0; field < size; field++) { 600 ierr = CeedVectorCreate(ceed, Q*numelements, 601 &activein[numactivein+field]); CeedChkBackend(ierr); 602 ierr = CeedVectorSetArray(activein[numactivein+field], CEED_MEM_DEVICE, 603 CEED_USE_POINTER, &tmp[field*Q*numelements]); 604 CeedChkBackend(ierr); 605 } 606 numactivein += size; 607 ierr = CeedVectorRestoreArray(impl->qvecsin[i], &tmp); CeedChkBackend(ierr); 608 } 609 } 610 impl->qfnumactivein = numactivein; 611 impl->qfactivein = activein; 612 } 613 614 // Count number of active output fields 615 if (!numactiveout) { 616 for (CeedInt i=0; i<numoutputfields; i++) { 617 // Get output vector 618 ierr = CeedOperatorFieldGetVector(opoutputfields[i], &vec); 619 CeedChkBackend(ierr); 620 // Check if active output 621 if (vec == CEED_VECTOR_ACTIVE) { 622 ierr = CeedQFunctionFieldGetSize(qfoutputfields[i], &size); 623 CeedChkBackend(ierr); 624 numactiveout += size; 625 } 626 } 627 impl->qfnumactiveout = numactiveout; 628 } 629 630 // Check sizes 631 if (!numactivein || !numactiveout) 632 // LCOV_EXCL_START 633 return CeedError(ceed, CEED_ERROR_BACKEND, 634 "Cannot assemble QFunction without active inputs " 635 "and outputs"); 636 // LCOV_EXCL_STOP 637 638 // Build objects if needed 639 if (build_objects) { 640 // Create output restriction 641 CeedInt strides[3] = {1, numelements*Q, Q}; /* *NOPAD* */ 642 ierr = CeedElemRestrictionCreateStrided(ceedparent, numelements, Q, 643 numactivein*numactiveout, 644 numactivein*numactiveout*numelements*Q, 645 strides, rstr); CeedChkBackend(ierr); 646 // Create assembled vector 647 ierr = CeedVectorCreate(ceedparent, numelements*Q*numactivein*numactiveout, 648 assembled); CeedChkBackend(ierr); 649 } 650 ierr = CeedVectorSetValue(*assembled, 0.0); CeedChkBackend(ierr); 651 ierr = CeedVectorGetArray(*assembled, CEED_MEM_DEVICE, &a); 652 CeedChkBackend(ierr); 653 654 // Input basis apply 655 ierr = CeedOperatorInputBasis_Cuda(numelements, qfinputfields, opinputfields, 656 numinputfields, true, edata, impl); 657 CeedChkBackend(ierr); 658 659 // Assemble QFunction 660 for (CeedInt in=0; in<numactivein; in++) { 661 // Set Inputs 662 ierr = CeedVectorSetValue(activein[in], 1.0); CeedChkBackend(ierr); 663 if (numactivein > 1) { 664 ierr = CeedVectorSetValue(activein[(in+numactivein-1)%numactivein], 665 0.0); CeedChkBackend(ierr); 666 } 667 // Set Outputs 668 for (CeedInt out=0; out<numoutputfields; out++) { 669 // Get output vector 670 ierr = CeedOperatorFieldGetVector(opoutputfields[out], &vec); 671 CeedChkBackend(ierr); 672 // Check if active output 673 if (vec == CEED_VECTOR_ACTIVE) { 674 CeedVectorSetArray(impl->qvecsout[out], CEED_MEM_DEVICE, 675 CEED_USE_POINTER, a); CeedChkBackend(ierr); 676 ierr = CeedQFunctionFieldGetSize(qfoutputfields[out], &size); 677 CeedChkBackend(ierr); 678 a += size*Q*numelements; // Advance the pointer by the size of the output 679 } 680 } 681 // Apply QFunction 682 ierr = CeedQFunctionApply(qf, Q*numelements, impl->qvecsin, impl->qvecsout); 683 CeedChkBackend(ierr); 684 } 685 686 // Un-set output Qvecs to prevent accidental overwrite of Assembled 687 for (CeedInt out=0; out<numoutputfields; out++) { 688 // Get output vector 689 ierr = CeedOperatorFieldGetVector(opoutputfields[out], &vec); 690 CeedChkBackend(ierr); 691 // Check if active output 692 if (vec == CEED_VECTOR_ACTIVE) { 693 ierr = CeedVectorTakeArray(impl->qvecsout[out], CEED_MEM_DEVICE, NULL); 694 CeedChkBackend(ierr); 695 } 696 } 697 698 // Restore input arrays 699 ierr = CeedOperatorRestoreInputs_Cuda(numinputfields, qfinputfields, 700 opinputfields, true, edata, impl); 701 CeedChkBackend(ierr); 702 703 // Restore output 704 ierr = CeedVectorRestoreArray(*assembled, &a); CeedChkBackend(ierr); 705 706 return CEED_ERROR_SUCCESS; 707 } 708 709 //------------------------------------------------------------------------------ 710 // Assemble Linear QFunction 711 //------------------------------------------------------------------------------ 712 static int CeedOperatorLinearAssembleQFunction_Cuda(CeedOperator op, 713 CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) { 714 return CeedOperatorLinearAssembleQFunctionCore_Cuda(op, true, assembled, rstr, 715 request); 716 } 717 718 //------------------------------------------------------------------------------ 719 // Update Assembled Linear QFunction 720 //------------------------------------------------------------------------------ 721 static int CeedOperatorLinearAssembleQFunctionUpdate_Cuda(CeedOperator op, 722 CeedVector assembled, CeedElemRestriction rstr, CeedRequest *request) { 723 return CeedOperatorLinearAssembleQFunctionCore_Cuda(op, false, &assembled, 724 &rstr, request); 725 } 726 727 //------------------------------------------------------------------------------ 728 // Diagonal assembly kernels 729 //------------------------------------------------------------------------------ 730 // *INDENT-OFF* 731 static const char *diagonalkernels = QUOTE( 732 733 typedef enum { 734 /// Perform no evaluation (either because there is no data or it is already at 735 /// quadrature points) 736 CEED_EVAL_NONE = 0, 737 /// Interpolate from nodes to quadrature points 738 CEED_EVAL_INTERP = 1, 739 /// Evaluate gradients at quadrature points from input in a nodal basis 740 CEED_EVAL_GRAD = 2, 741 /// Evaluate divergence at quadrature points from input in a nodal basis 742 CEED_EVAL_DIV = 4, 743 /// Evaluate curl at quadrature points from input in a nodal basis 744 CEED_EVAL_CURL = 8, 745 /// Using no input, evaluate quadrature weights on the reference element 746 CEED_EVAL_WEIGHT = 16, 747 } CeedEvalMode; 748 749 //------------------------------------------------------------------------------ 750 // Get Basis Emode Pointer 751 //------------------------------------------------------------------------------ 752 extern "C" __device__ void CeedOperatorGetBasisPointer_Cuda(const CeedScalar **basisptr, 753 CeedEvalMode emode, const CeedScalar *identity, const CeedScalar *interp, 754 const CeedScalar *grad) { 755 switch (emode) { 756 case CEED_EVAL_NONE: 757 *basisptr = identity; 758 break; 759 case CEED_EVAL_INTERP: 760 *basisptr = interp; 761 break; 762 case CEED_EVAL_GRAD: 763 *basisptr = grad; 764 break; 765 case CEED_EVAL_WEIGHT: 766 case CEED_EVAL_DIV: 767 case CEED_EVAL_CURL: 768 break; // Caught by QF Assembly 769 } 770 } 771 772 //------------------------------------------------------------------------------ 773 // Core code for diagonal assembly 774 //------------------------------------------------------------------------------ 775 __device__ void diagonalCore(const CeedInt nelem, 776 const CeedScalar maxnorm, const bool pointBlock, 777 const CeedScalar *identity, 778 const CeedScalar *interpin, const CeedScalar *gradin, 779 const CeedScalar *interpout, const CeedScalar *gradout, 780 const CeedEvalMode *emodein, const CeedEvalMode *emodeout, 781 const CeedScalar *__restrict__ assembledqfarray, 782 CeedScalar *__restrict__ elemdiagarray) { 783 const int tid = threadIdx.x; // running with P threads, tid is evec node 784 const CeedScalar qfvaluebound = maxnorm*1e-12; 785 786 // Compute the diagonal of B^T D B 787 // Each element 788 for (CeedInt e = blockIdx.x*blockDim.z + threadIdx.z; e < nelem; 789 e += gridDim.x*blockDim.z) { 790 CeedInt dout = -1; 791 // Each basis eval mode pair 792 for (CeedInt eout = 0; eout < NUMEMODEOUT; eout++) { 793 const CeedScalar *bt = NULL; 794 if (emodeout[eout] == CEED_EVAL_GRAD) 795 dout += 1; 796 CeedOperatorGetBasisPointer_Cuda(&bt, emodeout[eout], identity, interpout, 797 &gradout[dout*NQPTS*NNODES]); 798 CeedInt din = -1; 799 for (CeedInt ein = 0; ein < NUMEMODEIN; ein++) { 800 const CeedScalar *b = NULL; 801 if (emodein[ein] == CEED_EVAL_GRAD) 802 din += 1; 803 CeedOperatorGetBasisPointer_Cuda(&b, emodein[ein], identity, interpin, 804 &gradin[din*NQPTS*NNODES]); 805 // Each component 806 for (CeedInt compOut = 0; compOut < NCOMP; compOut++) { 807 // Each qpoint/node pair 808 if (pointBlock) { 809 // Point Block Diagonal 810 for (CeedInt compIn = 0; compIn < NCOMP; compIn++) { 811 CeedScalar evalue = 0.; 812 for (CeedInt q = 0; q < NQPTS; q++) { 813 const CeedScalar qfvalue = 814 assembledqfarray[((((ein*NCOMP+compIn)*NUMEMODEOUT+eout)* 815 NCOMP+compOut)*nelem+e)*NQPTS+q]; 816 if (abs(qfvalue) > qfvaluebound) 817 evalue += bt[q*NNODES+tid] * qfvalue * b[q*NNODES+tid]; 818 } 819 elemdiagarray[((compOut*NCOMP+compIn)*nelem+e)*NNODES+tid] += evalue; 820 } 821 } else { 822 // Diagonal Only 823 CeedScalar evalue = 0.; 824 for (CeedInt q = 0; q < NQPTS; q++) { 825 const CeedScalar qfvalue = 826 assembledqfarray[((((ein*NCOMP+compOut)*NUMEMODEOUT+eout)* 827 NCOMP+compOut)*nelem+e)*NQPTS+q]; 828 if (abs(qfvalue) > qfvaluebound) 829 evalue += bt[q*NNODES+tid] * qfvalue * b[q*NNODES+tid]; 830 } 831 elemdiagarray[(compOut*nelem+e)*NNODES+tid] += evalue; 832 } 833 } 834 } 835 } 836 } 837 } 838 839 //------------------------------------------------------------------------------ 840 // Linear diagonal 841 //------------------------------------------------------------------------------ 842 extern "C" __global__ void linearDiagonal(const CeedInt nelem, 843 const CeedScalar maxnorm, const CeedScalar *identity, 844 const CeedScalar *interpin, const CeedScalar *gradin, 845 const CeedScalar *interpout, const CeedScalar *gradout, 846 const CeedEvalMode *emodein, const CeedEvalMode *emodeout, 847 const CeedScalar *__restrict__ assembledqfarray, 848 CeedScalar *__restrict__ elemdiagarray) { 849 diagonalCore(nelem, maxnorm, false, identity, interpin, gradin, interpout, 850 gradout, emodein, emodeout, assembledqfarray, elemdiagarray); 851 } 852 853 //------------------------------------------------------------------------------ 854 // Linear point block diagonal 855 //------------------------------------------------------------------------------ 856 extern "C" __global__ void linearPointBlockDiagonal(const CeedInt nelem, 857 const CeedScalar maxnorm, const CeedScalar *identity, 858 const CeedScalar *interpin, const CeedScalar *gradin, 859 const CeedScalar *interpout, const CeedScalar *gradout, 860 const CeedEvalMode *emodein, const CeedEvalMode *emodeout, 861 const CeedScalar *__restrict__ assembledqfarray, 862 CeedScalar *__restrict__ elemdiagarray) { 863 diagonalCore(nelem, maxnorm, true, identity, interpin, gradin, interpout, 864 gradout, emodein, emodeout, assembledqfarray, elemdiagarray); 865 } 866 867 ); 868 // *INDENT-ON* 869 870 //------------------------------------------------------------------------------ 871 // Create point block restriction 872 //------------------------------------------------------------------------------ 873 static int CreatePBRestriction(CeedElemRestriction rstr, 874 CeedElemRestriction *pbRstr) { 875 int ierr; 876 Ceed ceed; 877 ierr = CeedElemRestrictionGetCeed(rstr, &ceed); CeedChkBackend(ierr); 878 const CeedInt *offsets; 879 ierr = CeedElemRestrictionGetOffsets(rstr, CEED_MEM_HOST, &offsets); 880 CeedChkBackend(ierr); 881 882 // Expand offsets 883 CeedInt nelem, ncomp, elemsize, compstride, max = 1, *pbOffsets; 884 ierr = CeedElemRestrictionGetNumElements(rstr, &nelem); CeedChkBackend(ierr); 885 ierr = CeedElemRestrictionGetNumComponents(rstr, &ncomp); CeedChkBackend(ierr); 886 ierr = CeedElemRestrictionGetElementSize(rstr, &elemsize); CeedChkBackend(ierr); 887 ierr = CeedElemRestrictionGetCompStride(rstr, &compstride); 888 CeedChkBackend(ierr); 889 CeedInt shift = ncomp; 890 if (compstride != 1) 891 shift *= ncomp; 892 ierr = CeedCalloc(nelem*elemsize, &pbOffsets); CeedChkBackend(ierr); 893 for (CeedInt i = 0; i < nelem*elemsize; i++) { 894 pbOffsets[i] = offsets[i]*shift; 895 if (pbOffsets[i] > max) 896 max = pbOffsets[i]; 897 } 898 899 // Create new restriction 900 ierr = CeedElemRestrictionCreate(ceed, nelem, elemsize, ncomp*ncomp, 1, 901 max + ncomp*ncomp, CEED_MEM_HOST, 902 CEED_OWN_POINTER, pbOffsets, pbRstr); 903 CeedChkBackend(ierr); 904 905 // Cleanup 906 ierr = CeedElemRestrictionRestoreOffsets(rstr, &offsets); CeedChkBackend(ierr); 907 908 return CEED_ERROR_SUCCESS; 909 } 910 911 //------------------------------------------------------------------------------ 912 // Assemble diagonal setup 913 //------------------------------------------------------------------------------ 914 static inline int CeedOperatorAssembleDiagonalSetup_Cuda(CeedOperator op, 915 const bool pointBlock) { 916 int ierr; 917 Ceed ceed; 918 ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 919 CeedQFunction qf; 920 ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr); 921 CeedInt numinputfields, numoutputfields; 922 ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields); 923 CeedChkBackend(ierr); 924 925 // Determine active input basis 926 CeedOperatorField *opfields; 927 CeedQFunctionField *qffields; 928 ierr = CeedOperatorGetFields(op, NULL, &opfields, NULL, NULL); 929 CeedChkBackend(ierr); 930 ierr = CeedQFunctionGetFields(qf, NULL, &qffields, NULL, NULL); 931 CeedChkBackend(ierr); 932 CeedInt numemodein = 0, ncomp = 0, dim = 1; 933 CeedEvalMode *emodein = NULL; 934 CeedBasis basisin = NULL; 935 CeedElemRestriction rstrin = NULL; 936 for (CeedInt i = 0; i < numinputfields; i++) { 937 CeedVector vec; 938 ierr = CeedOperatorFieldGetVector(opfields[i], &vec); CeedChkBackend(ierr); 939 if (vec == CEED_VECTOR_ACTIVE) { 940 CeedElemRestriction rstr; 941 ierr = CeedOperatorFieldGetBasis(opfields[i], &basisin); CeedChkBackend(ierr); 942 ierr = CeedBasisGetNumComponents(basisin, &ncomp); CeedChkBackend(ierr); 943 ierr = CeedBasisGetDimension(basisin, &dim); CeedChkBackend(ierr); 944 ierr = CeedOperatorFieldGetElemRestriction(opfields[i], &rstr); 945 CeedChkBackend(ierr); 946 if (rstrin && rstrin != rstr) 947 // LCOV_EXCL_START 948 return CeedError(ceed, CEED_ERROR_BACKEND, 949 "Multi-field non-composite operator diagonal assembly not supported"); 950 // LCOV_EXCL_STOP 951 rstrin = rstr; 952 CeedEvalMode emode; 953 ierr = CeedQFunctionFieldGetEvalMode(qffields[i], &emode); 954 CeedChkBackend(ierr); 955 switch (emode) { 956 case CEED_EVAL_NONE: 957 case CEED_EVAL_INTERP: 958 ierr = CeedRealloc(numemodein + 1, &emodein); CeedChkBackend(ierr); 959 emodein[numemodein] = emode; 960 numemodein += 1; 961 break; 962 case CEED_EVAL_GRAD: 963 ierr = CeedRealloc(numemodein + dim, &emodein); CeedChkBackend(ierr); 964 for (CeedInt d = 0; d < dim; d++) 965 emodein[numemodein+d] = emode; 966 numemodein += dim; 967 break; 968 case CEED_EVAL_WEIGHT: 969 case CEED_EVAL_DIV: 970 case CEED_EVAL_CURL: 971 break; // Caught by QF Assembly 972 } 973 } 974 } 975 976 // Determine active output basis 977 ierr = CeedOperatorGetFields(op, NULL, NULL, NULL, &opfields); 978 CeedChkBackend(ierr); 979 ierr = CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qffields); 980 CeedChkBackend(ierr); 981 CeedInt numemodeout = 0; 982 CeedEvalMode *emodeout = NULL; 983 CeedBasis basisout = NULL; 984 CeedElemRestriction rstrout = NULL; 985 for (CeedInt i = 0; i < numoutputfields; i++) { 986 CeedVector vec; 987 ierr = CeedOperatorFieldGetVector(opfields[i], &vec); CeedChkBackend(ierr); 988 if (vec == CEED_VECTOR_ACTIVE) { 989 CeedElemRestriction rstr; 990 ierr = CeedOperatorFieldGetBasis(opfields[i], &basisout); CeedChkBackend(ierr); 991 ierr = CeedOperatorFieldGetElemRestriction(opfields[i], &rstr); 992 CeedChkBackend(ierr); 993 if (rstrout && rstrout != rstr) 994 // LCOV_EXCL_START 995 return CeedError(ceed, CEED_ERROR_BACKEND, 996 "Multi-field non-composite operator diagonal assembly not supported"); 997 // LCOV_EXCL_STOP 998 rstrout = rstr; 999 CeedEvalMode emode; 1000 ierr = CeedQFunctionFieldGetEvalMode(qffields[i], &emode); CeedChkBackend(ierr); 1001 switch (emode) { 1002 case CEED_EVAL_NONE: 1003 case CEED_EVAL_INTERP: 1004 ierr = CeedRealloc(numemodeout + 1, &emodeout); CeedChkBackend(ierr); 1005 emodeout[numemodeout] = emode; 1006 numemodeout += 1; 1007 break; 1008 case CEED_EVAL_GRAD: 1009 ierr = CeedRealloc(numemodeout + dim, &emodeout); CeedChkBackend(ierr); 1010 for (CeedInt d = 0; d < dim; d++) 1011 emodeout[numemodeout+d] = emode; 1012 numemodeout += dim; 1013 break; 1014 case CEED_EVAL_WEIGHT: 1015 case CEED_EVAL_DIV: 1016 case CEED_EVAL_CURL: 1017 break; // Caught by QF Assembly 1018 } 1019 } 1020 } 1021 1022 // Operator data struct 1023 CeedOperator_Cuda *impl; 1024 ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr); 1025 ierr = CeedCalloc(1, &impl->diag); CeedChkBackend(ierr); 1026 CeedOperatorDiag_Cuda *diag = impl->diag; 1027 diag->basisin = basisin; 1028 diag->basisout = basisout; 1029 diag->h_emodein = emodein; 1030 diag->h_emodeout = emodeout; 1031 diag->numemodein = numemodein; 1032 diag->numemodeout = numemodeout; 1033 1034 // Assemble kernel 1035 CeedInt nnodes, nqpts; 1036 ierr = CeedBasisGetNumNodes(basisin, &nnodes); CeedChkBackend(ierr); 1037 ierr = CeedBasisGetNumQuadraturePoints(basisin, &nqpts); CeedChkBackend(ierr); 1038 diag->nnodes = nnodes; 1039 ierr = CeedCompileCuda(ceed, diagonalkernels, &diag->module, 5, 1040 "NUMEMODEIN", numemodein, 1041 "NUMEMODEOUT", numemodeout, 1042 "NNODES", nnodes, 1043 "NQPTS", nqpts, 1044 "NCOMP", ncomp 1045 ); CeedChk_Cu(ceed, ierr); 1046 ierr = CeedGetKernelCuda(ceed, diag->module, "linearDiagonal", 1047 &diag->linearDiagonal); CeedChk_Cu(ceed, ierr); 1048 ierr = CeedGetKernelCuda(ceed, diag->module, "linearPointBlockDiagonal", 1049 &diag->linearPointBlock); 1050 CeedChk_Cu(ceed, ierr); 1051 1052 // Basis matrices 1053 const CeedInt qBytes = nqpts * sizeof(CeedScalar); 1054 const CeedInt iBytes = qBytes * nnodes; 1055 const CeedInt gBytes = qBytes * nnodes * dim; 1056 const CeedInt eBytes = sizeof(CeedEvalMode); 1057 const CeedScalar *interpin, *interpout, *gradin, *gradout; 1058 1059 // CEED_EVAL_NONE 1060 CeedScalar *identity = NULL; 1061 bool evalNone = false; 1062 for (CeedInt i=0; i<numemodein; i++) 1063 evalNone = evalNone || (emodein[i] == CEED_EVAL_NONE); 1064 for (CeedInt i=0; i<numemodeout; i++) 1065 evalNone = evalNone || (emodeout[i] == CEED_EVAL_NONE); 1066 if (evalNone) { 1067 ierr = CeedCalloc(nqpts*nnodes, &identity); CeedChkBackend(ierr); 1068 for (CeedInt i=0; i<(nnodes<nqpts?nnodes:nqpts); i++) 1069 identity[i*nnodes+i] = 1.0; 1070 ierr = cudaMalloc((void **)&diag->d_identity, iBytes); CeedChk_Cu(ceed, ierr); 1071 ierr = cudaMemcpy(diag->d_identity, identity, iBytes, 1072 cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 1073 } 1074 1075 // CEED_EVAL_INTERP 1076 ierr = CeedBasisGetInterp(basisin, &interpin); CeedChkBackend(ierr); 1077 ierr = cudaMalloc((void **)&diag->d_interpin, iBytes); CeedChk_Cu(ceed, ierr); 1078 ierr = cudaMemcpy(diag->d_interpin, interpin, iBytes, 1079 cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 1080 ierr = CeedBasisGetInterp(basisout, &interpout); CeedChkBackend(ierr); 1081 ierr = cudaMalloc((void **)&diag->d_interpout, iBytes); CeedChk_Cu(ceed, ierr); 1082 ierr = cudaMemcpy(diag->d_interpout, interpout, iBytes, 1083 cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 1084 1085 // CEED_EVAL_GRAD 1086 ierr = CeedBasisGetGrad(basisin, &gradin); CeedChkBackend(ierr); 1087 ierr = cudaMalloc((void **)&diag->d_gradin, gBytes); CeedChk_Cu(ceed, ierr); 1088 ierr = cudaMemcpy(diag->d_gradin, gradin, gBytes, 1089 cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 1090 ierr = CeedBasisGetGrad(basisout, &gradout); CeedChkBackend(ierr); 1091 ierr = cudaMalloc((void **)&diag->d_gradout, gBytes); CeedChk_Cu(ceed, ierr); 1092 ierr = cudaMemcpy(diag->d_gradout, gradout, gBytes, 1093 cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 1094 1095 // Arrays of emodes 1096 ierr = cudaMalloc((void **)&diag->d_emodein, numemodein * eBytes); 1097 CeedChk_Cu(ceed, ierr); 1098 ierr = cudaMemcpy(diag->d_emodein, emodein, numemodein * eBytes, 1099 cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 1100 ierr = cudaMalloc((void **)&diag->d_emodeout, numemodeout * eBytes); 1101 CeedChk_Cu(ceed, ierr); 1102 ierr = cudaMemcpy(diag->d_emodeout, emodeout, numemodeout * eBytes, 1103 cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 1104 1105 // Restriction 1106 diag->diagrstr = rstrout; 1107 1108 return CEED_ERROR_SUCCESS; 1109 } 1110 1111 //------------------------------------------------------------------------------ 1112 // Assemble diagonal common code 1113 //------------------------------------------------------------------------------ 1114 static inline int CeedOperatorAssembleDiagonalCore_Cuda(CeedOperator op, 1115 CeedVector assembled, CeedRequest *request, const bool pointBlock) { 1116 int ierr; 1117 Ceed ceed; 1118 ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 1119 CeedOperator_Cuda *impl; 1120 ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr); 1121 1122 // Assemble QFunction 1123 CeedVector assembledqf; 1124 CeedElemRestriction rstr; 1125 ierr = CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembledqf, 1126 &rstr, request); CeedChkBackend(ierr); 1127 ierr = CeedElemRestrictionDestroy(&rstr); CeedChkBackend(ierr); 1128 CeedScalar maxnorm = 0; 1129 ierr = CeedVectorNorm(assembledqf, CEED_NORM_MAX, &maxnorm); 1130 CeedChkBackend(ierr); 1131 1132 // Setup 1133 if (!impl->diag) { 1134 ierr = CeedOperatorAssembleDiagonalSetup_Cuda(op, pointBlock); 1135 CeedChkBackend(ierr); 1136 } 1137 CeedOperatorDiag_Cuda *diag = impl->diag; 1138 assert(diag != NULL); 1139 1140 // Restriction 1141 if (pointBlock && !diag->pbdiagrstr) { 1142 CeedElemRestriction pbdiagrstr; 1143 ierr = CreatePBRestriction(diag->diagrstr, &pbdiagrstr); CeedChkBackend(ierr); 1144 diag->pbdiagrstr = pbdiagrstr; 1145 } 1146 CeedElemRestriction diagrstr = pointBlock ? diag->pbdiagrstr : diag->diagrstr; 1147 1148 // Create diagonal vector 1149 CeedVector elemdiag = pointBlock ? diag->pbelemdiag : diag->elemdiag; 1150 if (!elemdiag) { 1151 ierr = CeedElemRestrictionCreateVector(diagrstr, NULL, &elemdiag); 1152 CeedChkBackend(ierr); 1153 if (pointBlock) 1154 diag->pbelemdiag = elemdiag; 1155 else 1156 diag->elemdiag = elemdiag; 1157 } 1158 ierr = CeedVectorSetValue(elemdiag, 0.0); CeedChkBackend(ierr); 1159 1160 // Assemble element operator diagonals 1161 CeedScalar *elemdiagarray; 1162 const CeedScalar *assembledqfarray; 1163 ierr = CeedVectorGetArray(elemdiag, CEED_MEM_DEVICE, &elemdiagarray); 1164 CeedChkBackend(ierr); 1165 ierr = CeedVectorGetArrayRead(assembledqf, CEED_MEM_DEVICE, &assembledqfarray); 1166 CeedChkBackend(ierr); 1167 CeedInt nelem; 1168 ierr = CeedElemRestrictionGetNumElements(diagrstr, &nelem); 1169 CeedChkBackend(ierr); 1170 1171 // Compute the diagonal of B^T D B 1172 int elemsPerBlock = 1; 1173 int grid = nelem/elemsPerBlock+((nelem/elemsPerBlock*elemsPerBlock<nelem)?1:0); 1174 void *args[] = {(void *) &nelem, (void *) &maxnorm, &diag->d_identity, 1175 &diag->d_interpin, &diag->d_gradin, &diag->d_interpout, 1176 &diag->d_gradout, &diag->d_emodein, &diag->d_emodeout, 1177 &assembledqfarray, &elemdiagarray 1178 }; 1179 if (pointBlock) { 1180 ierr = CeedRunKernelDimCuda(ceed, diag->linearPointBlock, grid, 1181 diag->nnodes, 1, elemsPerBlock, args); 1182 CeedChkBackend(ierr); 1183 } else { 1184 ierr = CeedRunKernelDimCuda(ceed, diag->linearDiagonal, grid, 1185 diag->nnodes, 1, elemsPerBlock, args); 1186 CeedChkBackend(ierr); 1187 } 1188 1189 // Restore arrays 1190 ierr = CeedVectorRestoreArray(elemdiag, &elemdiagarray); CeedChkBackend(ierr); 1191 ierr = CeedVectorRestoreArrayRead(assembledqf, &assembledqfarray); 1192 CeedChkBackend(ierr); 1193 1194 // Assemble local operator diagonal 1195 ierr = CeedElemRestrictionApply(diagrstr, CEED_TRANSPOSE, elemdiag, 1196 assembled, request); CeedChkBackend(ierr); 1197 1198 // Cleanup 1199 ierr = CeedVectorDestroy(&assembledqf); CeedChkBackend(ierr); 1200 1201 return CEED_ERROR_SUCCESS; 1202 } 1203 1204 //------------------------------------------------------------------------------ 1205 // Assemble composite diagonal common code 1206 //------------------------------------------------------------------------------ 1207 static inline int CeedOperatorLinearAssembleAddDiagonalCompositeCore_Cuda( 1208 CeedOperator op, CeedVector assembled, CeedRequest *request, 1209 const bool pointBlock) { 1210 int ierr; 1211 CeedInt numSub; 1212 CeedOperator *subOperators; 1213 ierr = CeedOperatorGetNumSub(op, &numSub); CeedChkBackend(ierr); 1214 ierr = CeedOperatorGetSubList(op, &subOperators); CeedChkBackend(ierr); 1215 for (CeedInt i = 0; i < numSub; i++) { 1216 ierr = CeedOperatorAssembleDiagonalCore_Cuda(subOperators[i], assembled, 1217 request, pointBlock); CeedChkBackend(ierr); 1218 } 1219 return CEED_ERROR_SUCCESS; 1220 } 1221 1222 //------------------------------------------------------------------------------ 1223 // Assemble Linear Diagonal 1224 //------------------------------------------------------------------------------ 1225 static int CeedOperatorLinearAssembleAddDiagonal_Cuda(CeedOperator op, 1226 CeedVector assembled, CeedRequest *request) { 1227 int ierr; 1228 bool isComposite; 1229 ierr = CeedOperatorIsComposite(op, &isComposite); CeedChkBackend(ierr); 1230 if (isComposite) { 1231 return CeedOperatorLinearAssembleAddDiagonalCompositeCore_Cuda(op, assembled, 1232 request, false); 1233 } else { 1234 return CeedOperatorAssembleDiagonalCore_Cuda(op, assembled, request, false); 1235 } 1236 } 1237 1238 //------------------------------------------------------------------------------ 1239 // Assemble Linear Point Block Diagonal 1240 //------------------------------------------------------------------------------ 1241 static int CeedOperatorLinearAssembleAddPointBlockDiagonal_Cuda(CeedOperator op, 1242 CeedVector assembled, CeedRequest *request) { 1243 int ierr; 1244 bool isComposite; 1245 ierr = CeedOperatorIsComposite(op, &isComposite); CeedChkBackend(ierr); 1246 if (isComposite) { 1247 return CeedOperatorLinearAssembleAddDiagonalCompositeCore_Cuda(op, assembled, 1248 request, true); 1249 } else { 1250 return CeedOperatorAssembleDiagonalCore_Cuda(op, assembled, request, true); 1251 } 1252 } 1253 1254 //------------------------------------------------------------------------------ 1255 // Matrix assembly kernel for low-order elements (2D thread block) 1256 //------------------------------------------------------------------------------ 1257 // *INDENT-OFF* 1258 static const char *assemblykernel = QUOTE( 1259 extern "C" __launch_bounds__(BLOCK_SIZE) 1260 __global__ void linearAssemble(const CeedScalar *B_in, const CeedScalar *B_out, 1261 const CeedScalar *__restrict__ qf_array, 1262 CeedScalar *__restrict__ values_array) { 1263 1264 // This kernel assumes B_in and B_out have the same number of quadrature points and 1265 // basis points. 1266 // TODO: expand to more general cases 1267 const int i = threadIdx.x; // The output row index of each B^TDB operation 1268 const int l = threadIdx.y; // The output column index of each B^TDB operation 1269 // such that we have (Bout^T)_ij D_jk Bin_kl = C_il 1270 1271 // Strides for final output ordering, determined by the reference (interface) implementation of 1272 // the symbolic assembly, slowest --> fastest: element, comp_in, comp_out, node_row, node_col 1273 const CeedInt comp_out_stride = NNODES * NNODES; 1274 const CeedInt comp_in_stride = comp_out_stride * NCOMP; 1275 const CeedInt e_stride = comp_in_stride * NCOMP; 1276 // Strides for QF array, slowest --> fastest: emode_in, comp_in, emode_out, comp_out, elem, qpt 1277 const CeedInt qe_stride = NQPTS; 1278 const CeedInt qcomp_out_stride = NELEM * qe_stride; 1279 const CeedInt qemode_out_stride = qcomp_out_stride * NCOMP; 1280 const CeedInt qcomp_in_stride = qemode_out_stride * NUMEMODEOUT; 1281 const CeedInt qemode_in_stride = qcomp_in_stride * NCOMP; 1282 1283 // Loop over each element (if necessary) 1284 for (CeedInt e = blockIdx.x*blockDim.z + threadIdx.z; e < NELEM; 1285 e += gridDim.x*blockDim.z) { 1286 for (CeedInt comp_in = 0; comp_in < NCOMP; comp_in++) { 1287 for (CeedInt comp_out = 0; comp_out < NCOMP; comp_out++) { 1288 CeedScalar result = 0.0; 1289 CeedInt qf_index_comp = qcomp_in_stride * comp_in + qcomp_out_stride * comp_out + qe_stride * e; 1290 for (CeedInt emode_in = 0; emode_in < NUMEMODEIN; emode_in++) { 1291 CeedInt b_in_index = emode_in * NQPTS * NNODES; 1292 for (CeedInt emode_out = 0; emode_out < NUMEMODEOUT; emode_out++) { 1293 CeedInt b_out_index = emode_out * NQPTS * NNODES; 1294 CeedInt qf_index = qf_index_comp + qemode_out_stride * emode_out + qemode_in_stride * emode_in; 1295 // Perform the B^T D B operation for this 'chunk' of D (the qf_array) 1296 for (CeedInt j = 0; j < NQPTS; j++) { 1297 result += B_out[b_out_index + j * NNODES + i] * qf_array[qf_index + j] * B_in[b_in_index + j * NNODES + l]; 1298 } 1299 1300 }// end of emode_out 1301 } // end of emode_in 1302 CeedInt val_index = comp_in_stride * comp_in + comp_out_stride * comp_out + e_stride * e + NNODES * i + l; 1303 values_array[val_index] = result; 1304 } // end of out component 1305 } // end of in component 1306 } // end of element loop 1307 } 1308 ); 1309 1310 //------------------------------------------------------------------------------ 1311 // Fallback kernel for larger orders (1D thread block) 1312 //------------------------------------------------------------------------------ 1313 static const char *assemblykernelbigelem = QUOTE( 1314 extern "C" __launch_bounds__(BLOCK_SIZE) 1315 __global__ void linearAssemble(const CeedScalar *B_in, const CeedScalar *B_out, 1316 const CeedScalar *__restrict__ qf_array, 1317 CeedScalar *__restrict__ values_array) { 1318 1319 // This kernel assumes B_in and B_out have the same number of quadrature points and 1320 // basis points. 1321 // TODO: expand to more general cases 1322 const int l = threadIdx.x; // The output column index of each B^TDB operation 1323 // such that we have (Bout^T)_ij D_jk Bin_kl = C_il 1324 1325 // Strides for final output ordering, determined by the reference (interface) implementation of 1326 // the symbolic assembly, slowest --> fastest: element, comp_in, comp_out, node_row, node_col 1327 const CeedInt comp_out_stride = NNODES * NNODES; 1328 const CeedInt comp_in_stride = comp_out_stride * NCOMP; 1329 const CeedInt e_stride = comp_in_stride * NCOMP; 1330 // Strides for QF array, slowest --> fastest: emode_in, comp_in, emode_out, comp_out, elem, qpt 1331 const CeedInt qe_stride = NQPTS; 1332 const CeedInt qcomp_out_stride = NELEM * qe_stride; 1333 const CeedInt qemode_out_stride = qcomp_out_stride * NCOMP; 1334 const CeedInt qcomp_in_stride = qemode_out_stride * NUMEMODEOUT; 1335 const CeedInt qemode_in_stride = qcomp_in_stride * NCOMP; 1336 1337 // Loop over each element (if necessary) 1338 for (CeedInt e = blockIdx.x*blockDim.z + threadIdx.z; e < NELEM; 1339 e += gridDim.x*blockDim.z) { 1340 for (CeedInt comp_in = 0; comp_in < NCOMP; comp_in++) { 1341 for (CeedInt comp_out = 0; comp_out < NCOMP; comp_out++) { 1342 for (CeedInt i = 0; i < NNODES; i++) { 1343 CeedScalar result = 0.0; 1344 CeedInt qf_index_comp = qcomp_in_stride * comp_in + qcomp_out_stride * comp_out + qe_stride * e; 1345 for (CeedInt emode_in = 0; emode_in < NUMEMODEIN; emode_in++) { 1346 CeedInt b_in_index = emode_in * NQPTS * NNODES; 1347 for (CeedInt emode_out = 0; emode_out < NUMEMODEOUT; emode_out++) { 1348 CeedInt b_out_index = emode_out * NQPTS * NNODES; 1349 CeedInt qf_index = qf_index_comp + qemode_out_stride * emode_out + qemode_in_stride * emode_in; 1350 // Perform the B^T D B operation for this 'chunk' of D (the qf_array) 1351 for (CeedInt j = 0; j < NQPTS; j++) { 1352 result += B_out[b_out_index + j * NNODES + i] * qf_array[qf_index + j] * B_in[b_in_index + j * NNODES + l]; 1353 } 1354 1355 }// end of emode_out 1356 } // end of emode_in 1357 CeedInt val_index = comp_in_stride * comp_in + comp_out_stride * comp_out + e_stride * e + NNODES * i + l; 1358 values_array[val_index] = result; 1359 } // end of loop over element node index, i 1360 } // end of out component 1361 } // end of in component 1362 } // end of element loop 1363 } 1364 ); 1365 // *INDENT-ON* 1366 1367 //------------------------------------------------------------------------------ 1368 // Single operator assembly setup 1369 //------------------------------------------------------------------------------ 1370 static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op) { 1371 int ierr; 1372 Ceed ceed; 1373 ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 1374 CeedOperator_Cuda *impl; 1375 ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr); 1376 1377 // Get intput and output fields 1378 CeedInt num_input_fields, num_output_fields; 1379 CeedOperatorField *input_fields; 1380 CeedOperatorField *output_fields; 1381 ierr = CeedOperatorGetFields(op, &num_input_fields, &input_fields, 1382 &num_output_fields, &output_fields); CeedChk(ierr); 1383 1384 // Determine active input basis eval mode 1385 CeedQFunction qf; 1386 ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr); 1387 CeedQFunctionField *qf_fields; 1388 ierr = CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL); CeedChk(ierr); 1389 // Note that the kernel will treat each dimension of a gradient action separately; 1390 // i.e., when an active input has a CEED_EVAL_GRAD mode, num_emode_in will increment 1391 // by dim. However, for the purposes of loading the B matrices, it will be treated 1392 // as one mode, and we will load/copy the entire gradient matrix at once, so 1393 // num_B_in_mats_to_load will be incremented by 1. 1394 CeedInt num_emode_in = 0, dim = 1, num_B_in_mats_to_load = 0, size_B_in = 0; 1395 CeedEvalMode *eval_mode_in = NULL; //will be of size num_B_in_mats_load 1396 CeedBasis basis_in = NULL; 1397 CeedInt nqpts, esize; 1398 CeedElemRestriction rstr_in = NULL; 1399 for (CeedInt i=0; i<num_input_fields; i++) { 1400 CeedVector vec; 1401 ierr = CeedOperatorFieldGetVector(input_fields[i], &vec); CeedChk(ierr); 1402 if (vec == CEED_VECTOR_ACTIVE) { 1403 ierr = CeedOperatorFieldGetBasis(input_fields[i], &basis_in); 1404 CeedChk(ierr); 1405 ierr = CeedBasisGetDimension(basis_in, &dim); CeedChk(ierr); 1406 ierr = CeedBasisGetNumQuadraturePoints(basis_in, &nqpts); CeedChk(ierr); 1407 ierr = CeedOperatorFieldGetElemRestriction(input_fields[i], &rstr_in); 1408 ierr = CeedElemRestrictionGetElementSize(rstr_in, &esize); CeedChk(ierr); 1409 CeedChk(ierr); 1410 CeedEvalMode eval_mode; 1411 ierr = CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode); 1412 CeedChk(ierr); 1413 if (eval_mode != CEED_EVAL_NONE) { 1414 ierr = CeedRealloc(num_B_in_mats_to_load + 1, &eval_mode_in); CeedChk(ierr); 1415 eval_mode_in[num_B_in_mats_to_load] = eval_mode; 1416 num_B_in_mats_to_load += 1; 1417 if (eval_mode == CEED_EVAL_GRAD) { 1418 num_emode_in += dim; 1419 size_B_in += dim * esize * nqpts; 1420 } else { 1421 num_emode_in +=1; 1422 size_B_in += esize * nqpts; 1423 } 1424 } 1425 } 1426 } 1427 1428 // Determine active output basis; basis_out and rstr_out only used if same as input, TODO 1429 ierr = CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields); CeedChk(ierr); 1430 CeedInt num_emode_out = 0, num_B_out_mats_to_load = 0, size_B_out = 0; 1431 CeedEvalMode *eval_mode_out = NULL; 1432 CeedBasis basis_out = NULL; 1433 CeedElemRestriction rstr_out = NULL; 1434 for (CeedInt i=0; i<num_output_fields; i++) { 1435 CeedVector vec; 1436 ierr = CeedOperatorFieldGetVector(output_fields[i], &vec); CeedChk(ierr); 1437 if (vec == CEED_VECTOR_ACTIVE) { 1438 ierr = CeedOperatorFieldGetBasis(output_fields[i], &basis_out); 1439 CeedChk(ierr); 1440 ierr = CeedOperatorFieldGetElemRestriction(output_fields[i], &rstr_out); 1441 CeedChk(ierr); 1442 if (rstr_out && rstr_out != rstr_in) 1443 // LCOV_EXCL_START 1444 return CeedError(ceed, CEED_ERROR_BACKEND, 1445 "Multi-field non-composite operator assembly not supported"); 1446 // LCOV_EXCL_STOP 1447 CeedEvalMode eval_mode; 1448 ierr = CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode); 1449 CeedChk(ierr); 1450 if (eval_mode != CEED_EVAL_NONE) { 1451 ierr = CeedRealloc(num_B_out_mats_to_load + 1, &eval_mode_out); CeedChk(ierr); 1452 eval_mode_out[num_B_out_mats_to_load] = eval_mode; 1453 num_B_out_mats_to_load += 1; 1454 if (eval_mode == CEED_EVAL_GRAD) { 1455 num_emode_out += dim; 1456 size_B_out += dim * esize * nqpts; 1457 } else { 1458 num_emode_out +=1; 1459 size_B_out += esize * nqpts; 1460 } 1461 } 1462 } 1463 } 1464 1465 if (num_emode_in == 0 || num_emode_out == 0) 1466 // LCOV_EXCL_START 1467 return CeedError(ceed, CEED_ERROR_UNSUPPORTED, 1468 "Cannot assemble operator without inputs/outputs"); 1469 // LCOV_EXCL_STOP 1470 1471 CeedInt nelem, ncomp; 1472 ierr = CeedElemRestrictionGetNumElements(rstr_in, &nelem); CeedChk(ierr); 1473 ierr = CeedElemRestrictionGetNumComponents(rstr_in, &ncomp); CeedChk(ierr); 1474 1475 ierr = CeedCalloc(1, &impl->asmb); CeedChkBackend(ierr); 1476 CeedOperatorAssemble_Cuda *asmb = impl->asmb; 1477 asmb->nelem = nelem; 1478 1479 // Compile kernels 1480 int elemsPerBlock = 1; 1481 asmb->elemsPerBlock = elemsPerBlock; 1482 CeedInt block_size = esize * esize * elemsPerBlock; 1483 Ceed_Cuda *cuda_data; 1484 ierr = CeedGetData(ceed, &cuda_data); CeedChkBackend(ierr); 1485 if (block_size > cuda_data->device_prop.maxThreadsPerBlock) { 1486 // Use fallback kernel with 1D threadblock 1487 block_size = esize * elemsPerBlock; 1488 asmb->block_size_x = esize; 1489 asmb->block_size_y = 1; 1490 ierr = CeedCompileCuda(ceed, assemblykernelbigelem, &asmb->module, 7, 1491 "NELEM", nelem, 1492 "NUMEMODEIN", num_emode_in, 1493 "NUMEMODEOUT", num_emode_out, 1494 "NQPTS", nqpts, 1495 "NNODES", esize, 1496 "BLOCK_SIZE", block_size, 1497 "NCOMP", ncomp 1498 ); CeedChk_Cu(ceed, ierr); 1499 } else { // Use kernel with 2D threadblock 1500 asmb->block_size_x = esize; 1501 asmb->block_size_y = esize; 1502 ierr = CeedCompileCuda(ceed, assemblykernel, &asmb->module, 7, 1503 "NELEM", nelem, 1504 "NUMEMODEIN", num_emode_in, 1505 "NUMEMODEOUT", num_emode_out, 1506 "NQPTS", nqpts, 1507 "NNODES", esize, 1508 "BLOCK_SIZE", block_size, 1509 "NCOMP", ncomp 1510 ); CeedChk_Cu(ceed, ierr); 1511 } 1512 ierr = CeedGetKernelCuda(ceed, asmb->module, "linearAssemble", 1513 &asmb->linearAssemble); CeedChk_Cu(ceed, ierr); 1514 1515 // Build 'full' B matrices (not 1D arrays used for tensor-product matrices) 1516 const CeedScalar *interp_in, *grad_in; 1517 ierr = CeedBasisGetInterp(basis_in, &interp_in); CeedChkBackend(ierr); 1518 ierr = CeedBasisGetGrad(basis_in, &grad_in); CeedChkBackend(ierr); 1519 1520 // Load into B_in, in order that they will be used in eval_mode 1521 const CeedInt inBytes = size_B_in * sizeof(CeedScalar); 1522 CeedInt mat_start = 0; 1523 ierr = cudaMalloc((void **) &asmb->d_B_in, inBytes); CeedChk_Cu(ceed, ierr); 1524 for (int i = 0; i < num_B_in_mats_to_load; i++) { 1525 CeedEvalMode eval_mode = eval_mode_in[i]; 1526 if (eval_mode == CEED_EVAL_INTERP) { 1527 ierr = cudaMemcpy(&asmb->d_B_in[mat_start], interp_in, 1528 esize * nqpts * sizeof(CeedScalar), 1529 cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 1530 mat_start += esize * nqpts; 1531 } else if (eval_mode == CEED_EVAL_GRAD) { 1532 ierr = cudaMemcpy(asmb->d_B_in, grad_in, 1533 dim * esize * nqpts * sizeof(CeedScalar), 1534 cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 1535 mat_start += dim * esize * nqpts; 1536 } 1537 } 1538 1539 const CeedScalar *interp_out, *grad_out; 1540 // Note that this function currently assumes 1 basis, so this should always be true 1541 // for now 1542 if (basis_out == basis_in) { 1543 interp_out = interp_in; 1544 grad_out = grad_in; 1545 } else { 1546 ierr = CeedBasisGetInterp(basis_out, &interp_out); CeedChkBackend(ierr); 1547 ierr = CeedBasisGetGrad(basis_out, &grad_out); CeedChkBackend(ierr); 1548 } 1549 1550 // Load into B_out, in order that they will be used in eval_mode 1551 const CeedInt outBytes = size_B_out * sizeof(CeedScalar); 1552 mat_start = 0; 1553 ierr = cudaMalloc((void **) &asmb->d_B_out, outBytes); CeedChk_Cu(ceed, ierr); 1554 for (int i = 0; i < num_B_out_mats_to_load; i++) { 1555 CeedEvalMode eval_mode = eval_mode_out[i]; 1556 if (eval_mode == CEED_EVAL_INTERP) { 1557 ierr = cudaMemcpy(&asmb->d_B_out[mat_start], interp_out, 1558 esize * nqpts * sizeof(CeedScalar), 1559 cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 1560 mat_start += esize * nqpts; 1561 } else if (eval_mode == CEED_EVAL_GRAD) { 1562 ierr = cudaMemcpy(&asmb->d_B_out[mat_start], grad_out, 1563 dim * esize * nqpts * sizeof(CeedScalar), 1564 cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 1565 mat_start += dim * esize * nqpts; 1566 } 1567 } 1568 return CEED_ERROR_SUCCESS; 1569 } 1570 1571 //------------------------------------------------------------------------------ 1572 // Single operator assembly 1573 //------------------------------------------------------------------------------ 1574 static int CeedSingleOperatorAssemble_Cuda(CeedOperator op, CeedInt offset, 1575 CeedVector values) { 1576 1577 int ierr; 1578 Ceed ceed; 1579 ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 1580 CeedOperator_Cuda *impl; 1581 ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr); 1582 1583 // Setup 1584 if (!impl->asmb) { 1585 ierr = CeedSingleOperatorAssembleSetup_Cuda(op); 1586 CeedChkBackend(ierr); 1587 } 1588 1589 // Assemble QFunction 1590 CeedVector assembled_qf; 1591 CeedElemRestriction rstr_q; 1592 ierr = CeedOperatorLinearAssembleQFunctionBuildOrUpdate( 1593 op, &assembled_qf, &rstr_q, CEED_REQUEST_IMMEDIATE); CeedChk(ierr); 1594 ierr = CeedElemRestrictionDestroy(&rstr_q); CeedChkBackend(ierr); 1595 CeedScalar *values_array; 1596 ierr = CeedVectorGetArrayWrite(values, CEED_MEM_DEVICE, &values_array); 1597 CeedChkBackend(ierr); 1598 values_array += offset; 1599 const CeedScalar *qf_array; 1600 ierr = CeedVectorGetArrayRead(assembled_qf, CEED_MEM_DEVICE, &qf_array); 1601 CeedChkBackend(ierr); 1602 1603 // Compute B^T D B 1604 const CeedInt nelem = impl->asmb->nelem; 1605 const CeedInt elemsPerBlock = impl->asmb->elemsPerBlock; 1606 const CeedInt grid = nelem/elemsPerBlock+(( 1607 nelem/elemsPerBlock*elemsPerBlock<nelem)?1:0); 1608 void *args[] = {&impl->asmb->d_B_in, &impl->asmb->d_B_out, 1609 &qf_array, &values_array 1610 }; 1611 ierr = CeedRunKernelDimCuda(ceed, impl->asmb->linearAssemble, grid, 1612 impl->asmb->block_size_x, impl->asmb->block_size_y, 1613 elemsPerBlock, args); 1614 CeedChkBackend(ierr); 1615 1616 1617 // Restore arrays 1618 ierr = CeedVectorRestoreArray(values, &values_array); CeedChkBackend(ierr); 1619 ierr = CeedVectorRestoreArrayRead(assembled_qf, &qf_array); 1620 CeedChkBackend(ierr); 1621 1622 // Cleanup 1623 ierr = CeedVectorDestroy(&assembled_qf); CeedChkBackend(ierr); 1624 1625 return CEED_ERROR_SUCCESS; 1626 } 1627 1628 //------------------------------------------------------------------------------ 1629 // Assemble matrix data for COO matrix of assembled operator. 1630 // The sparsity pattern is set by CeedOperatorLinearAssembleSymbolic. 1631 // 1632 // Note that this (and other assembly routines) currently assume only one 1633 // active input restriction/basis per operator (could have multiple basis eval 1634 // modes). 1635 // TODO: allow multiple active input restrictions/basis objects 1636 //------------------------------------------------------------------------------ 1637 int CeedOperatorLinearAssemble_Cuda(CeedOperator op, CeedVector values) { 1638 1639 // As done in the default implementation, loop through suboperators 1640 // for composite operators, or call single operator assembly otherwise 1641 bool is_composite; 1642 CeedInt ierr; 1643 ierr = CeedOperatorIsComposite(op, &is_composite); CeedChk(ierr); 1644 1645 CeedElemRestriction rstr; 1646 CeedInt num_elem, elem_size, num_comp; 1647 1648 CeedInt offset = 0; 1649 if (is_composite) { 1650 CeedInt num_suboperators; 1651 ierr = CeedOperatorGetNumSub(op, &num_suboperators); CeedChk(ierr); 1652 CeedOperator *sub_operators; 1653 ierr = CeedOperatorGetSubList(op, &sub_operators); CeedChk(ierr); 1654 for (int k = 0; k < num_suboperators; ++k) { 1655 ierr = CeedSingleOperatorAssemble_Cuda(sub_operators[k], offset, values); 1656 CeedChk(ierr); 1657 ierr = CeedOperatorGetActiveElemRestriction(sub_operators[k], &rstr); 1658 CeedChk(ierr); 1659 ierr = CeedElemRestrictionGetNumElements(rstr, &num_elem); CeedChk(ierr); 1660 ierr = CeedElemRestrictionGetElementSize(rstr, &elem_size); CeedChk(ierr); 1661 ierr = CeedElemRestrictionGetNumComponents(rstr, &num_comp); CeedChk(ierr); 1662 offset += elem_size*num_comp * elem_size*num_comp * num_elem; 1663 } 1664 } else { 1665 ierr = CeedSingleOperatorAssemble_Cuda(op, offset, values); CeedChk(ierr); 1666 } 1667 1668 return CEED_ERROR_SUCCESS; 1669 } 1670 //------------------------------------------------------------------------------ 1671 // Create operator 1672 //------------------------------------------------------------------------------ 1673 int CeedOperatorCreate_Cuda(CeedOperator op) { 1674 int ierr; 1675 Ceed ceed; 1676 ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 1677 CeedOperator_Cuda *impl; 1678 1679 ierr = CeedCalloc(1, &impl); CeedChkBackend(ierr); 1680 ierr = CeedOperatorSetData(op, impl); CeedChkBackend(ierr); 1681 1682 ierr = CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction", 1683 CeedOperatorLinearAssembleQFunction_Cuda); 1684 CeedChkBackend(ierr); 1685 ierr = CeedSetBackendFunction(ceed, "Operator", op, 1686 "LinearAssembleQFunctionUpdate", 1687 CeedOperatorLinearAssembleQFunctionUpdate_Cuda); 1688 CeedChkBackend(ierr); 1689 ierr = CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", 1690 CeedOperatorLinearAssembleAddDiagonal_Cuda); 1691 CeedChkBackend(ierr); 1692 ierr = CeedSetBackendFunction(ceed, "Operator", op, 1693 "LinearAssembleAddPointBlockDiagonal", 1694 CeedOperatorLinearAssembleAddPointBlockDiagonal_Cuda); 1695 CeedChkBackend(ierr); 1696 ierr = CeedSetBackendFunction(ceed, "Operator", op, 1697 "LinearAssemble", CeedOperatorLinearAssemble_Cuda); 1698 CeedChkBackend(ierr); 1699 ierr = CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", 1700 CeedOperatorApplyAdd_Cuda); CeedChkBackend(ierr); 1701 ierr = CeedSetBackendFunction(ceed, "Operator", op, "Destroy", 1702 CeedOperatorDestroy_Cuda); CeedChkBackend(ierr); 1703 return CEED_ERROR_SUCCESS; 1704 } 1705 1706 //------------------------------------------------------------------------------ 1707 // Composite Operator Create 1708 //------------------------------------------------------------------------------ 1709 int CeedCompositeOperatorCreate_Cuda(CeedOperator op) { 1710 int ierr; 1711 Ceed ceed; 1712 ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 1713 1714 ierr = CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", 1715 CeedOperatorLinearAssembleAddDiagonal_Cuda); 1716 CeedChkBackend(ierr); 1717 ierr = CeedSetBackendFunction(ceed, "Operator", op, 1718 "LinearAssembleAddPointBlockDiagonal", 1719 CeedOperatorLinearAssembleAddPointBlockDiagonal_Cuda); 1720 CeedChkBackend(ierr); 1721 ierr = CeedSetBackendFunction(ceed, "Operator", op, 1722 "LinearAssemble", CeedOperatorLinearAssemble_Cuda); 1723 CeedChkBackend(ierr); 1724 return CEED_ERROR_SUCCESS; 1725 } 1726 //------------------------------------------------------------------------------ 1727