1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 #include <ceed/ceed.h> 9 #include <ceed/backend.h> 10 #include <assert.h> 11 #include <cuda.h> 12 #include <cuda_runtime.h> 13 #include <stdbool.h> 14 #include <string.h> 15 #include "ceed-cuda-ref.h" 16 #include "../cuda/ceed-cuda-compile.h" 17 18 //------------------------------------------------------------------------------ 19 // Destroy operator 20 //------------------------------------------------------------------------------ 21 static int CeedOperatorDestroy_Cuda(CeedOperator op) { 22 int ierr; 23 CeedOperator_Cuda *impl; 24 ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr); 25 26 // Apply data 27 for (CeedInt i = 0; i < impl->numein + impl->numeout; i++) { 28 ierr = CeedVectorDestroy(&impl->evecs[i]); CeedChkBackend(ierr); 29 } 30 ierr = CeedFree(&impl->evecs); CeedChkBackend(ierr); 31 32 for (CeedInt i = 0; i < impl->numein; i++) { 33 ierr = CeedVectorDestroy(&impl->qvecsin[i]); CeedChkBackend(ierr); 34 } 35 ierr = CeedFree(&impl->qvecsin); CeedChkBackend(ierr); 36 37 for (CeedInt i = 0; i < impl->numeout; i++) { 38 ierr = CeedVectorDestroy(&impl->qvecsout[i]); CeedChkBackend(ierr); 39 } 40 ierr = CeedFree(&impl->qvecsout); CeedChkBackend(ierr); 41 42 // QFunction assembly data 43 for (CeedInt i=0; i<impl->qfnumactivein; i++) { 44 ierr = CeedVectorDestroy(&impl->qfactivein[i]); CeedChkBackend(ierr); 45 } 46 ierr = CeedFree(&impl->qfactivein); CeedChkBackend(ierr); 47 48 // Diag data 49 if (impl->diag) { 50 Ceed ceed; 51 ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 52 CeedChk_Cu(ceed, cuModuleUnload(impl->diag->module)); 53 ierr = CeedFree(&impl->diag->h_emodein); CeedChkBackend(ierr); 54 ierr = CeedFree(&impl->diag->h_emodeout); CeedChkBackend(ierr); 55 ierr = cudaFree(impl->diag->d_emodein); CeedChk_Cu(ceed, ierr); 56 ierr = cudaFree(impl->diag->d_emodeout); CeedChk_Cu(ceed, ierr); 57 ierr = cudaFree(impl->diag->d_identity); CeedChk_Cu(ceed, ierr); 58 ierr = cudaFree(impl->diag->d_interpin); CeedChk_Cu(ceed, ierr); 59 ierr = cudaFree(impl->diag->d_interpout); CeedChk_Cu(ceed, ierr); 60 ierr = cudaFree(impl->diag->d_gradin); CeedChk_Cu(ceed, ierr); 61 ierr = cudaFree(impl->diag->d_gradout); CeedChk_Cu(ceed, ierr); 62 ierr = CeedElemRestrictionDestroy(&impl->diag->pbdiagrstr); 63 CeedChkBackend(ierr); 64 ierr = CeedVectorDestroy(&impl->diag->elemdiag); CeedChkBackend(ierr); 65 ierr = CeedVectorDestroy(&impl->diag->pbelemdiag); CeedChkBackend(ierr); 66 } 67 ierr = CeedFree(&impl->diag); CeedChkBackend(ierr); 68 69 ierr = CeedFree(&impl); CeedChkBackend(ierr); 70 return CEED_ERROR_SUCCESS; 71 } 72 73 //------------------------------------------------------------------------------ 74 // Setup infields or outfields 75 //------------------------------------------------------------------------------ 76 static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, 77 bool isinput, CeedVector *evecs, 78 CeedVector *qvecs, CeedInt starte, 79 CeedInt numfields, CeedInt Q, 80 CeedInt numelements) { 81 CeedInt dim, ierr, size; 82 Ceed ceed; 83 ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 84 CeedBasis basis; 85 CeedElemRestriction Erestrict; 86 CeedOperatorField *opfields; 87 CeedQFunctionField *qffields; 88 CeedVector fieldvec; 89 bool strided; 90 bool skiprestrict; 91 92 if (isinput) { 93 ierr = CeedOperatorGetFields(op, NULL, &opfields, NULL, NULL); 94 CeedChkBackend(ierr); 95 ierr = CeedQFunctionGetFields(qf, NULL, &qffields, NULL, NULL); 96 CeedChkBackend(ierr); 97 } else { 98 ierr = CeedOperatorGetFields(op, NULL, NULL, NULL, &opfields); 99 CeedChkBackend(ierr); 100 ierr = CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qffields); 101 CeedChkBackend(ierr); 102 } 103 104 // Loop over fields 105 for (CeedInt i = 0; i < numfields; i++) { 106 CeedEvalMode emode; 107 ierr = CeedQFunctionFieldGetEvalMode(qffields[i], &emode); CeedChkBackend(ierr); 108 109 strided = false; 110 skiprestrict = false; 111 if (emode != CEED_EVAL_WEIGHT) { 112 ierr = CeedOperatorFieldGetElemRestriction(opfields[i], &Erestrict); 113 CeedChkBackend(ierr); 114 115 // Check whether this field can skip the element restriction: 116 // must be passive input, with emode NONE, and have a strided restriction with 117 // CEED_STRIDES_BACKEND. 118 119 // First, check whether the field is input or output: 120 if (isinput) { 121 // Check for passive input: 122 ierr = CeedOperatorFieldGetVector(opfields[i], &fieldvec); CeedChkBackend(ierr); 123 if (fieldvec != CEED_VECTOR_ACTIVE) { 124 // Check emode 125 if (emode == CEED_EVAL_NONE) { 126 // Check for strided restriction 127 ierr = CeedElemRestrictionIsStrided(Erestrict, &strided); 128 CeedChkBackend(ierr); 129 if (strided) { 130 // Check if vector is already in preferred backend ordering 131 ierr = CeedElemRestrictionHasBackendStrides(Erestrict, 132 &skiprestrict); CeedChkBackend(ierr); 133 } 134 } 135 } 136 } 137 if (skiprestrict) { 138 // We do not need an E-Vector, but will use the input field vector's data 139 // directly in the operator application. 140 evecs[i + starte] = NULL; 141 } else { 142 ierr = CeedElemRestrictionCreateVector(Erestrict, NULL, 143 &evecs[i + starte]); 144 CeedChkBackend(ierr); 145 } 146 } 147 148 switch (emode) { 149 case CEED_EVAL_NONE: 150 ierr = CeedQFunctionFieldGetSize(qffields[i], &size); CeedChkBackend(ierr); 151 ierr = CeedVectorCreate(ceed, numelements * Q * size, &qvecs[i]); 152 CeedChkBackend(ierr); 153 break; 154 case CEED_EVAL_INTERP: 155 ierr = CeedQFunctionFieldGetSize(qffields[i], &size); CeedChkBackend(ierr); 156 ierr = CeedVectorCreate(ceed, numelements * Q * size, &qvecs[i]); 157 CeedChkBackend(ierr); 158 break; 159 case CEED_EVAL_GRAD: 160 ierr = CeedOperatorFieldGetBasis(opfields[i], &basis); CeedChkBackend(ierr); 161 ierr = CeedQFunctionFieldGetSize(qffields[i], &size); CeedChkBackend(ierr); 162 ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); 163 ierr = CeedVectorCreate(ceed, numelements * Q * size, &qvecs[i]); 164 CeedChkBackend(ierr); 165 break; 166 case CEED_EVAL_WEIGHT: // Only on input fields 167 ierr = CeedOperatorFieldGetBasis(opfields[i], &basis); CeedChkBackend(ierr); 168 ierr = CeedVectorCreate(ceed, numelements * Q, &qvecs[i]); CeedChkBackend(ierr); 169 ierr = CeedBasisApply(basis, numelements, CEED_NOTRANSPOSE, 170 CEED_EVAL_WEIGHT, NULL, qvecs[i]); CeedChkBackend(ierr); 171 break; 172 case CEED_EVAL_DIV: 173 break; // TODO: Not implemented 174 case CEED_EVAL_CURL: 175 break; // TODO: Not implemented 176 } 177 } 178 return CEED_ERROR_SUCCESS; 179 } 180 181 //------------------------------------------------------------------------------ 182 // CeedOperator needs to connect all the named fields (be they active or passive) 183 // to the named inputs and outputs of its CeedQFunction. 184 //------------------------------------------------------------------------------ 185 static int CeedOperatorSetup_Cuda(CeedOperator op) { 186 int ierr; 187 bool setupdone; 188 ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChkBackend(ierr); 189 if (setupdone) 190 return CEED_ERROR_SUCCESS; 191 Ceed ceed; 192 ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 193 CeedOperator_Cuda *impl; 194 ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr); 195 CeedQFunction qf; 196 ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr); 197 CeedInt Q, numelements, numinputfields, numoutputfields; 198 ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr); 199 ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr); 200 CeedChkBackend(ierr); 201 CeedOperatorField *opinputfields, *opoutputfields; 202 ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields, 203 &numoutputfields, &opoutputfields); 204 CeedChkBackend(ierr); 205 CeedQFunctionField *qfinputfields, *qfoutputfields; 206 ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields); 207 CeedChkBackend(ierr); 208 209 // Allocate 210 ierr = CeedCalloc(numinputfields + numoutputfields, &impl->evecs); 211 CeedChkBackend(ierr); 212 213 ierr = CeedCalloc(CEED_FIELD_MAX, &impl->qvecsin); CeedChkBackend(ierr); 214 ierr = CeedCalloc(CEED_FIELD_MAX, &impl->qvecsout); CeedChkBackend(ierr); 215 216 impl->numein = numinputfields; impl->numeout = numoutputfields; 217 218 // Set up infield and outfield evecs and qvecs 219 // Infields 220 ierr = CeedOperatorSetupFields_Cuda(qf, op, true, 221 impl->evecs, impl->qvecsin, 0, 222 numinputfields, Q, numelements); 223 CeedChkBackend(ierr); 224 225 // Outfields 226 ierr = CeedOperatorSetupFields_Cuda(qf, op, false, 227 impl->evecs, impl->qvecsout, 228 numinputfields, numoutputfields, Q, 229 numelements); CeedChkBackend(ierr); 230 231 ierr = CeedOperatorSetSetupDone(op); CeedChkBackend(ierr); 232 return CEED_ERROR_SUCCESS; 233 } 234 235 //------------------------------------------------------------------------------ 236 // Setup Operator Inputs 237 //------------------------------------------------------------------------------ 238 static inline int CeedOperatorSetupInputs_Cuda(CeedInt numinputfields, 239 CeedQFunctionField *qfinputfields, CeedOperatorField *opinputfields, 240 CeedVector invec, const bool skipactive, CeedScalar *edata[2*CEED_FIELD_MAX], 241 CeedOperator_Cuda *impl, CeedRequest *request) { 242 CeedInt ierr; 243 CeedEvalMode emode; 244 CeedVector vec; 245 CeedElemRestriction Erestrict; 246 247 for (CeedInt i = 0; i < numinputfields; i++) { 248 // Get input vector 249 ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr); 250 if (vec == CEED_VECTOR_ACTIVE) { 251 if (skipactive) 252 continue; 253 else 254 vec = invec; 255 } 256 257 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 258 CeedChkBackend(ierr); 259 if (emode == CEED_EVAL_WEIGHT) { // Skip 260 } else { 261 // Get input vector 262 ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr); 263 // Get input element restriction 264 ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 265 CeedChkBackend(ierr); 266 if (vec == CEED_VECTOR_ACTIVE) 267 vec = invec; 268 // Restrict, if necessary 269 if (!impl->evecs[i]) { 270 // No restriction for this field; read data directly from vec. 271 ierr = CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, 272 (const CeedScalar **) &edata[i]); 273 CeedChkBackend(ierr); 274 } else { 275 ierr = CeedElemRestrictionApply(Erestrict, CEED_NOTRANSPOSE, vec, 276 impl->evecs[i], request); CeedChkBackend(ierr); 277 // Get evec 278 ierr = CeedVectorGetArrayRead(impl->evecs[i], CEED_MEM_DEVICE, 279 (const CeedScalar **) &edata[i]); 280 CeedChkBackend(ierr); 281 } 282 } 283 } 284 return CEED_ERROR_SUCCESS; 285 } 286 287 //------------------------------------------------------------------------------ 288 // Input Basis Action 289 //------------------------------------------------------------------------------ 290 static inline int CeedOperatorInputBasis_Cuda(CeedInt numelements, 291 CeedQFunctionField *qfinputfields, CeedOperatorField *opinputfields, 292 CeedInt numinputfields, const bool skipactive, 293 CeedScalar *edata[2*CEED_FIELD_MAX],CeedOperator_Cuda *impl) { 294 CeedInt ierr; 295 CeedInt elemsize, size; 296 CeedElemRestriction Erestrict; 297 CeedEvalMode emode; 298 CeedBasis basis; 299 300 for (CeedInt i=0; i<numinputfields; i++) { 301 // Skip active input 302 if (skipactive) { 303 CeedVector vec; 304 ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr); 305 if (vec == CEED_VECTOR_ACTIVE) 306 continue; 307 } 308 // Get elemsize, emode, size 309 ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 310 CeedChkBackend(ierr); 311 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 312 CeedChkBackend(ierr); 313 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 314 CeedChkBackend(ierr); 315 ierr = CeedQFunctionFieldGetSize(qfinputfields[i], &size); CeedChkBackend(ierr); 316 // Basis action 317 switch (emode) { 318 case CEED_EVAL_NONE: 319 ierr = CeedVectorSetArray(impl->qvecsin[i], CEED_MEM_DEVICE, 320 CEED_USE_POINTER, edata[i]); CeedChkBackend(ierr); 321 break; 322 case CEED_EVAL_INTERP: 323 ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); 324 CeedChkBackend(ierr); 325 ierr = CeedBasisApply(basis, numelements, CEED_NOTRANSPOSE, 326 CEED_EVAL_INTERP, impl->evecs[i], 327 impl->qvecsin[i]); CeedChkBackend(ierr); 328 break; 329 case CEED_EVAL_GRAD: 330 ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); 331 CeedChkBackend(ierr); 332 ierr = CeedBasisApply(basis, numelements, CEED_NOTRANSPOSE, 333 CEED_EVAL_GRAD, impl->evecs[i], 334 impl->qvecsin[i]); CeedChkBackend(ierr); 335 break; 336 case CEED_EVAL_WEIGHT: 337 break; // No action 338 case CEED_EVAL_DIV: 339 break; // TODO: Not implemented 340 case CEED_EVAL_CURL: 341 break; // TODO: Not implemented 342 } 343 } 344 return CEED_ERROR_SUCCESS; 345 } 346 347 //------------------------------------------------------------------------------ 348 // Restore Input Vectors 349 //------------------------------------------------------------------------------ 350 static inline int CeedOperatorRestoreInputs_Cuda(CeedInt numinputfields, 351 CeedQFunctionField *qfinputfields, CeedOperatorField *opinputfields, 352 const bool skipactive, CeedScalar *edata[2*CEED_FIELD_MAX], 353 CeedOperator_Cuda *impl) { 354 CeedInt ierr; 355 CeedEvalMode emode; 356 CeedVector vec; 357 358 for (CeedInt i = 0; i < numinputfields; i++) { 359 // Skip active input 360 if (skipactive) { 361 ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr); 362 if (vec == CEED_VECTOR_ACTIVE) 363 continue; 364 } 365 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 366 CeedChkBackend(ierr); 367 if (emode == CEED_EVAL_WEIGHT) { // Skip 368 } else { 369 if (!impl->evecs[i]) { // This was a skiprestrict case 370 ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr); 371 ierr = CeedVectorRestoreArrayRead(vec, 372 (const CeedScalar **)&edata[i]); 373 CeedChkBackend(ierr); 374 } else { 375 ierr = CeedVectorRestoreArrayRead(impl->evecs[i], 376 (const CeedScalar **) &edata[i]); 377 CeedChkBackend(ierr); 378 } 379 } 380 } 381 return CEED_ERROR_SUCCESS; 382 } 383 384 //------------------------------------------------------------------------------ 385 // Apply and add to output 386 //------------------------------------------------------------------------------ 387 static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector invec, 388 CeedVector outvec, CeedRequest *request) { 389 int ierr; 390 CeedOperator_Cuda *impl; 391 ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr); 392 CeedQFunction qf; 393 ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr); 394 CeedInt Q, numelements, elemsize, numinputfields, numoutputfields, size; 395 ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr); 396 ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr); 397 CeedChkBackend(ierr); 398 CeedOperatorField *opinputfields, *opoutputfields; 399 ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields, 400 &numoutputfields, &opoutputfields); 401 CeedChkBackend(ierr); 402 CeedQFunctionField *qfinputfields, *qfoutputfields; 403 ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields); 404 CeedChkBackend(ierr); 405 CeedEvalMode emode; 406 CeedVector vec; 407 CeedBasis basis; 408 CeedElemRestriction Erestrict; 409 CeedScalar *edata[2*CEED_FIELD_MAX] = {0}; 410 411 // Setup 412 ierr = CeedOperatorSetup_Cuda(op); CeedChkBackend(ierr); 413 414 // Input Evecs and Restriction 415 ierr = CeedOperatorSetupInputs_Cuda(numinputfields, qfinputfields, 416 opinputfields, invec, false, edata, 417 impl, request); CeedChkBackend(ierr); 418 419 // Input basis apply if needed 420 ierr = CeedOperatorInputBasis_Cuda(numelements, qfinputfields, opinputfields, 421 numinputfields, false, edata, impl); 422 CeedChkBackend(ierr); 423 424 // Output pointers, as necessary 425 for (CeedInt i = 0; i < numoutputfields; i++) { 426 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 427 CeedChkBackend(ierr); 428 if (emode == CEED_EVAL_NONE) { 429 // Set the output Q-Vector to use the E-Vector data directly. 430 ierr = CeedVectorGetArrayWrite(impl->evecs[i + impl->numein], CEED_MEM_DEVICE, 431 &edata[i + numinputfields]); CeedChkBackend(ierr); 432 ierr = CeedVectorSetArray(impl->qvecsout[i], CEED_MEM_DEVICE, 433 CEED_USE_POINTER, edata[i + numinputfields]); 434 CeedChkBackend(ierr); 435 } 436 } 437 438 // Q function 439 ierr = CeedQFunctionApply(qf, numelements * Q, impl->qvecsin, impl->qvecsout); 440 CeedChkBackend(ierr); 441 442 // Output basis apply if needed 443 for (CeedInt i = 0; i < numoutputfields; i++) { 444 // Get elemsize, emode, size 445 ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 446 CeedChkBackend(ierr); 447 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 448 CeedChkBackend(ierr); 449 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 450 CeedChkBackend(ierr); 451 ierr = CeedQFunctionFieldGetSize(qfoutputfields[i], &size); 452 CeedChkBackend(ierr); 453 // Basis action 454 switch (emode) { 455 case CEED_EVAL_NONE: 456 break; 457 case CEED_EVAL_INTERP: 458 ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); 459 CeedChkBackend(ierr); 460 ierr = CeedBasisApply(basis, numelements, CEED_TRANSPOSE, 461 CEED_EVAL_INTERP, impl->qvecsout[i], 462 impl->evecs[i + impl->numein]); CeedChkBackend(ierr); 463 break; 464 case CEED_EVAL_GRAD: 465 ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); 466 CeedChkBackend(ierr); 467 ierr = CeedBasisApply(basis, numelements, CEED_TRANSPOSE, 468 CEED_EVAL_GRAD, impl->qvecsout[i], 469 impl->evecs[i + impl->numein]); CeedChkBackend(ierr); 470 break; 471 // LCOV_EXCL_START 472 case CEED_EVAL_WEIGHT: { 473 Ceed ceed; 474 ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 475 return CeedError(ceed, CEED_ERROR_BACKEND, 476 "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 477 break; // Should not occur 478 } 479 case CEED_EVAL_DIV: 480 break; // TODO: Not implemented 481 case CEED_EVAL_CURL: 482 break; // TODO: Not implemented 483 // LCOV_EXCL_STOP 484 } 485 } 486 487 // Output restriction 488 for (CeedInt i = 0; i < numoutputfields; i++) { 489 // Restore evec 490 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 491 CeedChkBackend(ierr); 492 if (emode == CEED_EVAL_NONE) { 493 ierr = CeedVectorRestoreArray(impl->evecs[i+impl->numein], 494 &edata[i + numinputfields]); 495 CeedChkBackend(ierr); 496 } 497 // Get output vector 498 ierr = CeedOperatorFieldGetVector(opoutputfields[i], &vec); 499 CeedChkBackend(ierr); 500 // Restrict 501 ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 502 CeedChkBackend(ierr); 503 // Active 504 if (vec == CEED_VECTOR_ACTIVE) 505 vec = outvec; 506 507 ierr = CeedElemRestrictionApply(Erestrict, CEED_TRANSPOSE, 508 impl->evecs[i + impl->numein], vec, 509 request); CeedChkBackend(ierr); 510 } 511 512 // Restore input arrays 513 ierr = CeedOperatorRestoreInputs_Cuda(numinputfields, qfinputfields, 514 opinputfields, false, edata, impl); 515 CeedChkBackend(ierr); 516 return CEED_ERROR_SUCCESS; 517 } 518 519 //------------------------------------------------------------------------------ 520 // Core code for assembling linear QFunction 521 //------------------------------------------------------------------------------ 522 static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op, 523 bool build_objects, CeedVector *assembled, CeedElemRestriction *rstr, 524 CeedRequest *request) { 525 int ierr; 526 CeedOperator_Cuda *impl; 527 ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr); 528 CeedQFunction qf; 529 ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr); 530 CeedInt Q, numelements, numinputfields, numoutputfields, size; 531 ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr); 532 ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr); 533 CeedChkBackend(ierr); 534 CeedOperatorField *opinputfields, *opoutputfields; 535 ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields, 536 &numoutputfields, &opoutputfields); 537 CeedChkBackend(ierr); 538 CeedQFunctionField *qfinputfields, *qfoutputfields; 539 ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields); 540 CeedChkBackend(ierr); 541 CeedVector vec; 542 CeedInt numactivein = impl->qfnumactivein, numactiveout = impl->qfnumactiveout; 543 CeedVector *activein = impl->qfactivein; 544 CeedScalar *a, *tmp; 545 Ceed ceed, ceedparent; 546 ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 547 ierr = CeedGetOperatorFallbackParentCeed(ceed, &ceedparent); 548 CeedChkBackend(ierr); 549 ceedparent = ceedparent ? ceedparent : ceed; 550 CeedScalar *edata[2*CEED_FIELD_MAX]; 551 552 // Setup 553 ierr = CeedOperatorSetup_Cuda(op); CeedChkBackend(ierr); 554 555 // Check for identity 556 bool identityqf; 557 ierr = CeedQFunctionIsIdentity(qf, &identityqf); CeedChkBackend(ierr); 558 if (identityqf) 559 // LCOV_EXCL_START 560 return CeedError(ceed, CEED_ERROR_BACKEND, 561 "Assembling identity QFunctions not supported"); 562 // LCOV_EXCL_STOP 563 564 // Input Evecs and Restriction 565 ierr = CeedOperatorSetupInputs_Cuda(numinputfields, qfinputfields, 566 opinputfields, NULL, true, edata, 567 impl, request); CeedChkBackend(ierr); 568 569 // Count number of active input fields 570 if (!numactivein) { 571 for (CeedInt i=0; i<numinputfields; i++) { 572 // Get input vector 573 ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr); 574 // Check if active input 575 if (vec == CEED_VECTOR_ACTIVE) { 576 ierr = CeedQFunctionFieldGetSize(qfinputfields[i], &size); CeedChkBackend(ierr); 577 ierr = CeedVectorSetValue(impl->qvecsin[i], 0.0); CeedChkBackend(ierr); 578 ierr = CeedVectorGetArray(impl->qvecsin[i], CEED_MEM_DEVICE, &tmp); 579 CeedChkBackend(ierr); 580 ierr = CeedRealloc(numactivein + size, &activein); CeedChkBackend(ierr); 581 for (CeedInt field = 0; field < size; field++) { 582 ierr = CeedVectorCreate(ceed, Q*numelements, 583 &activein[numactivein+field]); CeedChkBackend(ierr); 584 ierr = CeedVectorSetArray(activein[numactivein+field], CEED_MEM_DEVICE, 585 CEED_USE_POINTER, &tmp[field*Q*numelements]); 586 CeedChkBackend(ierr); 587 } 588 numactivein += size; 589 ierr = CeedVectorRestoreArray(impl->qvecsin[i], &tmp); CeedChkBackend(ierr); 590 } 591 } 592 impl->qfnumactivein = numactivein; 593 impl->qfactivein = activein; 594 } 595 596 // Count number of active output fields 597 if (!numactiveout) { 598 for (CeedInt i=0; i<numoutputfields; i++) { 599 // Get output vector 600 ierr = CeedOperatorFieldGetVector(opoutputfields[i], &vec); 601 CeedChkBackend(ierr); 602 // Check if active output 603 if (vec == CEED_VECTOR_ACTIVE) { 604 ierr = CeedQFunctionFieldGetSize(qfoutputfields[i], &size); 605 CeedChkBackend(ierr); 606 numactiveout += size; 607 } 608 } 609 impl->qfnumactiveout = numactiveout; 610 } 611 612 // Check sizes 613 if (!numactivein || !numactiveout) 614 // LCOV_EXCL_START 615 return CeedError(ceed, CEED_ERROR_BACKEND, 616 "Cannot assemble QFunction without active inputs " 617 "and outputs"); 618 // LCOV_EXCL_STOP 619 620 // Build objects if needed 621 if (build_objects) { 622 // Create output restriction 623 CeedInt strides[3] = {1, numelements*Q, Q}; /* *NOPAD* */ 624 ierr = CeedElemRestrictionCreateStrided(ceedparent, numelements, Q, 625 numactivein*numactiveout, 626 numactivein*numactiveout*numelements*Q, 627 strides, rstr); CeedChkBackend(ierr); 628 // Create assembled vector 629 ierr = CeedVectorCreate(ceedparent, numelements*Q*numactivein*numactiveout, 630 assembled); CeedChkBackend(ierr); 631 } 632 ierr = CeedVectorSetValue(*assembled, 0.0); CeedChkBackend(ierr); 633 ierr = CeedVectorGetArray(*assembled, CEED_MEM_DEVICE, &a); 634 CeedChkBackend(ierr); 635 636 // Input basis apply 637 ierr = CeedOperatorInputBasis_Cuda(numelements, qfinputfields, opinputfields, 638 numinputfields, true, edata, impl); 639 CeedChkBackend(ierr); 640 641 // Assemble QFunction 642 for (CeedInt in=0; in<numactivein; in++) { 643 // Set Inputs 644 ierr = CeedVectorSetValue(activein[in], 1.0); CeedChkBackend(ierr); 645 if (numactivein > 1) { 646 ierr = CeedVectorSetValue(activein[(in+numactivein-1)%numactivein], 647 0.0); CeedChkBackend(ierr); 648 } 649 // Set Outputs 650 for (CeedInt out=0; out<numoutputfields; out++) { 651 // Get output vector 652 ierr = CeedOperatorFieldGetVector(opoutputfields[out], &vec); 653 CeedChkBackend(ierr); 654 // Check if active output 655 if (vec == CEED_VECTOR_ACTIVE) { 656 CeedVectorSetArray(impl->qvecsout[out], CEED_MEM_DEVICE, 657 CEED_USE_POINTER, a); CeedChkBackend(ierr); 658 ierr = CeedQFunctionFieldGetSize(qfoutputfields[out], &size); 659 CeedChkBackend(ierr); 660 a += size*Q*numelements; // Advance the pointer by the size of the output 661 } 662 } 663 // Apply QFunction 664 ierr = CeedQFunctionApply(qf, Q*numelements, impl->qvecsin, impl->qvecsout); 665 CeedChkBackend(ierr); 666 } 667 668 // Un-set output Qvecs to prevent accidental overwrite of Assembled 669 for (CeedInt out=0; out<numoutputfields; out++) { 670 // Get output vector 671 ierr = CeedOperatorFieldGetVector(opoutputfields[out], &vec); 672 CeedChkBackend(ierr); 673 // Check if active output 674 if (vec == CEED_VECTOR_ACTIVE) { 675 ierr = CeedVectorTakeArray(impl->qvecsout[out], CEED_MEM_DEVICE, NULL); 676 CeedChkBackend(ierr); 677 } 678 } 679 680 // Restore input arrays 681 ierr = CeedOperatorRestoreInputs_Cuda(numinputfields, qfinputfields, 682 opinputfields, true, edata, impl); 683 CeedChkBackend(ierr); 684 685 // Restore output 686 ierr = CeedVectorRestoreArray(*assembled, &a); CeedChkBackend(ierr); 687 688 return CEED_ERROR_SUCCESS; 689 } 690 691 //------------------------------------------------------------------------------ 692 // Assemble Linear QFunction 693 //------------------------------------------------------------------------------ 694 static int CeedOperatorLinearAssembleQFunction_Cuda(CeedOperator op, 695 CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) { 696 return CeedOperatorLinearAssembleQFunctionCore_Cuda(op, true, assembled, rstr, 697 request); 698 } 699 700 //------------------------------------------------------------------------------ 701 // Update Assembled Linear QFunction 702 //------------------------------------------------------------------------------ 703 static int CeedOperatorLinearAssembleQFunctionUpdate_Cuda(CeedOperator op, 704 CeedVector assembled, CeedElemRestriction rstr, CeedRequest *request) { 705 return CeedOperatorLinearAssembleQFunctionCore_Cuda(op, false, &assembled, 706 &rstr, request); 707 } 708 709 //------------------------------------------------------------------------------ 710 // Diagonal assembly kernels 711 //------------------------------------------------------------------------------ 712 // *INDENT-OFF* 713 static const char *diagonalkernels = QUOTE( 714 715 typedef enum { 716 /// Perform no evaluation (either because there is no data or it is already at 717 /// quadrature points) 718 CEED_EVAL_NONE = 0, 719 /// Interpolate from nodes to quadrature points 720 CEED_EVAL_INTERP = 1, 721 /// Evaluate gradients at quadrature points from input in a nodal basis 722 CEED_EVAL_GRAD = 2, 723 /// Evaluate divergence at quadrature points from input in a nodal basis 724 CEED_EVAL_DIV = 4, 725 /// Evaluate curl at quadrature points from input in a nodal basis 726 CEED_EVAL_CURL = 8, 727 /// Using no input, evaluate quadrature weights on the reference element 728 CEED_EVAL_WEIGHT = 16, 729 } CeedEvalMode; 730 731 //------------------------------------------------------------------------------ 732 // Get Basis Emode Pointer 733 //------------------------------------------------------------------------------ 734 extern "C" __device__ void CeedOperatorGetBasisPointer_Cuda(const CeedScalar **basisptr, 735 CeedEvalMode emode, const CeedScalar *identity, const CeedScalar *interp, 736 const CeedScalar *grad) { 737 switch (emode) { 738 case CEED_EVAL_NONE: 739 *basisptr = identity; 740 break; 741 case CEED_EVAL_INTERP: 742 *basisptr = interp; 743 break; 744 case CEED_EVAL_GRAD: 745 *basisptr = grad; 746 break; 747 case CEED_EVAL_WEIGHT: 748 case CEED_EVAL_DIV: 749 case CEED_EVAL_CURL: 750 break; // Caught by QF Assembly 751 } 752 } 753 754 //------------------------------------------------------------------------------ 755 // Core code for diagonal assembly 756 //------------------------------------------------------------------------------ 757 __device__ void diagonalCore(const CeedInt nelem, 758 const CeedScalar maxnorm, const bool pointBlock, 759 const CeedScalar *identity, 760 const CeedScalar *interpin, const CeedScalar *gradin, 761 const CeedScalar *interpout, const CeedScalar *gradout, 762 const CeedEvalMode *emodein, const CeedEvalMode *emodeout, 763 const CeedScalar *__restrict__ assembledqfarray, 764 CeedScalar *__restrict__ elemdiagarray) { 765 const int tid = threadIdx.x; // running with P threads, tid is evec node 766 const CeedScalar qfvaluebound = maxnorm*1e-12; 767 768 // Compute the diagonal of B^T D B 769 // Each element 770 for (CeedInt e = blockIdx.x*blockDim.z + threadIdx.z; e < nelem; 771 e += gridDim.x*blockDim.z) { 772 CeedInt dout = -1; 773 // Each basis eval mode pair 774 for (CeedInt eout = 0; eout < NUMEMODEOUT; eout++) { 775 const CeedScalar *bt = NULL; 776 if (emodeout[eout] == CEED_EVAL_GRAD) 777 dout += 1; 778 CeedOperatorGetBasisPointer_Cuda(&bt, emodeout[eout], identity, interpout, 779 &gradout[dout*NQPTS*NNODES]); 780 CeedInt din = -1; 781 for (CeedInt ein = 0; ein < NUMEMODEIN; ein++) { 782 const CeedScalar *b = NULL; 783 if (emodein[ein] == CEED_EVAL_GRAD) 784 din += 1; 785 CeedOperatorGetBasisPointer_Cuda(&b, emodein[ein], identity, interpin, 786 &gradin[din*NQPTS*NNODES]); 787 // Each component 788 for (CeedInt compOut = 0; compOut < NCOMP; compOut++) { 789 // Each qpoint/node pair 790 if (pointBlock) { 791 // Point Block Diagonal 792 for (CeedInt compIn = 0; compIn < NCOMP; compIn++) { 793 CeedScalar evalue = 0.; 794 for (CeedInt q = 0; q < NQPTS; q++) { 795 const CeedScalar qfvalue = 796 assembledqfarray[((((ein*NCOMP+compIn)*NUMEMODEOUT+eout)* 797 NCOMP+compOut)*nelem+e)*NQPTS+q]; 798 if (abs(qfvalue) > qfvaluebound) 799 evalue += bt[q*NNODES+tid] * qfvalue * b[q*NNODES+tid]; 800 } 801 elemdiagarray[((compOut*NCOMP+compIn)*nelem+e)*NNODES+tid] += evalue; 802 } 803 } else { 804 // Diagonal Only 805 CeedScalar evalue = 0.; 806 for (CeedInt q = 0; q < NQPTS; q++) { 807 const CeedScalar qfvalue = 808 assembledqfarray[((((ein*NCOMP+compOut)*NUMEMODEOUT+eout)* 809 NCOMP+compOut)*nelem+e)*NQPTS+q]; 810 if (abs(qfvalue) > qfvaluebound) 811 evalue += bt[q*NNODES+tid] * qfvalue * b[q*NNODES+tid]; 812 } 813 elemdiagarray[(compOut*nelem+e)*NNODES+tid] += evalue; 814 } 815 } 816 } 817 } 818 } 819 } 820 821 //------------------------------------------------------------------------------ 822 // Linear diagonal 823 //------------------------------------------------------------------------------ 824 extern "C" __global__ void linearDiagonal(const CeedInt nelem, 825 const CeedScalar maxnorm, const CeedScalar *identity, 826 const CeedScalar *interpin, const CeedScalar *gradin, 827 const CeedScalar *interpout, const CeedScalar *gradout, 828 const CeedEvalMode *emodein, const CeedEvalMode *emodeout, 829 const CeedScalar *__restrict__ assembledqfarray, 830 CeedScalar *__restrict__ elemdiagarray) { 831 diagonalCore(nelem, maxnorm, false, identity, interpin, gradin, interpout, 832 gradout, emodein, emodeout, assembledqfarray, elemdiagarray); 833 } 834 835 //------------------------------------------------------------------------------ 836 // Linear point block diagonal 837 //------------------------------------------------------------------------------ 838 extern "C" __global__ void linearPointBlockDiagonal(const CeedInt nelem, 839 const CeedScalar maxnorm, const CeedScalar *identity, 840 const CeedScalar *interpin, const CeedScalar *gradin, 841 const CeedScalar *interpout, const CeedScalar *gradout, 842 const CeedEvalMode *emodein, const CeedEvalMode *emodeout, 843 const CeedScalar *__restrict__ assembledqfarray, 844 CeedScalar *__restrict__ elemdiagarray) { 845 diagonalCore(nelem, maxnorm, true, identity, interpin, gradin, interpout, 846 gradout, emodein, emodeout, assembledqfarray, elemdiagarray); 847 } 848 849 ); 850 // *INDENT-ON* 851 852 //------------------------------------------------------------------------------ 853 // Create point block restriction 854 //------------------------------------------------------------------------------ 855 static int CreatePBRestriction(CeedElemRestriction rstr, 856 CeedElemRestriction *pbRstr) { 857 int ierr; 858 Ceed ceed; 859 ierr = CeedElemRestrictionGetCeed(rstr, &ceed); CeedChkBackend(ierr); 860 const CeedInt *offsets; 861 ierr = CeedElemRestrictionGetOffsets(rstr, CEED_MEM_HOST, &offsets); 862 CeedChkBackend(ierr); 863 864 // Expand offsets 865 CeedInt nelem, ncomp, elemsize, compstride, max = 1, *pbOffsets; 866 ierr = CeedElemRestrictionGetNumElements(rstr, &nelem); CeedChkBackend(ierr); 867 ierr = CeedElemRestrictionGetNumComponents(rstr, &ncomp); CeedChkBackend(ierr); 868 ierr = CeedElemRestrictionGetElementSize(rstr, &elemsize); CeedChkBackend(ierr); 869 ierr = CeedElemRestrictionGetCompStride(rstr, &compstride); 870 CeedChkBackend(ierr); 871 CeedInt shift = ncomp; 872 if (compstride != 1) 873 shift *= ncomp; 874 ierr = CeedCalloc(nelem*elemsize, &pbOffsets); CeedChkBackend(ierr); 875 for (CeedInt i = 0; i < nelem*elemsize; i++) { 876 pbOffsets[i] = offsets[i]*shift; 877 if (pbOffsets[i] > max) 878 max = pbOffsets[i]; 879 } 880 881 // Create new restriction 882 ierr = CeedElemRestrictionCreate(ceed, nelem, elemsize, ncomp*ncomp, 1, 883 max + ncomp*ncomp, CEED_MEM_HOST, 884 CEED_OWN_POINTER, pbOffsets, pbRstr); 885 CeedChkBackend(ierr); 886 887 // Cleanup 888 ierr = CeedElemRestrictionRestoreOffsets(rstr, &offsets); CeedChkBackend(ierr); 889 890 return CEED_ERROR_SUCCESS; 891 } 892 893 //------------------------------------------------------------------------------ 894 // Assemble diagonal setup 895 //------------------------------------------------------------------------------ 896 static inline int CeedOperatorAssembleDiagonalSetup_Cuda(CeedOperator op, 897 const bool pointBlock) { 898 int ierr; 899 Ceed ceed; 900 ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 901 CeedQFunction qf; 902 ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr); 903 CeedInt numinputfields, numoutputfields; 904 ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields); 905 CeedChkBackend(ierr); 906 907 // Determine active input basis 908 CeedOperatorField *opfields; 909 CeedQFunctionField *qffields; 910 ierr = CeedOperatorGetFields(op, NULL, &opfields, NULL, NULL); 911 CeedChkBackend(ierr); 912 ierr = CeedQFunctionGetFields(qf, NULL, &qffields, NULL, NULL); 913 CeedChkBackend(ierr); 914 CeedInt numemodein = 0, ncomp = 0, dim = 1; 915 CeedEvalMode *emodein = NULL; 916 CeedBasis basisin = NULL; 917 CeedElemRestriction rstrin = NULL; 918 for (CeedInt i = 0; i < numinputfields; i++) { 919 CeedVector vec; 920 ierr = CeedOperatorFieldGetVector(opfields[i], &vec); CeedChkBackend(ierr); 921 if (vec == CEED_VECTOR_ACTIVE) { 922 CeedElemRestriction rstr; 923 ierr = CeedOperatorFieldGetBasis(opfields[i], &basisin); CeedChkBackend(ierr); 924 ierr = CeedBasisGetNumComponents(basisin, &ncomp); CeedChkBackend(ierr); 925 ierr = CeedBasisGetDimension(basisin, &dim); CeedChkBackend(ierr); 926 ierr = CeedOperatorFieldGetElemRestriction(opfields[i], &rstr); 927 CeedChkBackend(ierr); 928 if (rstrin && rstrin != rstr) 929 // LCOV_EXCL_START 930 return CeedError(ceed, CEED_ERROR_BACKEND, 931 "Multi-field non-composite operator diagonal assembly not supported"); 932 // LCOV_EXCL_STOP 933 rstrin = rstr; 934 CeedEvalMode emode; 935 ierr = CeedQFunctionFieldGetEvalMode(qffields[i], &emode); 936 CeedChkBackend(ierr); 937 switch (emode) { 938 case CEED_EVAL_NONE: 939 case CEED_EVAL_INTERP: 940 ierr = CeedRealloc(numemodein + 1, &emodein); CeedChkBackend(ierr); 941 emodein[numemodein] = emode; 942 numemodein += 1; 943 break; 944 case CEED_EVAL_GRAD: 945 ierr = CeedRealloc(numemodein + dim, &emodein); CeedChkBackend(ierr); 946 for (CeedInt d = 0; d < dim; d++) 947 emodein[numemodein+d] = emode; 948 numemodein += dim; 949 break; 950 case CEED_EVAL_WEIGHT: 951 case CEED_EVAL_DIV: 952 case CEED_EVAL_CURL: 953 break; // Caught by QF Assembly 954 } 955 } 956 } 957 958 // Determine active output basis 959 ierr = CeedOperatorGetFields(op, NULL, NULL, NULL, &opfields); 960 CeedChkBackend(ierr); 961 ierr = CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qffields); 962 CeedChkBackend(ierr); 963 CeedInt numemodeout = 0; 964 CeedEvalMode *emodeout = NULL; 965 CeedBasis basisout = NULL; 966 CeedElemRestriction rstrout = NULL; 967 for (CeedInt i = 0; i < numoutputfields; i++) { 968 CeedVector vec; 969 ierr = CeedOperatorFieldGetVector(opfields[i], &vec); CeedChkBackend(ierr); 970 if (vec == CEED_VECTOR_ACTIVE) { 971 CeedElemRestriction rstr; 972 ierr = CeedOperatorFieldGetBasis(opfields[i], &basisout); CeedChkBackend(ierr); 973 ierr = CeedOperatorFieldGetElemRestriction(opfields[i], &rstr); 974 CeedChkBackend(ierr); 975 if (rstrout && rstrout != rstr) 976 // LCOV_EXCL_START 977 return CeedError(ceed, CEED_ERROR_BACKEND, 978 "Multi-field non-composite operator diagonal assembly not supported"); 979 // LCOV_EXCL_STOP 980 rstrout = rstr; 981 CeedEvalMode emode; 982 ierr = CeedQFunctionFieldGetEvalMode(qffields[i], &emode); CeedChkBackend(ierr); 983 switch (emode) { 984 case CEED_EVAL_NONE: 985 case CEED_EVAL_INTERP: 986 ierr = CeedRealloc(numemodeout + 1, &emodeout); CeedChkBackend(ierr); 987 emodeout[numemodeout] = emode; 988 numemodeout += 1; 989 break; 990 case CEED_EVAL_GRAD: 991 ierr = CeedRealloc(numemodeout + dim, &emodeout); CeedChkBackend(ierr); 992 for (CeedInt d = 0; d < dim; d++) 993 emodeout[numemodeout+d] = emode; 994 numemodeout += dim; 995 break; 996 case CEED_EVAL_WEIGHT: 997 case CEED_EVAL_DIV: 998 case CEED_EVAL_CURL: 999 break; // Caught by QF Assembly 1000 } 1001 } 1002 } 1003 1004 // Operator data struct 1005 CeedOperator_Cuda *impl; 1006 ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr); 1007 ierr = CeedCalloc(1, &impl->diag); CeedChkBackend(ierr); 1008 CeedOperatorDiag_Cuda *diag = impl->diag; 1009 diag->basisin = basisin; 1010 diag->basisout = basisout; 1011 diag->h_emodein = emodein; 1012 diag->h_emodeout = emodeout; 1013 diag->numemodein = numemodein; 1014 diag->numemodeout = numemodeout; 1015 1016 // Assemble kernel 1017 CeedInt nnodes, nqpts; 1018 ierr = CeedBasisGetNumNodes(basisin, &nnodes); CeedChkBackend(ierr); 1019 ierr = CeedBasisGetNumQuadraturePoints(basisin, &nqpts); CeedChkBackend(ierr); 1020 diag->nnodes = nnodes; 1021 ierr = CeedCompileCuda(ceed, diagonalkernels, &diag->module, 5, 1022 "NUMEMODEIN", numemodein, 1023 "NUMEMODEOUT", numemodeout, 1024 "NNODES", nnodes, 1025 "NQPTS", nqpts, 1026 "NCOMP", ncomp 1027 ); CeedChk_Cu(ceed, ierr); 1028 ierr = CeedGetKernelCuda(ceed, diag->module, "linearDiagonal", 1029 &diag->linearDiagonal); CeedChk_Cu(ceed, ierr); 1030 ierr = CeedGetKernelCuda(ceed, diag->module, "linearPointBlockDiagonal", 1031 &diag->linearPointBlock); 1032 CeedChk_Cu(ceed, ierr); 1033 1034 // Basis matrices 1035 const CeedInt qBytes = nqpts * sizeof(CeedScalar); 1036 const CeedInt iBytes = qBytes * nnodes; 1037 const CeedInt gBytes = qBytes * nnodes * dim; 1038 const CeedInt eBytes = sizeof(CeedEvalMode); 1039 const CeedScalar *interpin, *interpout, *gradin, *gradout; 1040 1041 // CEED_EVAL_NONE 1042 CeedScalar *identity = NULL; 1043 bool evalNone = false; 1044 for (CeedInt i=0; i<numemodein; i++) 1045 evalNone = evalNone || (emodein[i] == CEED_EVAL_NONE); 1046 for (CeedInt i=0; i<numemodeout; i++) 1047 evalNone = evalNone || (emodeout[i] == CEED_EVAL_NONE); 1048 if (evalNone) { 1049 ierr = CeedCalloc(nqpts*nnodes, &identity); CeedChkBackend(ierr); 1050 for (CeedInt i=0; i<(nnodes<nqpts?nnodes:nqpts); i++) 1051 identity[i*nnodes+i] = 1.0; 1052 ierr = cudaMalloc((void **)&diag->d_identity, iBytes); CeedChk_Cu(ceed, ierr); 1053 ierr = cudaMemcpy(diag->d_identity, identity, iBytes, 1054 cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 1055 } 1056 1057 // CEED_EVAL_INTERP 1058 ierr = CeedBasisGetInterp(basisin, &interpin); CeedChkBackend(ierr); 1059 ierr = cudaMalloc((void **)&diag->d_interpin, iBytes); CeedChk_Cu(ceed, ierr); 1060 ierr = cudaMemcpy(diag->d_interpin, interpin, iBytes, 1061 cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 1062 ierr = CeedBasisGetInterp(basisout, &interpout); CeedChkBackend(ierr); 1063 ierr = cudaMalloc((void **)&diag->d_interpout, iBytes); CeedChk_Cu(ceed, ierr); 1064 ierr = cudaMemcpy(diag->d_interpout, interpout, iBytes, 1065 cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 1066 1067 // CEED_EVAL_GRAD 1068 ierr = CeedBasisGetGrad(basisin, &gradin); CeedChkBackend(ierr); 1069 ierr = cudaMalloc((void **)&diag->d_gradin, gBytes); CeedChk_Cu(ceed, ierr); 1070 ierr = cudaMemcpy(diag->d_gradin, gradin, gBytes, 1071 cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 1072 ierr = CeedBasisGetGrad(basisout, &gradout); CeedChkBackend(ierr); 1073 ierr = cudaMalloc((void **)&diag->d_gradout, gBytes); CeedChk_Cu(ceed, ierr); 1074 ierr = cudaMemcpy(diag->d_gradout, gradout, gBytes, 1075 cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 1076 1077 // Arrays of emodes 1078 ierr = cudaMalloc((void **)&diag->d_emodein, numemodein * eBytes); 1079 CeedChk_Cu(ceed, ierr); 1080 ierr = cudaMemcpy(diag->d_emodein, emodein, numemodein * eBytes, 1081 cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 1082 ierr = cudaMalloc((void **)&diag->d_emodeout, numemodeout * eBytes); 1083 CeedChk_Cu(ceed, ierr); 1084 ierr = cudaMemcpy(diag->d_emodeout, emodeout, numemodeout * eBytes, 1085 cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 1086 1087 // Restriction 1088 diag->diagrstr = rstrout; 1089 1090 return CEED_ERROR_SUCCESS; 1091 } 1092 1093 //------------------------------------------------------------------------------ 1094 // Assemble diagonal common code 1095 //------------------------------------------------------------------------------ 1096 static inline int CeedOperatorAssembleDiagonalCore_Cuda(CeedOperator op, 1097 CeedVector assembled, CeedRequest *request, const bool pointBlock) { 1098 int ierr; 1099 Ceed ceed; 1100 ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 1101 CeedOperator_Cuda *impl; 1102 ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr); 1103 1104 // Assemble QFunction 1105 CeedVector assembledqf; 1106 CeedElemRestriction rstr; 1107 ierr = CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembledqf, 1108 &rstr, request); CeedChkBackend(ierr); 1109 ierr = CeedElemRestrictionDestroy(&rstr); CeedChkBackend(ierr); 1110 CeedScalar maxnorm = 0; 1111 ierr = CeedVectorNorm(assembledqf, CEED_NORM_MAX, &maxnorm); 1112 CeedChkBackend(ierr); 1113 1114 // Setup 1115 if (!impl->diag) { 1116 ierr = CeedOperatorAssembleDiagonalSetup_Cuda(op, pointBlock); 1117 CeedChkBackend(ierr); 1118 } 1119 CeedOperatorDiag_Cuda *diag = impl->diag; 1120 assert(diag != NULL); 1121 1122 // Restriction 1123 if (pointBlock && !diag->pbdiagrstr) { 1124 CeedElemRestriction pbdiagrstr; 1125 ierr = CreatePBRestriction(diag->diagrstr, &pbdiagrstr); CeedChkBackend(ierr); 1126 diag->pbdiagrstr = pbdiagrstr; 1127 } 1128 CeedElemRestriction diagrstr = pointBlock ? diag->pbdiagrstr : diag->diagrstr; 1129 1130 // Create diagonal vector 1131 CeedVector elemdiag = pointBlock ? diag->pbelemdiag : diag->elemdiag; 1132 if (!elemdiag) { 1133 ierr = CeedElemRestrictionCreateVector(diagrstr, NULL, &elemdiag); 1134 CeedChkBackend(ierr); 1135 if (pointBlock) 1136 diag->pbelemdiag = elemdiag; 1137 else 1138 diag->elemdiag = elemdiag; 1139 } 1140 ierr = CeedVectorSetValue(elemdiag, 0.0); CeedChkBackend(ierr); 1141 1142 // Assemble element operator diagonals 1143 CeedScalar *elemdiagarray; 1144 const CeedScalar *assembledqfarray; 1145 ierr = CeedVectorGetArray(elemdiag, CEED_MEM_DEVICE, &elemdiagarray); 1146 CeedChkBackend(ierr); 1147 ierr = CeedVectorGetArrayRead(assembledqf, CEED_MEM_DEVICE, &assembledqfarray); 1148 CeedChkBackend(ierr); 1149 CeedInt nelem; 1150 ierr = CeedElemRestrictionGetNumElements(diagrstr, &nelem); 1151 CeedChkBackend(ierr); 1152 1153 // Compute the diagonal of B^T D B 1154 int elemsPerBlock = 1; 1155 int grid = nelem/elemsPerBlock+((nelem/elemsPerBlock*elemsPerBlock<nelem)?1:0); 1156 void *args[] = {(void *) &nelem, (void *) &maxnorm, &diag->d_identity, 1157 &diag->d_interpin, &diag->d_gradin, &diag->d_interpout, 1158 &diag->d_gradout, &diag->d_emodein, &diag->d_emodeout, 1159 &assembledqfarray, &elemdiagarray 1160 }; 1161 if (pointBlock) { 1162 ierr = CeedRunKernelDimCuda(ceed, diag->linearPointBlock, grid, 1163 diag->nnodes, 1, elemsPerBlock, args); 1164 CeedChkBackend(ierr); 1165 } else { 1166 ierr = CeedRunKernelDimCuda(ceed, diag->linearDiagonal, grid, 1167 diag->nnodes, 1, elemsPerBlock, args); 1168 CeedChkBackend(ierr); 1169 } 1170 1171 // Restore arrays 1172 ierr = CeedVectorRestoreArray(elemdiag, &elemdiagarray); CeedChkBackend(ierr); 1173 ierr = CeedVectorRestoreArrayRead(assembledqf, &assembledqfarray); 1174 CeedChkBackend(ierr); 1175 1176 // Assemble local operator diagonal 1177 ierr = CeedElemRestrictionApply(diagrstr, CEED_TRANSPOSE, elemdiag, 1178 assembled, request); CeedChkBackend(ierr); 1179 1180 // Cleanup 1181 ierr = CeedVectorDestroy(&assembledqf); CeedChkBackend(ierr); 1182 1183 return CEED_ERROR_SUCCESS; 1184 } 1185 1186 //------------------------------------------------------------------------------ 1187 // Assemble composite diagonal common code 1188 //------------------------------------------------------------------------------ 1189 static inline int CeedOperatorLinearAssembleAddDiagonalCompositeCore_Cuda( 1190 CeedOperator op, CeedVector assembled, CeedRequest *request, 1191 const bool pointBlock) { 1192 int ierr; 1193 CeedInt numSub; 1194 CeedOperator *subOperators; 1195 ierr = CeedOperatorGetNumSub(op, &numSub); CeedChkBackend(ierr); 1196 ierr = CeedOperatorGetSubList(op, &subOperators); CeedChkBackend(ierr); 1197 for (CeedInt i = 0; i < numSub; i++) { 1198 ierr = CeedOperatorAssembleDiagonalCore_Cuda(subOperators[i], assembled, 1199 request, pointBlock); CeedChkBackend(ierr); 1200 } 1201 return CEED_ERROR_SUCCESS; 1202 } 1203 1204 //------------------------------------------------------------------------------ 1205 // Assemble Linear Diagonal 1206 //------------------------------------------------------------------------------ 1207 static int CeedOperatorLinearAssembleAddDiagonal_Cuda(CeedOperator op, 1208 CeedVector assembled, CeedRequest *request) { 1209 int ierr; 1210 bool isComposite; 1211 ierr = CeedOperatorIsComposite(op, &isComposite); CeedChkBackend(ierr); 1212 if (isComposite) { 1213 return CeedOperatorLinearAssembleAddDiagonalCompositeCore_Cuda(op, assembled, 1214 request, false); 1215 } else { 1216 return CeedOperatorAssembleDiagonalCore_Cuda(op, assembled, request, false); 1217 } 1218 } 1219 1220 //------------------------------------------------------------------------------ 1221 // Assemble Linear Point Block Diagonal 1222 //------------------------------------------------------------------------------ 1223 static int CeedOperatorLinearAssembleAddPointBlockDiagonal_Cuda(CeedOperator op, 1224 CeedVector assembled, CeedRequest *request) { 1225 int ierr; 1226 bool isComposite; 1227 ierr = CeedOperatorIsComposite(op, &isComposite); CeedChkBackend(ierr); 1228 if (isComposite) { 1229 return CeedOperatorLinearAssembleAddDiagonalCompositeCore_Cuda(op, assembled, 1230 request, true); 1231 } else { 1232 return CeedOperatorAssembleDiagonalCore_Cuda(op, assembled, request, true); 1233 } 1234 } 1235 1236 //------------------------------------------------------------------------------ 1237 // Create operator 1238 //------------------------------------------------------------------------------ 1239 int CeedOperatorCreate_Cuda(CeedOperator op) { 1240 int ierr; 1241 Ceed ceed; 1242 ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 1243 CeedOperator_Cuda *impl; 1244 1245 ierr = CeedCalloc(1, &impl); CeedChkBackend(ierr); 1246 ierr = CeedOperatorSetData(op, impl); CeedChkBackend(ierr); 1247 1248 ierr = CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction", 1249 CeedOperatorLinearAssembleQFunction_Cuda); 1250 CeedChkBackend(ierr); 1251 ierr = CeedSetBackendFunction(ceed, "Operator", op, 1252 "LinearAssembleQFunctionUpdate", 1253 CeedOperatorLinearAssembleQFunctionUpdate_Cuda); 1254 CeedChkBackend(ierr); 1255 ierr = CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", 1256 CeedOperatorLinearAssembleAddDiagonal_Cuda); 1257 CeedChkBackend(ierr); 1258 ierr = CeedSetBackendFunction(ceed, "Operator", op, 1259 "LinearAssembleAddPointBlockDiagonal", 1260 CeedOperatorLinearAssembleAddPointBlockDiagonal_Cuda); 1261 CeedChkBackend(ierr); 1262 ierr = CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", 1263 CeedOperatorApplyAdd_Cuda); CeedChkBackend(ierr); 1264 ierr = CeedSetBackendFunction(ceed, "Operator", op, "Destroy", 1265 CeedOperatorDestroy_Cuda); CeedChkBackend(ierr); 1266 return CEED_ERROR_SUCCESS; 1267 } 1268 1269 //------------------------------------------------------------------------------ 1270 // Composite Operator Create 1271 //------------------------------------------------------------------------------ 1272 int CeedCompositeOperatorCreate_Cuda(CeedOperator op) { 1273 int ierr; 1274 Ceed ceed; 1275 ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 1276 1277 ierr = CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", 1278 CeedOperatorLinearAssembleAddDiagonal_Cuda); 1279 CeedChkBackend(ierr); 1280 ierr = CeedSetBackendFunction(ceed, "Operator", op, 1281 "LinearAssembleAddPointBlockDiagonal", 1282 CeedOperatorLinearAssembleAddPointBlockDiagonal_Cuda); 1283 CeedChkBackend(ierr); 1284 return CEED_ERROR_SUCCESS; 1285 } 1286 //------------------------------------------------------------------------------ 1287