1*0d0321e0SJeremy L Thompson // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 2*0d0321e0SJeremy L Thompson // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 3*0d0321e0SJeremy L Thompson // All Rights reserved. See files LICENSE and NOTICE for details. 4*0d0321e0SJeremy L Thompson // 5*0d0321e0SJeremy L Thompson // This file is part of CEED, a collection of benchmarks, miniapps, software 6*0d0321e0SJeremy L Thompson // libraries and APIs for efficient high-order finite element and spectral 7*0d0321e0SJeremy L Thompson // element discretizations for exascale applications. For more information and 8*0d0321e0SJeremy L Thompson // source code availability see http://github.com/ceed. 9*0d0321e0SJeremy L Thompson // 10*0d0321e0SJeremy L Thompson // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11*0d0321e0SJeremy L Thompson // a collaborative effort of two U.S. Department of Energy organizations (Office 12*0d0321e0SJeremy L Thompson // of Science and the National Nuclear Security Administration) responsible for 13*0d0321e0SJeremy L Thompson // the planning and preparation of a capable exascale ecosystem, including 14*0d0321e0SJeremy L Thompson // software, applications, hardware, advanced system engineering and early 15*0d0321e0SJeremy L Thompson // testbed platforms, in support of the nation's exascale computing imperative. 16*0d0321e0SJeremy L Thompson 17*0d0321e0SJeremy L Thompson #include <ceed/ceed.h> 18*0d0321e0SJeremy L Thompson #include <ceed/backend.h> 19*0d0321e0SJeremy L Thompson #include <assert.h> 20*0d0321e0SJeremy L Thompson #include <cuda.h> 21*0d0321e0SJeremy L Thompson #include <cuda_runtime.h> 22*0d0321e0SJeremy L Thompson #include <stdbool.h> 23*0d0321e0SJeremy L Thompson #include <string.h> 24*0d0321e0SJeremy L Thompson #include "ceed-cuda-ref.h" 25*0d0321e0SJeremy L Thompson #include "../cuda/ceed-cuda-compile.h" 26*0d0321e0SJeremy L Thompson 27*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 28*0d0321e0SJeremy L Thompson // Destroy operator 29*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 30*0d0321e0SJeremy L Thompson static int CeedOperatorDestroy_Cuda(CeedOperator op) { 31*0d0321e0SJeremy L Thompson int ierr; 32*0d0321e0SJeremy L Thompson CeedOperator_Cuda *impl; 33*0d0321e0SJeremy L Thompson ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr); 34*0d0321e0SJeremy L Thompson 35*0d0321e0SJeremy L Thompson // Apply data 36*0d0321e0SJeremy L Thompson for (CeedInt i = 0; i < impl->numein + impl->numeout; i++) { 37*0d0321e0SJeremy L Thompson ierr = CeedVectorDestroy(&impl->evecs[i]); CeedChkBackend(ierr); 38*0d0321e0SJeremy L Thompson } 39*0d0321e0SJeremy L Thompson ierr = CeedFree(&impl->evecs); CeedChkBackend(ierr); 40*0d0321e0SJeremy L Thompson 41*0d0321e0SJeremy L Thompson for (CeedInt i = 0; i < impl->numein; i++) { 42*0d0321e0SJeremy L Thompson ierr = CeedVectorDestroy(&impl->qvecsin[i]); CeedChkBackend(ierr); 43*0d0321e0SJeremy L Thompson } 44*0d0321e0SJeremy L Thompson ierr = CeedFree(&impl->qvecsin); CeedChkBackend(ierr); 45*0d0321e0SJeremy L Thompson 46*0d0321e0SJeremy L Thompson for (CeedInt i = 0; i < impl->numeout; i++) { 47*0d0321e0SJeremy L Thompson ierr = CeedVectorDestroy(&impl->qvecsout[i]); CeedChkBackend(ierr); 48*0d0321e0SJeremy L Thompson } 49*0d0321e0SJeremy L Thompson ierr = CeedFree(&impl->qvecsout); CeedChkBackend(ierr); 50*0d0321e0SJeremy L Thompson 51*0d0321e0SJeremy L Thompson // QFunction assembly data 52*0d0321e0SJeremy L Thompson for (CeedInt i=0; i<impl->qfnumactivein; i++) { 53*0d0321e0SJeremy L Thompson ierr = CeedVectorDestroy(&impl->qfactivein[i]); CeedChkBackend(ierr); 54*0d0321e0SJeremy L Thompson } 55*0d0321e0SJeremy L Thompson ierr = CeedFree(&impl->qfactivein); CeedChkBackend(ierr); 56*0d0321e0SJeremy L Thompson 57*0d0321e0SJeremy L Thompson // Diag data 58*0d0321e0SJeremy L Thompson if (impl->diag) { 59*0d0321e0SJeremy L Thompson Ceed ceed; 60*0d0321e0SJeremy L Thompson ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 61*0d0321e0SJeremy L Thompson CeedChk_Cu(ceed, cuModuleUnload(impl->diag->module)); 62*0d0321e0SJeremy L Thompson ierr = CeedFree(&impl->diag->h_emodein); CeedChkBackend(ierr); 63*0d0321e0SJeremy L Thompson ierr = CeedFree(&impl->diag->h_emodeout); CeedChkBackend(ierr); 64*0d0321e0SJeremy L Thompson ierr = cudaFree(impl->diag->d_emodein); CeedChk_Cu(ceed, ierr); 65*0d0321e0SJeremy L Thompson ierr = cudaFree(impl->diag->d_emodeout); CeedChk_Cu(ceed, ierr); 66*0d0321e0SJeremy L Thompson ierr = cudaFree(impl->diag->d_identity); CeedChk_Cu(ceed, ierr); 67*0d0321e0SJeremy L Thompson ierr = cudaFree(impl->diag->d_interpin); CeedChk_Cu(ceed, ierr); 68*0d0321e0SJeremy L Thompson ierr = cudaFree(impl->diag->d_interpout); CeedChk_Cu(ceed, ierr); 69*0d0321e0SJeremy L Thompson ierr = cudaFree(impl->diag->d_gradin); CeedChk_Cu(ceed, ierr); 70*0d0321e0SJeremy L Thompson ierr = cudaFree(impl->diag->d_gradout); CeedChk_Cu(ceed, ierr); 71*0d0321e0SJeremy L Thompson ierr = CeedElemRestrictionDestroy(&impl->diag->pbdiagrstr); 72*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 73*0d0321e0SJeremy L Thompson ierr = CeedVectorDestroy(&impl->diag->elemdiag); CeedChkBackend(ierr); 74*0d0321e0SJeremy L Thompson ierr = CeedVectorDestroy(&impl->diag->pbelemdiag); CeedChkBackend(ierr); 75*0d0321e0SJeremy L Thompson } 76*0d0321e0SJeremy L Thompson ierr = CeedFree(&impl->diag); CeedChkBackend(ierr); 77*0d0321e0SJeremy L Thompson 78*0d0321e0SJeremy L Thompson ierr = CeedFree(&impl); CeedChkBackend(ierr); 79*0d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 80*0d0321e0SJeremy L Thompson } 81*0d0321e0SJeremy L Thompson 82*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 83*0d0321e0SJeremy L Thompson // Setup infields or outfields 84*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 85*0d0321e0SJeremy L Thompson static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, 86*0d0321e0SJeremy L Thompson bool isinput, CeedVector *evecs, 87*0d0321e0SJeremy L Thompson CeedVector *qvecs, CeedInt starte, 88*0d0321e0SJeremy L Thompson CeedInt numfields, CeedInt Q, 89*0d0321e0SJeremy L Thompson CeedInt numelements) { 90*0d0321e0SJeremy L Thompson CeedInt dim, ierr, size; 91*0d0321e0SJeremy L Thompson Ceed ceed; 92*0d0321e0SJeremy L Thompson ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 93*0d0321e0SJeremy L Thompson CeedBasis basis; 94*0d0321e0SJeremy L Thompson CeedElemRestriction Erestrict; 95*0d0321e0SJeremy L Thompson CeedOperatorField *opfields; 96*0d0321e0SJeremy L Thompson CeedQFunctionField *qffields; 97*0d0321e0SJeremy L Thompson CeedVector fieldvec; 98*0d0321e0SJeremy L Thompson bool strided; 99*0d0321e0SJeremy L Thompson bool skiprestrict; 100*0d0321e0SJeremy L Thompson 101*0d0321e0SJeremy L Thompson if (isinput) { 102*0d0321e0SJeremy L Thompson ierr = CeedOperatorGetFields(op, NULL, &opfields, NULL, NULL); 103*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 104*0d0321e0SJeremy L Thompson ierr = CeedQFunctionGetFields(qf, NULL, &qffields, NULL, NULL); 105*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 106*0d0321e0SJeremy L Thompson } else { 107*0d0321e0SJeremy L Thompson ierr = CeedOperatorGetFields(op, NULL, NULL, NULL, &opfields); 108*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 109*0d0321e0SJeremy L Thompson ierr = CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qffields); 110*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 111*0d0321e0SJeremy L Thompson } 112*0d0321e0SJeremy L Thompson 113*0d0321e0SJeremy L Thompson // Loop over fields 114*0d0321e0SJeremy L Thompson for (CeedInt i = 0; i < numfields; i++) { 115*0d0321e0SJeremy L Thompson CeedEvalMode emode; 116*0d0321e0SJeremy L Thompson ierr = CeedQFunctionFieldGetEvalMode(qffields[i], &emode); CeedChkBackend(ierr); 117*0d0321e0SJeremy L Thompson 118*0d0321e0SJeremy L Thompson strided = false; 119*0d0321e0SJeremy L Thompson skiprestrict = false; 120*0d0321e0SJeremy L Thompson if (emode != CEED_EVAL_WEIGHT) { 121*0d0321e0SJeremy L Thompson ierr = CeedOperatorFieldGetElemRestriction(opfields[i], &Erestrict); 122*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 123*0d0321e0SJeremy L Thompson 124*0d0321e0SJeremy L Thompson // Check whether this field can skip the element restriction: 125*0d0321e0SJeremy L Thompson // must be passive input, with emode NONE, and have a strided restriction with 126*0d0321e0SJeremy L Thompson // CEED_STRIDES_BACKEND. 127*0d0321e0SJeremy L Thompson 128*0d0321e0SJeremy L Thompson // First, check whether the field is input or output: 129*0d0321e0SJeremy L Thompson if (isinput) { 130*0d0321e0SJeremy L Thompson // Check for passive input: 131*0d0321e0SJeremy L Thompson ierr = CeedOperatorFieldGetVector(opfields[i], &fieldvec); CeedChkBackend(ierr); 132*0d0321e0SJeremy L Thompson if (fieldvec != CEED_VECTOR_ACTIVE) { 133*0d0321e0SJeremy L Thompson // Check emode 134*0d0321e0SJeremy L Thompson if (emode == CEED_EVAL_NONE) { 135*0d0321e0SJeremy L Thompson // Check for strided restriction 136*0d0321e0SJeremy L Thompson ierr = CeedElemRestrictionIsStrided(Erestrict, &strided); 137*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 138*0d0321e0SJeremy L Thompson if (strided) { 139*0d0321e0SJeremy L Thompson // Check if vector is already in preferred backend ordering 140*0d0321e0SJeremy L Thompson ierr = CeedElemRestrictionHasBackendStrides(Erestrict, 141*0d0321e0SJeremy L Thompson &skiprestrict); CeedChkBackend(ierr); 142*0d0321e0SJeremy L Thompson } 143*0d0321e0SJeremy L Thompson } 144*0d0321e0SJeremy L Thompson } 145*0d0321e0SJeremy L Thompson } 146*0d0321e0SJeremy L Thompson if (skiprestrict) { 147*0d0321e0SJeremy L Thompson // We do not need an E-Vector, but will use the input field vector's data 148*0d0321e0SJeremy L Thompson // directly in the operator application. 149*0d0321e0SJeremy L Thompson evecs[i + starte] = NULL; 150*0d0321e0SJeremy L Thompson } else { 151*0d0321e0SJeremy L Thompson ierr = CeedElemRestrictionCreateVector(Erestrict, NULL, 152*0d0321e0SJeremy L Thompson &evecs[i + starte]); 153*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 154*0d0321e0SJeremy L Thompson } 155*0d0321e0SJeremy L Thompson } 156*0d0321e0SJeremy L Thompson 157*0d0321e0SJeremy L Thompson switch (emode) { 158*0d0321e0SJeremy L Thompson case CEED_EVAL_NONE: 159*0d0321e0SJeremy L Thompson ierr = CeedQFunctionFieldGetSize(qffields[i], &size); CeedChkBackend(ierr); 160*0d0321e0SJeremy L Thompson ierr = CeedVectorCreate(ceed, numelements * Q * size, &qvecs[i]); 161*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 162*0d0321e0SJeremy L Thompson break; 163*0d0321e0SJeremy L Thompson case CEED_EVAL_INTERP: 164*0d0321e0SJeremy L Thompson ierr = CeedQFunctionFieldGetSize(qffields[i], &size); CeedChkBackend(ierr); 165*0d0321e0SJeremy L Thompson ierr = CeedVectorCreate(ceed, numelements * Q * size, &qvecs[i]); 166*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 167*0d0321e0SJeremy L Thompson break; 168*0d0321e0SJeremy L Thompson case CEED_EVAL_GRAD: 169*0d0321e0SJeremy L Thompson ierr = CeedOperatorFieldGetBasis(opfields[i], &basis); CeedChkBackend(ierr); 170*0d0321e0SJeremy L Thompson ierr = CeedQFunctionFieldGetSize(qffields[i], &size); CeedChkBackend(ierr); 171*0d0321e0SJeremy L Thompson ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); 172*0d0321e0SJeremy L Thompson ierr = CeedVectorCreate(ceed, numelements * Q * size, &qvecs[i]); 173*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 174*0d0321e0SJeremy L Thompson break; 175*0d0321e0SJeremy L Thompson case CEED_EVAL_WEIGHT: // Only on input fields 176*0d0321e0SJeremy L Thompson ierr = CeedOperatorFieldGetBasis(opfields[i], &basis); CeedChkBackend(ierr); 177*0d0321e0SJeremy L Thompson ierr = CeedVectorCreate(ceed, numelements * Q, &qvecs[i]); CeedChkBackend(ierr); 178*0d0321e0SJeremy L Thompson ierr = CeedBasisApply(basis, numelements, CEED_NOTRANSPOSE, 179*0d0321e0SJeremy L Thompson CEED_EVAL_WEIGHT, NULL, qvecs[i]); CeedChkBackend(ierr); 180*0d0321e0SJeremy L Thompson break; 181*0d0321e0SJeremy L Thompson case CEED_EVAL_DIV: 182*0d0321e0SJeremy L Thompson break; // TODO: Not implemented 183*0d0321e0SJeremy L Thompson case CEED_EVAL_CURL: 184*0d0321e0SJeremy L Thompson break; // TODO: Not implemented 185*0d0321e0SJeremy L Thompson } 186*0d0321e0SJeremy L Thompson } 187*0d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 188*0d0321e0SJeremy L Thompson } 189*0d0321e0SJeremy L Thompson 190*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 191*0d0321e0SJeremy L Thompson // CeedOperator needs to connect all the named fields (be they active or passive) 192*0d0321e0SJeremy L Thompson // to the named inputs and outputs of its CeedQFunction. 193*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 194*0d0321e0SJeremy L Thompson static int CeedOperatorSetup_Cuda(CeedOperator op) { 195*0d0321e0SJeremy L Thompson int ierr; 196*0d0321e0SJeremy L Thompson bool setupdone; 197*0d0321e0SJeremy L Thompson ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChkBackend(ierr); 198*0d0321e0SJeremy L Thompson if (setupdone) 199*0d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 200*0d0321e0SJeremy L Thompson Ceed ceed; 201*0d0321e0SJeremy L Thompson ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 202*0d0321e0SJeremy L Thompson CeedOperator_Cuda *impl; 203*0d0321e0SJeremy L Thompson ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr); 204*0d0321e0SJeremy L Thompson CeedQFunction qf; 205*0d0321e0SJeremy L Thompson ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr); 206*0d0321e0SJeremy L Thompson CeedInt Q, numelements, numinputfields, numoutputfields; 207*0d0321e0SJeremy L Thompson ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr); 208*0d0321e0SJeremy L Thompson ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr); 209*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 210*0d0321e0SJeremy L Thompson CeedOperatorField *opinputfields, *opoutputfields; 211*0d0321e0SJeremy L Thompson ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields, 212*0d0321e0SJeremy L Thompson &numoutputfields, &opoutputfields); 213*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 214*0d0321e0SJeremy L Thompson CeedQFunctionField *qfinputfields, *qfoutputfields; 215*0d0321e0SJeremy L Thompson ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields); 216*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 217*0d0321e0SJeremy L Thompson 218*0d0321e0SJeremy L Thompson // Allocate 219*0d0321e0SJeremy L Thompson ierr = CeedCalloc(numinputfields + numoutputfields, &impl->evecs); 220*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 221*0d0321e0SJeremy L Thompson 222*0d0321e0SJeremy L Thompson ierr = CeedCalloc(CEED_FIELD_MAX, &impl->qvecsin); CeedChkBackend(ierr); 223*0d0321e0SJeremy L Thompson ierr = CeedCalloc(CEED_FIELD_MAX, &impl->qvecsout); CeedChkBackend(ierr); 224*0d0321e0SJeremy L Thompson 225*0d0321e0SJeremy L Thompson impl->numein = numinputfields; impl->numeout = numoutputfields; 226*0d0321e0SJeremy L Thompson 227*0d0321e0SJeremy L Thompson // Set up infield and outfield evecs and qvecs 228*0d0321e0SJeremy L Thompson // Infields 229*0d0321e0SJeremy L Thompson ierr = CeedOperatorSetupFields_Cuda(qf, op, true, 230*0d0321e0SJeremy L Thompson impl->evecs, impl->qvecsin, 0, 231*0d0321e0SJeremy L Thompson numinputfields, Q, numelements); 232*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 233*0d0321e0SJeremy L Thompson 234*0d0321e0SJeremy L Thompson // Outfields 235*0d0321e0SJeremy L Thompson ierr = CeedOperatorSetupFields_Cuda(qf, op, false, 236*0d0321e0SJeremy L Thompson impl->evecs, impl->qvecsout, 237*0d0321e0SJeremy L Thompson numinputfields, numoutputfields, Q, 238*0d0321e0SJeremy L Thompson numelements); CeedChkBackend(ierr); 239*0d0321e0SJeremy L Thompson 240*0d0321e0SJeremy L Thompson ierr = CeedOperatorSetSetupDone(op); CeedChkBackend(ierr); 241*0d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 242*0d0321e0SJeremy L Thompson } 243*0d0321e0SJeremy L Thompson 244*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 245*0d0321e0SJeremy L Thompson // Setup Operator Inputs 246*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 247*0d0321e0SJeremy L Thompson static inline int CeedOperatorSetupInputs_Cuda(CeedInt numinputfields, 248*0d0321e0SJeremy L Thompson CeedQFunctionField *qfinputfields, CeedOperatorField *opinputfields, 249*0d0321e0SJeremy L Thompson CeedVector invec, const bool skipactive, CeedScalar *edata[2*CEED_FIELD_MAX], 250*0d0321e0SJeremy L Thompson CeedOperator_Cuda *impl, CeedRequest *request) { 251*0d0321e0SJeremy L Thompson CeedInt ierr; 252*0d0321e0SJeremy L Thompson CeedEvalMode emode; 253*0d0321e0SJeremy L Thompson CeedVector vec; 254*0d0321e0SJeremy L Thompson CeedElemRestriction Erestrict; 255*0d0321e0SJeremy L Thompson 256*0d0321e0SJeremy L Thompson for (CeedInt i = 0; i < numinputfields; i++) { 257*0d0321e0SJeremy L Thompson // Get input vector 258*0d0321e0SJeremy L Thompson ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr); 259*0d0321e0SJeremy L Thompson if (vec == CEED_VECTOR_ACTIVE) { 260*0d0321e0SJeremy L Thompson if (skipactive) 261*0d0321e0SJeremy L Thompson continue; 262*0d0321e0SJeremy L Thompson else 263*0d0321e0SJeremy L Thompson vec = invec; 264*0d0321e0SJeremy L Thompson } 265*0d0321e0SJeremy L Thompson 266*0d0321e0SJeremy L Thompson ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 267*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 268*0d0321e0SJeremy L Thompson if (emode == CEED_EVAL_WEIGHT) { // Skip 269*0d0321e0SJeremy L Thompson } else { 270*0d0321e0SJeremy L Thompson // Get input vector 271*0d0321e0SJeremy L Thompson ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr); 272*0d0321e0SJeremy L Thompson // Get input element restriction 273*0d0321e0SJeremy L Thompson ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 274*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 275*0d0321e0SJeremy L Thompson if (vec == CEED_VECTOR_ACTIVE) 276*0d0321e0SJeremy L Thompson vec = invec; 277*0d0321e0SJeremy L Thompson // Restrict, if necessary 278*0d0321e0SJeremy L Thompson if (!impl->evecs[i]) { 279*0d0321e0SJeremy L Thompson // No restriction for this field; read data directly from vec. 280*0d0321e0SJeremy L Thompson ierr = CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, 281*0d0321e0SJeremy L Thompson (const CeedScalar **) &edata[i]); 282*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 283*0d0321e0SJeremy L Thompson } else { 284*0d0321e0SJeremy L Thompson ierr = CeedElemRestrictionApply(Erestrict, CEED_NOTRANSPOSE, vec, 285*0d0321e0SJeremy L Thompson impl->evecs[i], request); CeedChkBackend(ierr); 286*0d0321e0SJeremy L Thompson // Get evec 287*0d0321e0SJeremy L Thompson ierr = CeedVectorGetArrayRead(impl->evecs[i], CEED_MEM_DEVICE, 288*0d0321e0SJeremy L Thompson (const CeedScalar **) &edata[i]); 289*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 290*0d0321e0SJeremy L Thompson } 291*0d0321e0SJeremy L Thompson } 292*0d0321e0SJeremy L Thompson } 293*0d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 294*0d0321e0SJeremy L Thompson } 295*0d0321e0SJeremy L Thompson 296*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 297*0d0321e0SJeremy L Thompson // Input Basis Action 298*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 299*0d0321e0SJeremy L Thompson static inline int CeedOperatorInputBasis_Cuda(CeedInt numelements, 300*0d0321e0SJeremy L Thompson CeedQFunctionField *qfinputfields, CeedOperatorField *opinputfields, 301*0d0321e0SJeremy L Thompson CeedInt numinputfields, const bool skipactive, 302*0d0321e0SJeremy L Thompson CeedScalar *edata[2*CEED_FIELD_MAX],CeedOperator_Cuda *impl) { 303*0d0321e0SJeremy L Thompson CeedInt ierr; 304*0d0321e0SJeremy L Thompson CeedInt elemsize, size; 305*0d0321e0SJeremy L Thompson CeedElemRestriction Erestrict; 306*0d0321e0SJeremy L Thompson CeedEvalMode emode; 307*0d0321e0SJeremy L Thompson CeedBasis basis; 308*0d0321e0SJeremy L Thompson 309*0d0321e0SJeremy L Thompson for (CeedInt i=0; i<numinputfields; i++) { 310*0d0321e0SJeremy L Thompson // Skip active input 311*0d0321e0SJeremy L Thompson if (skipactive) { 312*0d0321e0SJeremy L Thompson CeedVector vec; 313*0d0321e0SJeremy L Thompson ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr); 314*0d0321e0SJeremy L Thompson if (vec == CEED_VECTOR_ACTIVE) 315*0d0321e0SJeremy L Thompson continue; 316*0d0321e0SJeremy L Thompson } 317*0d0321e0SJeremy L Thompson // Get elemsize, emode, size 318*0d0321e0SJeremy L Thompson ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 319*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 320*0d0321e0SJeremy L Thompson ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 321*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 322*0d0321e0SJeremy L Thompson ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 323*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 324*0d0321e0SJeremy L Thompson ierr = CeedQFunctionFieldGetSize(qfinputfields[i], &size); CeedChkBackend(ierr); 325*0d0321e0SJeremy L Thompson // Basis action 326*0d0321e0SJeremy L Thompson switch (emode) { 327*0d0321e0SJeremy L Thompson case CEED_EVAL_NONE: 328*0d0321e0SJeremy L Thompson ierr = CeedVectorSetArray(impl->qvecsin[i], CEED_MEM_DEVICE, 329*0d0321e0SJeremy L Thompson CEED_USE_POINTER, edata[i]); CeedChkBackend(ierr); 330*0d0321e0SJeremy L Thompson break; 331*0d0321e0SJeremy L Thompson case CEED_EVAL_INTERP: 332*0d0321e0SJeremy L Thompson ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); 333*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 334*0d0321e0SJeremy L Thompson ierr = CeedBasisApply(basis, numelements, CEED_NOTRANSPOSE, 335*0d0321e0SJeremy L Thompson CEED_EVAL_INTERP, impl->evecs[i], 336*0d0321e0SJeremy L Thompson impl->qvecsin[i]); CeedChkBackend(ierr); 337*0d0321e0SJeremy L Thompson break; 338*0d0321e0SJeremy L Thompson case CEED_EVAL_GRAD: 339*0d0321e0SJeremy L Thompson ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); 340*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 341*0d0321e0SJeremy L Thompson ierr = CeedBasisApply(basis, numelements, CEED_NOTRANSPOSE, 342*0d0321e0SJeremy L Thompson CEED_EVAL_GRAD, impl->evecs[i], 343*0d0321e0SJeremy L Thompson impl->qvecsin[i]); CeedChkBackend(ierr); 344*0d0321e0SJeremy L Thompson break; 345*0d0321e0SJeremy L Thompson case CEED_EVAL_WEIGHT: 346*0d0321e0SJeremy L Thompson break; // No action 347*0d0321e0SJeremy L Thompson case CEED_EVAL_DIV: 348*0d0321e0SJeremy L Thompson break; // TODO: Not implemented 349*0d0321e0SJeremy L Thompson case CEED_EVAL_CURL: 350*0d0321e0SJeremy L Thompson break; // TODO: Not implemented 351*0d0321e0SJeremy L Thompson } 352*0d0321e0SJeremy L Thompson } 353*0d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 354*0d0321e0SJeremy L Thompson } 355*0d0321e0SJeremy L Thompson 356*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 357*0d0321e0SJeremy L Thompson // Restore Input Vectors 358*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 359*0d0321e0SJeremy L Thompson static inline int CeedOperatorRestoreInputs_Cuda(CeedInt numinputfields, 360*0d0321e0SJeremy L Thompson CeedQFunctionField *qfinputfields, CeedOperatorField *opinputfields, 361*0d0321e0SJeremy L Thompson const bool skipactive, CeedScalar *edata[2*CEED_FIELD_MAX], 362*0d0321e0SJeremy L Thompson CeedOperator_Cuda *impl) { 363*0d0321e0SJeremy L Thompson CeedInt ierr; 364*0d0321e0SJeremy L Thompson CeedEvalMode emode; 365*0d0321e0SJeremy L Thompson CeedVector vec; 366*0d0321e0SJeremy L Thompson 367*0d0321e0SJeremy L Thompson for (CeedInt i = 0; i < numinputfields; i++) { 368*0d0321e0SJeremy L Thompson // Skip active input 369*0d0321e0SJeremy L Thompson if (skipactive) { 370*0d0321e0SJeremy L Thompson ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr); 371*0d0321e0SJeremy L Thompson if (vec == CEED_VECTOR_ACTIVE) 372*0d0321e0SJeremy L Thompson continue; 373*0d0321e0SJeremy L Thompson } 374*0d0321e0SJeremy L Thompson ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 375*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 376*0d0321e0SJeremy L Thompson if (emode == CEED_EVAL_WEIGHT) { // Skip 377*0d0321e0SJeremy L Thompson } else { 378*0d0321e0SJeremy L Thompson if (!impl->evecs[i]) { // This was a skiprestrict case 379*0d0321e0SJeremy L Thompson ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr); 380*0d0321e0SJeremy L Thompson ierr = CeedVectorRestoreArrayRead(vec, 381*0d0321e0SJeremy L Thompson (const CeedScalar **)&edata[i]); 382*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 383*0d0321e0SJeremy L Thompson } else { 384*0d0321e0SJeremy L Thompson ierr = CeedVectorRestoreArrayRead(impl->evecs[i], 385*0d0321e0SJeremy L Thompson (const CeedScalar **) &edata[i]); 386*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 387*0d0321e0SJeremy L Thompson } 388*0d0321e0SJeremy L Thompson } 389*0d0321e0SJeremy L Thompson } 390*0d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 391*0d0321e0SJeremy L Thompson } 392*0d0321e0SJeremy L Thompson 393*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 394*0d0321e0SJeremy L Thompson // Apply and add to output 395*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 396*0d0321e0SJeremy L Thompson static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector invec, 397*0d0321e0SJeremy L Thompson CeedVector outvec, CeedRequest *request) { 398*0d0321e0SJeremy L Thompson int ierr; 399*0d0321e0SJeremy L Thompson CeedOperator_Cuda *impl; 400*0d0321e0SJeremy L Thompson ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr); 401*0d0321e0SJeremy L Thompson CeedQFunction qf; 402*0d0321e0SJeremy L Thompson ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr); 403*0d0321e0SJeremy L Thompson CeedInt Q, numelements, elemsize, numinputfields, numoutputfields, size; 404*0d0321e0SJeremy L Thompson ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr); 405*0d0321e0SJeremy L Thompson ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr); 406*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 407*0d0321e0SJeremy L Thompson CeedOperatorField *opinputfields, *opoutputfields; 408*0d0321e0SJeremy L Thompson ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields, 409*0d0321e0SJeremy L Thompson &numoutputfields, &opoutputfields); 410*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 411*0d0321e0SJeremy L Thompson CeedQFunctionField *qfinputfields, *qfoutputfields; 412*0d0321e0SJeremy L Thompson ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields); 413*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 414*0d0321e0SJeremy L Thompson CeedEvalMode emode; 415*0d0321e0SJeremy L Thompson CeedVector vec; 416*0d0321e0SJeremy L Thompson CeedBasis basis; 417*0d0321e0SJeremy L Thompson CeedElemRestriction Erestrict; 418*0d0321e0SJeremy L Thompson CeedScalar *edata[2*CEED_FIELD_MAX] = {0}; 419*0d0321e0SJeremy L Thompson 420*0d0321e0SJeremy L Thompson // Setup 421*0d0321e0SJeremy L Thompson ierr = CeedOperatorSetup_Cuda(op); CeedChkBackend(ierr); 422*0d0321e0SJeremy L Thompson 423*0d0321e0SJeremy L Thompson // Input Evecs and Restriction 424*0d0321e0SJeremy L Thompson ierr = CeedOperatorSetupInputs_Cuda(numinputfields, qfinputfields, 425*0d0321e0SJeremy L Thompson opinputfields, invec, false, edata, 426*0d0321e0SJeremy L Thompson impl, request); CeedChkBackend(ierr); 427*0d0321e0SJeremy L Thompson 428*0d0321e0SJeremy L Thompson // Input basis apply if needed 429*0d0321e0SJeremy L Thompson ierr = CeedOperatorInputBasis_Cuda(numelements, qfinputfields, opinputfields, 430*0d0321e0SJeremy L Thompson numinputfields, false, edata, impl); 431*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 432*0d0321e0SJeremy L Thompson 433*0d0321e0SJeremy L Thompson // Output pointers, as necessary 434*0d0321e0SJeremy L Thompson for (CeedInt i = 0; i < numoutputfields; i++) { 435*0d0321e0SJeremy L Thompson ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 436*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 437*0d0321e0SJeremy L Thompson if (emode == CEED_EVAL_NONE) { 438*0d0321e0SJeremy L Thompson // Set the output Q-Vector to use the E-Vector data directly. 439*0d0321e0SJeremy L Thompson ierr = CeedVectorGetArrayWrite(impl->evecs[i + impl->numein], CEED_MEM_DEVICE, 440*0d0321e0SJeremy L Thompson &edata[i + numinputfields]); CeedChkBackend(ierr); 441*0d0321e0SJeremy L Thompson ierr = CeedVectorSetArray(impl->qvecsout[i], CEED_MEM_DEVICE, 442*0d0321e0SJeremy L Thompson CEED_USE_POINTER, edata[i + numinputfields]); 443*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 444*0d0321e0SJeremy L Thompson } 445*0d0321e0SJeremy L Thompson } 446*0d0321e0SJeremy L Thompson 447*0d0321e0SJeremy L Thompson // Q function 448*0d0321e0SJeremy L Thompson ierr = CeedQFunctionApply(qf, numelements * Q, impl->qvecsin, impl->qvecsout); 449*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 450*0d0321e0SJeremy L Thompson 451*0d0321e0SJeremy L Thompson // Output basis apply if needed 452*0d0321e0SJeremy L Thompson for (CeedInt i = 0; i < numoutputfields; i++) { 453*0d0321e0SJeremy L Thompson // Get elemsize, emode, size 454*0d0321e0SJeremy L Thompson ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 455*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 456*0d0321e0SJeremy L Thompson ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 457*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 458*0d0321e0SJeremy L Thompson ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 459*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 460*0d0321e0SJeremy L Thompson ierr = CeedQFunctionFieldGetSize(qfoutputfields[i], &size); 461*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 462*0d0321e0SJeremy L Thompson // Basis action 463*0d0321e0SJeremy L Thompson switch (emode) { 464*0d0321e0SJeremy L Thompson case CEED_EVAL_NONE: 465*0d0321e0SJeremy L Thompson break; 466*0d0321e0SJeremy L Thompson case CEED_EVAL_INTERP: 467*0d0321e0SJeremy L Thompson ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); 468*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 469*0d0321e0SJeremy L Thompson ierr = CeedBasisApply(basis, numelements, CEED_TRANSPOSE, 470*0d0321e0SJeremy L Thompson CEED_EVAL_INTERP, impl->qvecsout[i], 471*0d0321e0SJeremy L Thompson impl->evecs[i + impl->numein]); CeedChkBackend(ierr); 472*0d0321e0SJeremy L Thompson break; 473*0d0321e0SJeremy L Thompson case CEED_EVAL_GRAD: 474*0d0321e0SJeremy L Thompson ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); 475*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 476*0d0321e0SJeremy L Thompson ierr = CeedBasisApply(basis, numelements, CEED_TRANSPOSE, 477*0d0321e0SJeremy L Thompson CEED_EVAL_GRAD, impl->qvecsout[i], 478*0d0321e0SJeremy L Thompson impl->evecs[i + impl->numein]); CeedChkBackend(ierr); 479*0d0321e0SJeremy L Thompson break; 480*0d0321e0SJeremy L Thompson // LCOV_EXCL_START 481*0d0321e0SJeremy L Thompson case CEED_EVAL_WEIGHT: { 482*0d0321e0SJeremy L Thompson Ceed ceed; 483*0d0321e0SJeremy L Thompson ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 484*0d0321e0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, 485*0d0321e0SJeremy L Thompson "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 486*0d0321e0SJeremy L Thompson break; // Should not occur 487*0d0321e0SJeremy L Thompson } 488*0d0321e0SJeremy L Thompson case CEED_EVAL_DIV: 489*0d0321e0SJeremy L Thompson break; // TODO: Not implemented 490*0d0321e0SJeremy L Thompson case CEED_EVAL_CURL: 491*0d0321e0SJeremy L Thompson break; // TODO: Not implemented 492*0d0321e0SJeremy L Thompson // LCOV_EXCL_STOP 493*0d0321e0SJeremy L Thompson } 494*0d0321e0SJeremy L Thompson } 495*0d0321e0SJeremy L Thompson 496*0d0321e0SJeremy L Thompson // Output restriction 497*0d0321e0SJeremy L Thompson for (CeedInt i = 0; i < numoutputfields; i++) { 498*0d0321e0SJeremy L Thompson // Restore evec 499*0d0321e0SJeremy L Thompson ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 500*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 501*0d0321e0SJeremy L Thompson if (emode == CEED_EVAL_NONE) { 502*0d0321e0SJeremy L Thompson ierr = CeedVectorRestoreArray(impl->evecs[i+impl->numein], 503*0d0321e0SJeremy L Thompson &edata[i + numinputfields]); 504*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 505*0d0321e0SJeremy L Thompson } 506*0d0321e0SJeremy L Thompson // Get output vector 507*0d0321e0SJeremy L Thompson ierr = CeedOperatorFieldGetVector(opoutputfields[i], &vec); 508*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 509*0d0321e0SJeremy L Thompson // Restrict 510*0d0321e0SJeremy L Thompson ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 511*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 512*0d0321e0SJeremy L Thompson // Active 513*0d0321e0SJeremy L Thompson if (vec == CEED_VECTOR_ACTIVE) 514*0d0321e0SJeremy L Thompson vec = outvec; 515*0d0321e0SJeremy L Thompson 516*0d0321e0SJeremy L Thompson ierr = CeedElemRestrictionApply(Erestrict, CEED_TRANSPOSE, 517*0d0321e0SJeremy L Thompson impl->evecs[i + impl->numein], vec, 518*0d0321e0SJeremy L Thompson request); CeedChkBackend(ierr); 519*0d0321e0SJeremy L Thompson } 520*0d0321e0SJeremy L Thompson 521*0d0321e0SJeremy L Thompson // Restore input arrays 522*0d0321e0SJeremy L Thompson ierr = CeedOperatorRestoreInputs_Cuda(numinputfields, qfinputfields, 523*0d0321e0SJeremy L Thompson opinputfields, false, edata, impl); 524*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 525*0d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 526*0d0321e0SJeremy L Thompson } 527*0d0321e0SJeremy L Thompson 528*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 529*0d0321e0SJeremy L Thompson // Core code for assembling linear QFunction 530*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 531*0d0321e0SJeremy L Thompson static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op, 532*0d0321e0SJeremy L Thompson bool build_objects, CeedVector *assembled, CeedElemRestriction *rstr, 533*0d0321e0SJeremy L Thompson CeedRequest *request) { 534*0d0321e0SJeremy L Thompson int ierr; 535*0d0321e0SJeremy L Thompson CeedOperator_Cuda *impl; 536*0d0321e0SJeremy L Thompson ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr); 537*0d0321e0SJeremy L Thompson CeedQFunction qf; 538*0d0321e0SJeremy L Thompson ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr); 539*0d0321e0SJeremy L Thompson CeedInt Q, numelements, numinputfields, numoutputfields, size; 540*0d0321e0SJeremy L Thompson ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr); 541*0d0321e0SJeremy L Thompson ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr); 542*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 543*0d0321e0SJeremy L Thompson CeedOperatorField *opinputfields, *opoutputfields; 544*0d0321e0SJeremy L Thompson ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields, 545*0d0321e0SJeremy L Thompson &numoutputfields, &opoutputfields); 546*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 547*0d0321e0SJeremy L Thompson CeedQFunctionField *qfinputfields, *qfoutputfields; 548*0d0321e0SJeremy L Thompson ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields); 549*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 550*0d0321e0SJeremy L Thompson CeedVector vec; 551*0d0321e0SJeremy L Thompson CeedInt numactivein = impl->qfnumactivein, numactiveout = impl->qfnumactiveout; 552*0d0321e0SJeremy L Thompson CeedVector *activein = impl->qfactivein; 553*0d0321e0SJeremy L Thompson CeedScalar *a, *tmp; 554*0d0321e0SJeremy L Thompson Ceed ceed, ceedparent; 555*0d0321e0SJeremy L Thompson ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 556*0d0321e0SJeremy L Thompson ierr = CeedGetOperatorFallbackParentCeed(ceed, &ceedparent); 557*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 558*0d0321e0SJeremy L Thompson ceedparent = ceedparent ? ceedparent : ceed; 559*0d0321e0SJeremy L Thompson CeedScalar *edata[2*CEED_FIELD_MAX]; 560*0d0321e0SJeremy L Thompson 561*0d0321e0SJeremy L Thompson // Setup 562*0d0321e0SJeremy L Thompson ierr = CeedOperatorSetup_Cuda(op); CeedChkBackend(ierr); 563*0d0321e0SJeremy L Thompson 564*0d0321e0SJeremy L Thompson // Check for identity 565*0d0321e0SJeremy L Thompson bool identityqf; 566*0d0321e0SJeremy L Thompson ierr = CeedQFunctionIsIdentity(qf, &identityqf); CeedChkBackend(ierr); 567*0d0321e0SJeremy L Thompson if (identityqf) 568*0d0321e0SJeremy L Thompson // LCOV_EXCL_START 569*0d0321e0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, 570*0d0321e0SJeremy L Thompson "Assembling identity QFunctions not supported"); 571*0d0321e0SJeremy L Thompson // LCOV_EXCL_STOP 572*0d0321e0SJeremy L Thompson 573*0d0321e0SJeremy L Thompson // Input Evecs and Restriction 574*0d0321e0SJeremy L Thompson ierr = CeedOperatorSetupInputs_Cuda(numinputfields, qfinputfields, 575*0d0321e0SJeremy L Thompson opinputfields, NULL, true, edata, 576*0d0321e0SJeremy L Thompson impl, request); CeedChkBackend(ierr); 577*0d0321e0SJeremy L Thompson 578*0d0321e0SJeremy L Thompson // Count number of active input fields 579*0d0321e0SJeremy L Thompson if (!numactivein) { 580*0d0321e0SJeremy L Thompson for (CeedInt i=0; i<numinputfields; i++) { 581*0d0321e0SJeremy L Thompson // Get input vector 582*0d0321e0SJeremy L Thompson ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr); 583*0d0321e0SJeremy L Thompson // Check if active input 584*0d0321e0SJeremy L Thompson if (vec == CEED_VECTOR_ACTIVE) { 585*0d0321e0SJeremy L Thompson ierr = CeedQFunctionFieldGetSize(qfinputfields[i], &size); CeedChkBackend(ierr); 586*0d0321e0SJeremy L Thompson ierr = CeedVectorSetValue(impl->qvecsin[i], 0.0); CeedChkBackend(ierr); 587*0d0321e0SJeremy L Thompson ierr = CeedVectorGetArray(impl->qvecsin[i], CEED_MEM_DEVICE, &tmp); 588*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 589*0d0321e0SJeremy L Thompson ierr = CeedRealloc(numactivein + size, &activein); CeedChkBackend(ierr); 590*0d0321e0SJeremy L Thompson for (CeedInt field = 0; field < size; field++) { 591*0d0321e0SJeremy L Thompson ierr = CeedVectorCreate(ceed, Q*numelements, 592*0d0321e0SJeremy L Thompson &activein[numactivein+field]); CeedChkBackend(ierr); 593*0d0321e0SJeremy L Thompson ierr = CeedVectorSetArray(activein[numactivein+field], CEED_MEM_DEVICE, 594*0d0321e0SJeremy L Thompson CEED_USE_POINTER, &tmp[field*Q*numelements]); 595*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 596*0d0321e0SJeremy L Thompson } 597*0d0321e0SJeremy L Thompson numactivein += size; 598*0d0321e0SJeremy L Thompson ierr = CeedVectorRestoreArray(impl->qvecsin[i], &tmp); CeedChkBackend(ierr); 599*0d0321e0SJeremy L Thompson } 600*0d0321e0SJeremy L Thompson } 601*0d0321e0SJeremy L Thompson impl->qfnumactivein = numactivein; 602*0d0321e0SJeremy L Thompson impl->qfactivein = activein; 603*0d0321e0SJeremy L Thompson } 604*0d0321e0SJeremy L Thompson 605*0d0321e0SJeremy L Thompson // Count number of active output fields 606*0d0321e0SJeremy L Thompson if (!numactiveout) { 607*0d0321e0SJeremy L Thompson for (CeedInt i=0; i<numoutputfields; i++) { 608*0d0321e0SJeremy L Thompson // Get output vector 609*0d0321e0SJeremy L Thompson ierr = CeedOperatorFieldGetVector(opoutputfields[i], &vec); 610*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 611*0d0321e0SJeremy L Thompson // Check if active output 612*0d0321e0SJeremy L Thompson if (vec == CEED_VECTOR_ACTIVE) { 613*0d0321e0SJeremy L Thompson ierr = CeedQFunctionFieldGetSize(qfoutputfields[i], &size); 614*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 615*0d0321e0SJeremy L Thompson numactiveout += size; 616*0d0321e0SJeremy L Thompson } 617*0d0321e0SJeremy L Thompson } 618*0d0321e0SJeremy L Thompson impl->qfnumactiveout = numactiveout; 619*0d0321e0SJeremy L Thompson } 620*0d0321e0SJeremy L Thompson 621*0d0321e0SJeremy L Thompson // Check sizes 622*0d0321e0SJeremy L Thompson if (!numactivein || !numactiveout) 623*0d0321e0SJeremy L Thompson // LCOV_EXCL_START 624*0d0321e0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, 625*0d0321e0SJeremy L Thompson "Cannot assemble QFunction without active inputs " 626*0d0321e0SJeremy L Thompson "and outputs"); 627*0d0321e0SJeremy L Thompson // LCOV_EXCL_STOP 628*0d0321e0SJeremy L Thompson 629*0d0321e0SJeremy L Thompson // Build objects if needed 630*0d0321e0SJeremy L Thompson if (build_objects) { 631*0d0321e0SJeremy L Thompson // Create output restriction 632*0d0321e0SJeremy L Thompson CeedInt strides[3] = {1, numelements*Q, Q}; /* *NOPAD* */ 633*0d0321e0SJeremy L Thompson ierr = CeedElemRestrictionCreateStrided(ceedparent, numelements, Q, 634*0d0321e0SJeremy L Thompson numactivein*numactiveout, 635*0d0321e0SJeremy L Thompson numactivein*numactiveout*numelements*Q, 636*0d0321e0SJeremy L Thompson strides, rstr); CeedChkBackend(ierr); 637*0d0321e0SJeremy L Thompson // Create assembled vector 638*0d0321e0SJeremy L Thompson ierr = CeedVectorCreate(ceedparent, numelements*Q*numactivein*numactiveout, 639*0d0321e0SJeremy L Thompson assembled); CeedChkBackend(ierr); 640*0d0321e0SJeremy L Thompson } 641*0d0321e0SJeremy L Thompson ierr = CeedVectorSetValue(*assembled, 0.0); CeedChkBackend(ierr); 642*0d0321e0SJeremy L Thompson ierr = CeedVectorGetArray(*assembled, CEED_MEM_DEVICE, &a); 643*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 644*0d0321e0SJeremy L Thompson 645*0d0321e0SJeremy L Thompson // Input basis apply 646*0d0321e0SJeremy L Thompson ierr = CeedOperatorInputBasis_Cuda(numelements, qfinputfields, opinputfields, 647*0d0321e0SJeremy L Thompson numinputfields, true, edata, impl); 648*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 649*0d0321e0SJeremy L Thompson 650*0d0321e0SJeremy L Thompson // Assemble QFunction 651*0d0321e0SJeremy L Thompson for (CeedInt in=0; in<numactivein; in++) { 652*0d0321e0SJeremy L Thompson // Set Inputs 653*0d0321e0SJeremy L Thompson ierr = CeedVectorSetValue(activein[in], 1.0); CeedChkBackend(ierr); 654*0d0321e0SJeremy L Thompson if (numactivein > 1) { 655*0d0321e0SJeremy L Thompson ierr = CeedVectorSetValue(activein[(in+numactivein-1)%numactivein], 656*0d0321e0SJeremy L Thompson 0.0); CeedChkBackend(ierr); 657*0d0321e0SJeremy L Thompson } 658*0d0321e0SJeremy L Thompson // Set Outputs 659*0d0321e0SJeremy L Thompson for (CeedInt out=0; out<numoutputfields; out++) { 660*0d0321e0SJeremy L Thompson // Get output vector 661*0d0321e0SJeremy L Thompson ierr = CeedOperatorFieldGetVector(opoutputfields[out], &vec); 662*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 663*0d0321e0SJeremy L Thompson // Check if active output 664*0d0321e0SJeremy L Thompson if (vec == CEED_VECTOR_ACTIVE) { 665*0d0321e0SJeremy L Thompson CeedVectorSetArray(impl->qvecsout[out], CEED_MEM_DEVICE, 666*0d0321e0SJeremy L Thompson CEED_USE_POINTER, a); CeedChkBackend(ierr); 667*0d0321e0SJeremy L Thompson ierr = CeedQFunctionFieldGetSize(qfoutputfields[out], &size); 668*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 669*0d0321e0SJeremy L Thompson a += size*Q*numelements; // Advance the pointer by the size of the output 670*0d0321e0SJeremy L Thompson } 671*0d0321e0SJeremy L Thompson } 672*0d0321e0SJeremy L Thompson // Apply QFunction 673*0d0321e0SJeremy L Thompson ierr = CeedQFunctionApply(qf, Q*numelements, impl->qvecsin, impl->qvecsout); 674*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 675*0d0321e0SJeremy L Thompson } 676*0d0321e0SJeremy L Thompson 677*0d0321e0SJeremy L Thompson // Un-set output Qvecs to prevent accidental overwrite of Assembled 678*0d0321e0SJeremy L Thompson for (CeedInt out=0; out<numoutputfields; out++) { 679*0d0321e0SJeremy L Thompson // Get output vector 680*0d0321e0SJeremy L Thompson ierr = CeedOperatorFieldGetVector(opoutputfields[out], &vec); 681*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 682*0d0321e0SJeremy L Thompson // Check if active output 683*0d0321e0SJeremy L Thompson if (vec == CEED_VECTOR_ACTIVE) { 684*0d0321e0SJeremy L Thompson ierr = CeedVectorTakeArray(impl->qvecsout[out], CEED_MEM_DEVICE, NULL); 685*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 686*0d0321e0SJeremy L Thompson } 687*0d0321e0SJeremy L Thompson } 688*0d0321e0SJeremy L Thompson 689*0d0321e0SJeremy L Thompson // Restore input arrays 690*0d0321e0SJeremy L Thompson ierr = CeedOperatorRestoreInputs_Cuda(numinputfields, qfinputfields, 691*0d0321e0SJeremy L Thompson opinputfields, true, edata, impl); 692*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 693*0d0321e0SJeremy L Thompson 694*0d0321e0SJeremy L Thompson // Restore output 695*0d0321e0SJeremy L Thompson ierr = CeedVectorRestoreArray(*assembled, &a); CeedChkBackend(ierr); 696*0d0321e0SJeremy L Thompson 697*0d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 698*0d0321e0SJeremy L Thompson } 699*0d0321e0SJeremy L Thompson 700*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 701*0d0321e0SJeremy L Thompson // Assemble Linear QFunction 702*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 703*0d0321e0SJeremy L Thompson static int CeedOperatorLinearAssembleQFunction_Cuda(CeedOperator op, 704*0d0321e0SJeremy L Thompson CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) { 705*0d0321e0SJeremy L Thompson return CeedOperatorLinearAssembleQFunctionCore_Cuda(op, true, assembled, rstr, 706*0d0321e0SJeremy L Thompson request); 707*0d0321e0SJeremy L Thompson } 708*0d0321e0SJeremy L Thompson 709*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 710*0d0321e0SJeremy L Thompson // Update Assembled Linear QFunction 711*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 712*0d0321e0SJeremy L Thompson static int CeedOperatorLinearAssembleQFunctionUpdate_Cuda(CeedOperator op, 713*0d0321e0SJeremy L Thompson CeedVector assembled, CeedElemRestriction rstr, CeedRequest *request) { 714*0d0321e0SJeremy L Thompson return CeedOperatorLinearAssembleQFunctionCore_Cuda(op, false, &assembled, 715*0d0321e0SJeremy L Thompson &rstr, request); 716*0d0321e0SJeremy L Thompson } 717*0d0321e0SJeremy L Thompson 718*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 719*0d0321e0SJeremy L Thompson // Diagonal assembly kernels 720*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 721*0d0321e0SJeremy L Thompson // *INDENT-OFF* 722*0d0321e0SJeremy L Thompson static const char *diagonalkernels = QUOTE( 723*0d0321e0SJeremy L Thompson 724*0d0321e0SJeremy L Thompson typedef enum { 725*0d0321e0SJeremy L Thompson /// Perform no evaluation (either because there is no data or it is already at 726*0d0321e0SJeremy L Thompson /// quadrature points) 727*0d0321e0SJeremy L Thompson CEED_EVAL_NONE = 0, 728*0d0321e0SJeremy L Thompson /// Interpolate from nodes to quadrature points 729*0d0321e0SJeremy L Thompson CEED_EVAL_INTERP = 1, 730*0d0321e0SJeremy L Thompson /// Evaluate gradients at quadrature points from input in a nodal basis 731*0d0321e0SJeremy L Thompson CEED_EVAL_GRAD = 2, 732*0d0321e0SJeremy L Thompson /// Evaluate divergence at quadrature points from input in a nodal basis 733*0d0321e0SJeremy L Thompson CEED_EVAL_DIV = 4, 734*0d0321e0SJeremy L Thompson /// Evaluate curl at quadrature points from input in a nodal basis 735*0d0321e0SJeremy L Thompson CEED_EVAL_CURL = 8, 736*0d0321e0SJeremy L Thompson /// Using no input, evaluate quadrature weights on the reference element 737*0d0321e0SJeremy L Thompson CEED_EVAL_WEIGHT = 16, 738*0d0321e0SJeremy L Thompson } CeedEvalMode; 739*0d0321e0SJeremy L Thompson 740*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 741*0d0321e0SJeremy L Thompson // Get Basis Emode Pointer 742*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 743*0d0321e0SJeremy L Thompson extern "C" __device__ void CeedOperatorGetBasisPointer_Cuda(const CeedScalar **basisptr, 744*0d0321e0SJeremy L Thompson CeedEvalMode emode, const CeedScalar *identity, const CeedScalar *interp, 745*0d0321e0SJeremy L Thompson const CeedScalar *grad) { 746*0d0321e0SJeremy L Thompson switch (emode) { 747*0d0321e0SJeremy L Thompson case CEED_EVAL_NONE: 748*0d0321e0SJeremy L Thompson *basisptr = identity; 749*0d0321e0SJeremy L Thompson break; 750*0d0321e0SJeremy L Thompson case CEED_EVAL_INTERP: 751*0d0321e0SJeremy L Thompson *basisptr = interp; 752*0d0321e0SJeremy L Thompson break; 753*0d0321e0SJeremy L Thompson case CEED_EVAL_GRAD: 754*0d0321e0SJeremy L Thompson *basisptr = grad; 755*0d0321e0SJeremy L Thompson break; 756*0d0321e0SJeremy L Thompson case CEED_EVAL_WEIGHT: 757*0d0321e0SJeremy L Thompson case CEED_EVAL_DIV: 758*0d0321e0SJeremy L Thompson case CEED_EVAL_CURL: 759*0d0321e0SJeremy L Thompson break; // Caught by QF Assembly 760*0d0321e0SJeremy L Thompson } 761*0d0321e0SJeremy L Thompson } 762*0d0321e0SJeremy L Thompson 763*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 764*0d0321e0SJeremy L Thompson // Core code for diagonal assembly 765*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 766*0d0321e0SJeremy L Thompson __device__ void diagonalCore(const CeedInt nelem, 767*0d0321e0SJeremy L Thompson const CeedScalar maxnorm, const bool pointBlock, 768*0d0321e0SJeremy L Thompson const CeedScalar *identity, 769*0d0321e0SJeremy L Thompson const CeedScalar *interpin, const CeedScalar *gradin, 770*0d0321e0SJeremy L Thompson const CeedScalar *interpout, const CeedScalar *gradout, 771*0d0321e0SJeremy L Thompson const CeedEvalMode *emodein, const CeedEvalMode *emodeout, 772*0d0321e0SJeremy L Thompson const CeedScalar *__restrict__ assembledqfarray, 773*0d0321e0SJeremy L Thompson CeedScalar *__restrict__ elemdiagarray) { 774*0d0321e0SJeremy L Thompson const int tid = threadIdx.x; // running with P threads, tid is evec node 775*0d0321e0SJeremy L Thompson const CeedScalar qfvaluebound = maxnorm*1e-12; 776*0d0321e0SJeremy L Thompson 777*0d0321e0SJeremy L Thompson // Compute the diagonal of B^T D B 778*0d0321e0SJeremy L Thompson // Each element 779*0d0321e0SJeremy L Thompson for (CeedInt e = blockIdx.x*blockDim.z + threadIdx.z; e < nelem; 780*0d0321e0SJeremy L Thompson e += gridDim.x*blockDim.z) { 781*0d0321e0SJeremy L Thompson CeedInt dout = -1; 782*0d0321e0SJeremy L Thompson // Each basis eval mode pair 783*0d0321e0SJeremy L Thompson for (CeedInt eout = 0; eout < NUMEMODEOUT; eout++) { 784*0d0321e0SJeremy L Thompson const CeedScalar *bt = NULL; 785*0d0321e0SJeremy L Thompson if (emodeout[eout] == CEED_EVAL_GRAD) 786*0d0321e0SJeremy L Thompson dout += 1; 787*0d0321e0SJeremy L Thompson CeedOperatorGetBasisPointer_Cuda(&bt, emodeout[eout], identity, interpout, 788*0d0321e0SJeremy L Thompson &gradout[dout*NQPTS*NNODES]); 789*0d0321e0SJeremy L Thompson CeedInt din = -1; 790*0d0321e0SJeremy L Thompson for (CeedInt ein = 0; ein < NUMEMODEIN; ein++) { 791*0d0321e0SJeremy L Thompson const CeedScalar *b = NULL; 792*0d0321e0SJeremy L Thompson if (emodein[ein] == CEED_EVAL_GRAD) 793*0d0321e0SJeremy L Thompson din += 1; 794*0d0321e0SJeremy L Thompson CeedOperatorGetBasisPointer_Cuda(&b, emodein[ein], identity, interpin, 795*0d0321e0SJeremy L Thompson &gradin[din*NQPTS*NNODES]); 796*0d0321e0SJeremy L Thompson // Each component 797*0d0321e0SJeremy L Thompson for (CeedInt compOut = 0; compOut < NCOMP; compOut++) { 798*0d0321e0SJeremy L Thompson // Each qpoint/node pair 799*0d0321e0SJeremy L Thompson if (pointBlock) { 800*0d0321e0SJeremy L Thompson // Point Block Diagonal 801*0d0321e0SJeremy L Thompson for (CeedInt compIn = 0; compIn < NCOMP; compIn++) { 802*0d0321e0SJeremy L Thompson CeedScalar evalue = 0.; 803*0d0321e0SJeremy L Thompson for (CeedInt q = 0; q < NQPTS; q++) { 804*0d0321e0SJeremy L Thompson const CeedScalar qfvalue = 805*0d0321e0SJeremy L Thompson assembledqfarray[((((ein*NCOMP+compIn)*NUMEMODEOUT+eout)* 806*0d0321e0SJeremy L Thompson NCOMP+compOut)*nelem+e)*NQPTS+q]; 807*0d0321e0SJeremy L Thompson if (abs(qfvalue) > qfvaluebound) 808*0d0321e0SJeremy L Thompson evalue += bt[q*NNODES+tid] * qfvalue * b[q*NNODES+tid]; 809*0d0321e0SJeremy L Thompson } 810*0d0321e0SJeremy L Thompson elemdiagarray[((compOut*NCOMP+compIn)*nelem+e)*NNODES+tid] += evalue; 811*0d0321e0SJeremy L Thompson } 812*0d0321e0SJeremy L Thompson } else { 813*0d0321e0SJeremy L Thompson // Diagonal Only 814*0d0321e0SJeremy L Thompson CeedScalar evalue = 0.; 815*0d0321e0SJeremy L Thompson for (CeedInt q = 0; q < NQPTS; q++) { 816*0d0321e0SJeremy L Thompson const CeedScalar qfvalue = 817*0d0321e0SJeremy L Thompson assembledqfarray[((((ein*NCOMP+compOut)*NUMEMODEOUT+eout)* 818*0d0321e0SJeremy L Thompson NCOMP+compOut)*nelem+e)*NQPTS+q]; 819*0d0321e0SJeremy L Thompson if (abs(qfvalue) > qfvaluebound) 820*0d0321e0SJeremy L Thompson evalue += bt[q*NNODES+tid] * qfvalue * b[q*NNODES+tid]; 821*0d0321e0SJeremy L Thompson } 822*0d0321e0SJeremy L Thompson elemdiagarray[(compOut*nelem+e)*NNODES+tid] += evalue; 823*0d0321e0SJeremy L Thompson } 824*0d0321e0SJeremy L Thompson } 825*0d0321e0SJeremy L Thompson } 826*0d0321e0SJeremy L Thompson } 827*0d0321e0SJeremy L Thompson } 828*0d0321e0SJeremy L Thompson } 829*0d0321e0SJeremy L Thompson 830*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 831*0d0321e0SJeremy L Thompson // Linear diagonal 832*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 833*0d0321e0SJeremy L Thompson extern "C" __global__ void linearDiagonal(const CeedInt nelem, 834*0d0321e0SJeremy L Thompson const CeedScalar maxnorm, const CeedScalar *identity, 835*0d0321e0SJeremy L Thompson const CeedScalar *interpin, const CeedScalar *gradin, 836*0d0321e0SJeremy L Thompson const CeedScalar *interpout, const CeedScalar *gradout, 837*0d0321e0SJeremy L Thompson const CeedEvalMode *emodein, const CeedEvalMode *emodeout, 838*0d0321e0SJeremy L Thompson const CeedScalar *__restrict__ assembledqfarray, 839*0d0321e0SJeremy L Thompson CeedScalar *__restrict__ elemdiagarray) { 840*0d0321e0SJeremy L Thompson diagonalCore(nelem, maxnorm, false, identity, interpin, gradin, interpout, 841*0d0321e0SJeremy L Thompson gradout, emodein, emodeout, assembledqfarray, elemdiagarray); 842*0d0321e0SJeremy L Thompson } 843*0d0321e0SJeremy L Thompson 844*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 845*0d0321e0SJeremy L Thompson // Linear point block diagonal 846*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 847*0d0321e0SJeremy L Thompson extern "C" __global__ void linearPointBlockDiagonal(const CeedInt nelem, 848*0d0321e0SJeremy L Thompson const CeedScalar maxnorm, const CeedScalar *identity, 849*0d0321e0SJeremy L Thompson const CeedScalar *interpin, const CeedScalar *gradin, 850*0d0321e0SJeremy L Thompson const CeedScalar *interpout, const CeedScalar *gradout, 851*0d0321e0SJeremy L Thompson const CeedEvalMode *emodein, const CeedEvalMode *emodeout, 852*0d0321e0SJeremy L Thompson const CeedScalar *__restrict__ assembledqfarray, 853*0d0321e0SJeremy L Thompson CeedScalar *__restrict__ elemdiagarray) { 854*0d0321e0SJeremy L Thompson diagonalCore(nelem, maxnorm, true, identity, interpin, gradin, interpout, 855*0d0321e0SJeremy L Thompson gradout, emodein, emodeout, assembledqfarray, elemdiagarray); 856*0d0321e0SJeremy L Thompson } 857*0d0321e0SJeremy L Thompson 858*0d0321e0SJeremy L Thompson ); 859*0d0321e0SJeremy L Thompson // *INDENT-ON* 860*0d0321e0SJeremy L Thompson 861*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 862*0d0321e0SJeremy L Thompson // Create point block restriction 863*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 864*0d0321e0SJeremy L Thompson static int CreatePBRestriction(CeedElemRestriction rstr, 865*0d0321e0SJeremy L Thompson CeedElemRestriction *pbRstr) { 866*0d0321e0SJeremy L Thompson int ierr; 867*0d0321e0SJeremy L Thompson Ceed ceed; 868*0d0321e0SJeremy L Thompson ierr = CeedElemRestrictionGetCeed(rstr, &ceed); CeedChkBackend(ierr); 869*0d0321e0SJeremy L Thompson const CeedInt *offsets; 870*0d0321e0SJeremy L Thompson ierr = CeedElemRestrictionGetOffsets(rstr, CEED_MEM_HOST, &offsets); 871*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 872*0d0321e0SJeremy L Thompson 873*0d0321e0SJeremy L Thompson // Expand offsets 874*0d0321e0SJeremy L Thompson CeedInt nelem, ncomp, elemsize, compstride, max = 1, *pbOffsets; 875*0d0321e0SJeremy L Thompson ierr = CeedElemRestrictionGetNumElements(rstr, &nelem); CeedChkBackend(ierr); 876*0d0321e0SJeremy L Thompson ierr = CeedElemRestrictionGetNumComponents(rstr, &ncomp); CeedChkBackend(ierr); 877*0d0321e0SJeremy L Thompson ierr = CeedElemRestrictionGetElementSize(rstr, &elemsize); CeedChkBackend(ierr); 878*0d0321e0SJeremy L Thompson ierr = CeedElemRestrictionGetCompStride(rstr, &compstride); 879*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 880*0d0321e0SJeremy L Thompson CeedInt shift = ncomp; 881*0d0321e0SJeremy L Thompson if (compstride != 1) 882*0d0321e0SJeremy L Thompson shift *= ncomp; 883*0d0321e0SJeremy L Thompson ierr = CeedCalloc(nelem*elemsize, &pbOffsets); CeedChkBackend(ierr); 884*0d0321e0SJeremy L Thompson for (CeedInt i = 0; i < nelem*elemsize; i++) { 885*0d0321e0SJeremy L Thompson pbOffsets[i] = offsets[i]*shift; 886*0d0321e0SJeremy L Thompson if (pbOffsets[i] > max) 887*0d0321e0SJeremy L Thompson max = pbOffsets[i]; 888*0d0321e0SJeremy L Thompson } 889*0d0321e0SJeremy L Thompson 890*0d0321e0SJeremy L Thompson // Create new restriction 891*0d0321e0SJeremy L Thompson ierr = CeedElemRestrictionCreate(ceed, nelem, elemsize, ncomp*ncomp, 1, 892*0d0321e0SJeremy L Thompson max + ncomp*ncomp, CEED_MEM_HOST, 893*0d0321e0SJeremy L Thompson CEED_OWN_POINTER, pbOffsets, pbRstr); 894*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 895*0d0321e0SJeremy L Thompson 896*0d0321e0SJeremy L Thompson // Cleanup 897*0d0321e0SJeremy L Thompson ierr = CeedElemRestrictionRestoreOffsets(rstr, &offsets); CeedChkBackend(ierr); 898*0d0321e0SJeremy L Thompson 899*0d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 900*0d0321e0SJeremy L Thompson } 901*0d0321e0SJeremy L Thompson 902*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 903*0d0321e0SJeremy L Thompson // Assemble diagonal setup 904*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 905*0d0321e0SJeremy L Thompson static inline int CeedOperatorAssembleDiagonalSetup_Cuda(CeedOperator op, 906*0d0321e0SJeremy L Thompson const bool pointBlock) { 907*0d0321e0SJeremy L Thompson int ierr; 908*0d0321e0SJeremy L Thompson Ceed ceed; 909*0d0321e0SJeremy L Thompson ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 910*0d0321e0SJeremy L Thompson CeedQFunction qf; 911*0d0321e0SJeremy L Thompson ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr); 912*0d0321e0SJeremy L Thompson CeedInt numinputfields, numoutputfields; 913*0d0321e0SJeremy L Thompson ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields); 914*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 915*0d0321e0SJeremy L Thompson 916*0d0321e0SJeremy L Thompson // Determine active input basis 917*0d0321e0SJeremy L Thompson CeedOperatorField *opfields; 918*0d0321e0SJeremy L Thompson CeedQFunctionField *qffields; 919*0d0321e0SJeremy L Thompson ierr = CeedOperatorGetFields(op, NULL, &opfields, NULL, NULL); 920*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 921*0d0321e0SJeremy L Thompson ierr = CeedQFunctionGetFields(qf, NULL, &qffields, NULL, NULL); 922*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 923*0d0321e0SJeremy L Thompson CeedInt numemodein = 0, ncomp = 0, dim = 1; 924*0d0321e0SJeremy L Thompson CeedEvalMode *emodein = NULL; 925*0d0321e0SJeremy L Thompson CeedBasis basisin = NULL; 926*0d0321e0SJeremy L Thompson CeedElemRestriction rstrin = NULL; 927*0d0321e0SJeremy L Thompson for (CeedInt i = 0; i < numinputfields; i++) { 928*0d0321e0SJeremy L Thompson CeedVector vec; 929*0d0321e0SJeremy L Thompson ierr = CeedOperatorFieldGetVector(opfields[i], &vec); CeedChkBackend(ierr); 930*0d0321e0SJeremy L Thompson if (vec == CEED_VECTOR_ACTIVE) { 931*0d0321e0SJeremy L Thompson CeedElemRestriction rstr; 932*0d0321e0SJeremy L Thompson ierr = CeedOperatorFieldGetBasis(opfields[i], &basisin); CeedChkBackend(ierr); 933*0d0321e0SJeremy L Thompson ierr = CeedBasisGetNumComponents(basisin, &ncomp); CeedChkBackend(ierr); 934*0d0321e0SJeremy L Thompson ierr = CeedBasisGetDimension(basisin, &dim); CeedChkBackend(ierr); 935*0d0321e0SJeremy L Thompson ierr = CeedOperatorFieldGetElemRestriction(opfields[i], &rstr); 936*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 937*0d0321e0SJeremy L Thompson if (rstrin && rstrin != rstr) 938*0d0321e0SJeremy L Thompson // LCOV_EXCL_START 939*0d0321e0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, 940*0d0321e0SJeremy L Thompson "Multi-field non-composite operator diagonal assembly not supported"); 941*0d0321e0SJeremy L Thompson // LCOV_EXCL_STOP 942*0d0321e0SJeremy L Thompson rstrin = rstr; 943*0d0321e0SJeremy L Thompson CeedEvalMode emode; 944*0d0321e0SJeremy L Thompson ierr = CeedQFunctionFieldGetEvalMode(qffields[i], &emode); 945*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 946*0d0321e0SJeremy L Thompson switch (emode) { 947*0d0321e0SJeremy L Thompson case CEED_EVAL_NONE: 948*0d0321e0SJeremy L Thompson case CEED_EVAL_INTERP: 949*0d0321e0SJeremy L Thompson ierr = CeedRealloc(numemodein + 1, &emodein); CeedChkBackend(ierr); 950*0d0321e0SJeremy L Thompson emodein[numemodein] = emode; 951*0d0321e0SJeremy L Thompson numemodein += 1; 952*0d0321e0SJeremy L Thompson break; 953*0d0321e0SJeremy L Thompson case CEED_EVAL_GRAD: 954*0d0321e0SJeremy L Thompson ierr = CeedRealloc(numemodein + dim, &emodein); CeedChkBackend(ierr); 955*0d0321e0SJeremy L Thompson for (CeedInt d = 0; d < dim; d++) 956*0d0321e0SJeremy L Thompson emodein[numemodein+d] = emode; 957*0d0321e0SJeremy L Thompson numemodein += dim; 958*0d0321e0SJeremy L Thompson break; 959*0d0321e0SJeremy L Thompson case CEED_EVAL_WEIGHT: 960*0d0321e0SJeremy L Thompson case CEED_EVAL_DIV: 961*0d0321e0SJeremy L Thompson case CEED_EVAL_CURL: 962*0d0321e0SJeremy L Thompson break; // Caught by QF Assembly 963*0d0321e0SJeremy L Thompson } 964*0d0321e0SJeremy L Thompson } 965*0d0321e0SJeremy L Thompson } 966*0d0321e0SJeremy L Thompson 967*0d0321e0SJeremy L Thompson // Determine active output basis 968*0d0321e0SJeremy L Thompson ierr = CeedOperatorGetFields(op, NULL, NULL, NULL, &opfields); 969*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 970*0d0321e0SJeremy L Thompson ierr = CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qffields); 971*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 972*0d0321e0SJeremy L Thompson CeedInt numemodeout = 0; 973*0d0321e0SJeremy L Thompson CeedEvalMode *emodeout = NULL; 974*0d0321e0SJeremy L Thompson CeedBasis basisout = NULL; 975*0d0321e0SJeremy L Thompson CeedElemRestriction rstrout = NULL; 976*0d0321e0SJeremy L Thompson for (CeedInt i = 0; i < numoutputfields; i++) { 977*0d0321e0SJeremy L Thompson CeedVector vec; 978*0d0321e0SJeremy L Thompson ierr = CeedOperatorFieldGetVector(opfields[i], &vec); CeedChkBackend(ierr); 979*0d0321e0SJeremy L Thompson if (vec == CEED_VECTOR_ACTIVE) { 980*0d0321e0SJeremy L Thompson CeedElemRestriction rstr; 981*0d0321e0SJeremy L Thompson ierr = CeedOperatorFieldGetBasis(opfields[i], &basisout); CeedChkBackend(ierr); 982*0d0321e0SJeremy L Thompson ierr = CeedOperatorFieldGetElemRestriction(opfields[i], &rstr); 983*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 984*0d0321e0SJeremy L Thompson if (rstrout && rstrout != rstr) 985*0d0321e0SJeremy L Thompson // LCOV_EXCL_START 986*0d0321e0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, 987*0d0321e0SJeremy L Thompson "Multi-field non-composite operator diagonal assembly not supported"); 988*0d0321e0SJeremy L Thompson // LCOV_EXCL_STOP 989*0d0321e0SJeremy L Thompson rstrout = rstr; 990*0d0321e0SJeremy L Thompson CeedEvalMode emode; 991*0d0321e0SJeremy L Thompson ierr = CeedQFunctionFieldGetEvalMode(qffields[i], &emode); CeedChkBackend(ierr); 992*0d0321e0SJeremy L Thompson switch (emode) { 993*0d0321e0SJeremy L Thompson case CEED_EVAL_NONE: 994*0d0321e0SJeremy L Thompson case CEED_EVAL_INTERP: 995*0d0321e0SJeremy L Thompson ierr = CeedRealloc(numemodeout + 1, &emodeout); CeedChkBackend(ierr); 996*0d0321e0SJeremy L Thompson emodeout[numemodeout] = emode; 997*0d0321e0SJeremy L Thompson numemodeout += 1; 998*0d0321e0SJeremy L Thompson break; 999*0d0321e0SJeremy L Thompson case CEED_EVAL_GRAD: 1000*0d0321e0SJeremy L Thompson ierr = CeedRealloc(numemodeout + dim, &emodeout); CeedChkBackend(ierr); 1001*0d0321e0SJeremy L Thompson for (CeedInt d = 0; d < dim; d++) 1002*0d0321e0SJeremy L Thompson emodeout[numemodeout+d] = emode; 1003*0d0321e0SJeremy L Thompson numemodeout += dim; 1004*0d0321e0SJeremy L Thompson break; 1005*0d0321e0SJeremy L Thompson case CEED_EVAL_WEIGHT: 1006*0d0321e0SJeremy L Thompson case CEED_EVAL_DIV: 1007*0d0321e0SJeremy L Thompson case CEED_EVAL_CURL: 1008*0d0321e0SJeremy L Thompson break; // Caught by QF Assembly 1009*0d0321e0SJeremy L Thompson } 1010*0d0321e0SJeremy L Thompson } 1011*0d0321e0SJeremy L Thompson } 1012*0d0321e0SJeremy L Thompson 1013*0d0321e0SJeremy L Thompson // Operator data struct 1014*0d0321e0SJeremy L Thompson CeedOperator_Cuda *impl; 1015*0d0321e0SJeremy L Thompson ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr); 1016*0d0321e0SJeremy L Thompson ierr = CeedCalloc(1, &impl->diag); CeedChkBackend(ierr); 1017*0d0321e0SJeremy L Thompson CeedOperatorDiag_Cuda *diag = impl->diag; 1018*0d0321e0SJeremy L Thompson diag->basisin = basisin; 1019*0d0321e0SJeremy L Thompson diag->basisout = basisout; 1020*0d0321e0SJeremy L Thompson diag->h_emodein = emodein; 1021*0d0321e0SJeremy L Thompson diag->h_emodeout = emodeout; 1022*0d0321e0SJeremy L Thompson diag->numemodein = numemodein; 1023*0d0321e0SJeremy L Thompson diag->numemodeout = numemodeout; 1024*0d0321e0SJeremy L Thompson 1025*0d0321e0SJeremy L Thompson // Assemble kernel 1026*0d0321e0SJeremy L Thompson CeedInt nnodes, nqpts; 1027*0d0321e0SJeremy L Thompson ierr = CeedBasisGetNumNodes(basisin, &nnodes); CeedChkBackend(ierr); 1028*0d0321e0SJeremy L Thompson ierr = CeedBasisGetNumQuadraturePoints(basisin, &nqpts); CeedChkBackend(ierr); 1029*0d0321e0SJeremy L Thompson diag->nnodes = nnodes; 1030*0d0321e0SJeremy L Thompson ierr = CeedCompileCuda(ceed, diagonalkernels, &diag->module, 5, 1031*0d0321e0SJeremy L Thompson "NUMEMODEIN", numemodein, 1032*0d0321e0SJeremy L Thompson "NUMEMODEOUT", numemodeout, 1033*0d0321e0SJeremy L Thompson "NNODES", nnodes, 1034*0d0321e0SJeremy L Thompson "NQPTS", nqpts, 1035*0d0321e0SJeremy L Thompson "NCOMP", ncomp 1036*0d0321e0SJeremy L Thompson ); CeedChk_Cu(ceed, ierr); 1037*0d0321e0SJeremy L Thompson ierr = CeedGetKernelCuda(ceed, diag->module, "linearDiagonal", 1038*0d0321e0SJeremy L Thompson &diag->linearDiagonal); CeedChk_Cu(ceed, ierr); 1039*0d0321e0SJeremy L Thompson ierr = CeedGetKernelCuda(ceed, diag->module, "linearPointBlockDiagonal", 1040*0d0321e0SJeremy L Thompson &diag->linearPointBlock); 1041*0d0321e0SJeremy L Thompson CeedChk_Cu(ceed, ierr); 1042*0d0321e0SJeremy L Thompson 1043*0d0321e0SJeremy L Thompson // Basis matrices 1044*0d0321e0SJeremy L Thompson const CeedInt qBytes = nqpts * sizeof(CeedScalar); 1045*0d0321e0SJeremy L Thompson const CeedInt iBytes = qBytes * nnodes; 1046*0d0321e0SJeremy L Thompson const CeedInt gBytes = qBytes * nnodes * dim; 1047*0d0321e0SJeremy L Thompson const CeedInt eBytes = sizeof(CeedEvalMode); 1048*0d0321e0SJeremy L Thompson const CeedScalar *interpin, *interpout, *gradin, *gradout; 1049*0d0321e0SJeremy L Thompson 1050*0d0321e0SJeremy L Thompson // CEED_EVAL_NONE 1051*0d0321e0SJeremy L Thompson CeedScalar *identity = NULL; 1052*0d0321e0SJeremy L Thompson bool evalNone = false; 1053*0d0321e0SJeremy L Thompson for (CeedInt i=0; i<numemodein; i++) 1054*0d0321e0SJeremy L Thompson evalNone = evalNone || (emodein[i] == CEED_EVAL_NONE); 1055*0d0321e0SJeremy L Thompson for (CeedInt i=0; i<numemodeout; i++) 1056*0d0321e0SJeremy L Thompson evalNone = evalNone || (emodeout[i] == CEED_EVAL_NONE); 1057*0d0321e0SJeremy L Thompson if (evalNone) { 1058*0d0321e0SJeremy L Thompson ierr = CeedCalloc(nqpts*nnodes, &identity); CeedChkBackend(ierr); 1059*0d0321e0SJeremy L Thompson for (CeedInt i=0; i<(nnodes<nqpts?nnodes:nqpts); i++) 1060*0d0321e0SJeremy L Thompson identity[i*nnodes+i] = 1.0; 1061*0d0321e0SJeremy L Thompson ierr = cudaMalloc((void **)&diag->d_identity, iBytes); CeedChk_Cu(ceed, ierr); 1062*0d0321e0SJeremy L Thompson ierr = cudaMemcpy(diag->d_identity, identity, iBytes, 1063*0d0321e0SJeremy L Thompson cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 1064*0d0321e0SJeremy L Thompson } 1065*0d0321e0SJeremy L Thompson 1066*0d0321e0SJeremy L Thompson // CEED_EVAL_INTERP 1067*0d0321e0SJeremy L Thompson ierr = CeedBasisGetInterp(basisin, &interpin); CeedChkBackend(ierr); 1068*0d0321e0SJeremy L Thompson ierr = cudaMalloc((void **)&diag->d_interpin, iBytes); CeedChk_Cu(ceed, ierr); 1069*0d0321e0SJeremy L Thompson ierr = cudaMemcpy(diag->d_interpin, interpin, iBytes, 1070*0d0321e0SJeremy L Thompson cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 1071*0d0321e0SJeremy L Thompson ierr = CeedBasisGetInterp(basisout, &interpout); CeedChkBackend(ierr); 1072*0d0321e0SJeremy L Thompson ierr = cudaMalloc((void **)&diag->d_interpout, iBytes); CeedChk_Cu(ceed, ierr); 1073*0d0321e0SJeremy L Thompson ierr = cudaMemcpy(diag->d_interpout, interpout, iBytes, 1074*0d0321e0SJeremy L Thompson cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 1075*0d0321e0SJeremy L Thompson 1076*0d0321e0SJeremy L Thompson // CEED_EVAL_GRAD 1077*0d0321e0SJeremy L Thompson ierr = CeedBasisGetGrad(basisin, &gradin); CeedChkBackend(ierr); 1078*0d0321e0SJeremy L Thompson ierr = cudaMalloc((void **)&diag->d_gradin, gBytes); CeedChk_Cu(ceed, ierr); 1079*0d0321e0SJeremy L Thompson ierr = cudaMemcpy(diag->d_gradin, gradin, gBytes, 1080*0d0321e0SJeremy L Thompson cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 1081*0d0321e0SJeremy L Thompson ierr = CeedBasisGetGrad(basisout, &gradout); CeedChkBackend(ierr); 1082*0d0321e0SJeremy L Thompson ierr = cudaMalloc((void **)&diag->d_gradout, gBytes); CeedChk_Cu(ceed, ierr); 1083*0d0321e0SJeremy L Thompson ierr = cudaMemcpy(diag->d_gradout, gradout, gBytes, 1084*0d0321e0SJeremy L Thompson cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 1085*0d0321e0SJeremy L Thompson 1086*0d0321e0SJeremy L Thompson // Arrays of emodes 1087*0d0321e0SJeremy L Thompson ierr = cudaMalloc((void **)&diag->d_emodein, numemodein * eBytes); 1088*0d0321e0SJeremy L Thompson CeedChk_Cu(ceed, ierr); 1089*0d0321e0SJeremy L Thompson ierr = cudaMemcpy(diag->d_emodein, emodein, numemodein * eBytes, 1090*0d0321e0SJeremy L Thompson cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 1091*0d0321e0SJeremy L Thompson ierr = cudaMalloc((void **)&diag->d_emodeout, numemodeout * eBytes); 1092*0d0321e0SJeremy L Thompson CeedChk_Cu(ceed, ierr); 1093*0d0321e0SJeremy L Thompson ierr = cudaMemcpy(diag->d_emodeout, emodeout, numemodeout * eBytes, 1094*0d0321e0SJeremy L Thompson cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 1095*0d0321e0SJeremy L Thompson 1096*0d0321e0SJeremy L Thompson // Restriction 1097*0d0321e0SJeremy L Thompson diag->diagrstr = rstrout; 1098*0d0321e0SJeremy L Thompson 1099*0d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 1100*0d0321e0SJeremy L Thompson } 1101*0d0321e0SJeremy L Thompson 1102*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1103*0d0321e0SJeremy L Thompson // Assemble diagonal common code 1104*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1105*0d0321e0SJeremy L Thompson static inline int CeedOperatorAssembleDiagonalCore_Cuda(CeedOperator op, 1106*0d0321e0SJeremy L Thompson CeedVector assembled, CeedRequest *request, const bool pointBlock) { 1107*0d0321e0SJeremy L Thompson int ierr; 1108*0d0321e0SJeremy L Thompson Ceed ceed; 1109*0d0321e0SJeremy L Thompson ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 1110*0d0321e0SJeremy L Thompson CeedOperator_Cuda *impl; 1111*0d0321e0SJeremy L Thompson ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr); 1112*0d0321e0SJeremy L Thompson 1113*0d0321e0SJeremy L Thompson // Assemble QFunction 1114*0d0321e0SJeremy L Thompson CeedVector assembledqf; 1115*0d0321e0SJeremy L Thompson CeedElemRestriction rstr; 1116*0d0321e0SJeremy L Thompson ierr = CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembledqf, 1117*0d0321e0SJeremy L Thompson &rstr, request); CeedChkBackend(ierr); 1118*0d0321e0SJeremy L Thompson ierr = CeedElemRestrictionDestroy(&rstr); CeedChkBackend(ierr); 1119*0d0321e0SJeremy L Thompson CeedScalar maxnorm = 0; 1120*0d0321e0SJeremy L Thompson ierr = CeedVectorNorm(assembledqf, CEED_NORM_MAX, &maxnorm); 1121*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 1122*0d0321e0SJeremy L Thompson 1123*0d0321e0SJeremy L Thompson // Setup 1124*0d0321e0SJeremy L Thompson if (!impl->diag) { 1125*0d0321e0SJeremy L Thompson ierr = CeedOperatorAssembleDiagonalSetup_Cuda(op, pointBlock); 1126*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 1127*0d0321e0SJeremy L Thompson } 1128*0d0321e0SJeremy L Thompson CeedOperatorDiag_Cuda *diag = impl->diag; 1129*0d0321e0SJeremy L Thompson assert(diag != NULL); 1130*0d0321e0SJeremy L Thompson 1131*0d0321e0SJeremy L Thompson // Restriction 1132*0d0321e0SJeremy L Thompson if (pointBlock && !diag->pbdiagrstr) { 1133*0d0321e0SJeremy L Thompson CeedElemRestriction pbdiagrstr; 1134*0d0321e0SJeremy L Thompson ierr = CreatePBRestriction(diag->diagrstr, &pbdiagrstr); CeedChkBackend(ierr); 1135*0d0321e0SJeremy L Thompson diag->pbdiagrstr = pbdiagrstr; 1136*0d0321e0SJeremy L Thompson } 1137*0d0321e0SJeremy L Thompson CeedElemRestriction diagrstr = pointBlock ? diag->pbdiagrstr : diag->diagrstr; 1138*0d0321e0SJeremy L Thompson 1139*0d0321e0SJeremy L Thompson // Create diagonal vector 1140*0d0321e0SJeremy L Thompson CeedVector elemdiag = pointBlock ? diag->pbelemdiag : diag->elemdiag; 1141*0d0321e0SJeremy L Thompson if (!elemdiag) { 1142*0d0321e0SJeremy L Thompson ierr = CeedElemRestrictionCreateVector(diagrstr, NULL, &elemdiag); 1143*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 1144*0d0321e0SJeremy L Thompson if (pointBlock) 1145*0d0321e0SJeremy L Thompson diag->pbelemdiag = elemdiag; 1146*0d0321e0SJeremy L Thompson else 1147*0d0321e0SJeremy L Thompson diag->elemdiag = elemdiag; 1148*0d0321e0SJeremy L Thompson } 1149*0d0321e0SJeremy L Thompson ierr = CeedVectorSetValue(elemdiag, 0.0); CeedChkBackend(ierr); 1150*0d0321e0SJeremy L Thompson 1151*0d0321e0SJeremy L Thompson // Assemble element operator diagonals 1152*0d0321e0SJeremy L Thompson CeedScalar *elemdiagarray; 1153*0d0321e0SJeremy L Thompson const CeedScalar *assembledqfarray; 1154*0d0321e0SJeremy L Thompson ierr = CeedVectorGetArray(elemdiag, CEED_MEM_DEVICE, &elemdiagarray); 1155*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 1156*0d0321e0SJeremy L Thompson ierr = CeedVectorGetArrayRead(assembledqf, CEED_MEM_DEVICE, &assembledqfarray); 1157*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 1158*0d0321e0SJeremy L Thompson CeedInt nelem; 1159*0d0321e0SJeremy L Thompson ierr = CeedElemRestrictionGetNumElements(diagrstr, &nelem); 1160*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 1161*0d0321e0SJeremy L Thompson 1162*0d0321e0SJeremy L Thompson // Compute the diagonal of B^T D B 1163*0d0321e0SJeremy L Thompson int elemsPerBlock = 1; 1164*0d0321e0SJeremy L Thompson int grid = nelem/elemsPerBlock+((nelem/elemsPerBlock*elemsPerBlock<nelem)?1:0); 1165*0d0321e0SJeremy L Thompson void *args[] = {(void *) &nelem, (void *) &maxnorm, &diag->d_identity, 1166*0d0321e0SJeremy L Thompson &diag->d_interpin, &diag->d_gradin, &diag->d_interpout, 1167*0d0321e0SJeremy L Thompson &diag->d_gradout, &diag->d_emodein, &diag->d_emodeout, 1168*0d0321e0SJeremy L Thompson &assembledqfarray, &elemdiagarray 1169*0d0321e0SJeremy L Thompson }; 1170*0d0321e0SJeremy L Thompson if (pointBlock) { 1171*0d0321e0SJeremy L Thompson ierr = CeedRunKernelDimCuda(ceed, diag->linearPointBlock, grid, 1172*0d0321e0SJeremy L Thompson diag->nnodes, 1, elemsPerBlock, args); 1173*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 1174*0d0321e0SJeremy L Thompson } else { 1175*0d0321e0SJeremy L Thompson ierr = CeedRunKernelDimCuda(ceed, diag->linearDiagonal, grid, 1176*0d0321e0SJeremy L Thompson diag->nnodes, 1, elemsPerBlock, args); 1177*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 1178*0d0321e0SJeremy L Thompson } 1179*0d0321e0SJeremy L Thompson 1180*0d0321e0SJeremy L Thompson // Restore arrays 1181*0d0321e0SJeremy L Thompson ierr = CeedVectorRestoreArray(elemdiag, &elemdiagarray); CeedChkBackend(ierr); 1182*0d0321e0SJeremy L Thompson ierr = CeedVectorRestoreArrayRead(assembledqf, &assembledqfarray); 1183*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 1184*0d0321e0SJeremy L Thompson 1185*0d0321e0SJeremy L Thompson // Assemble local operator diagonal 1186*0d0321e0SJeremy L Thompson ierr = CeedElemRestrictionApply(diagrstr, CEED_TRANSPOSE, elemdiag, 1187*0d0321e0SJeremy L Thompson assembled, request); CeedChkBackend(ierr); 1188*0d0321e0SJeremy L Thompson 1189*0d0321e0SJeremy L Thompson // Cleanup 1190*0d0321e0SJeremy L Thompson ierr = CeedVectorDestroy(&assembledqf); CeedChkBackend(ierr); 1191*0d0321e0SJeremy L Thompson 1192*0d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 1193*0d0321e0SJeremy L Thompson } 1194*0d0321e0SJeremy L Thompson 1195*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1196*0d0321e0SJeremy L Thompson // Assemble composite diagonal common code 1197*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1198*0d0321e0SJeremy L Thompson static inline int CeedOperatorLinearAssembleAddDiagonalCompositeCore_Cuda( 1199*0d0321e0SJeremy L Thompson CeedOperator op, CeedVector assembled, CeedRequest *request, 1200*0d0321e0SJeremy L Thompson const bool pointBlock) { 1201*0d0321e0SJeremy L Thompson int ierr; 1202*0d0321e0SJeremy L Thompson CeedInt numSub; 1203*0d0321e0SJeremy L Thompson CeedOperator *subOperators; 1204*0d0321e0SJeremy L Thompson ierr = CeedOperatorGetNumSub(op, &numSub); CeedChkBackend(ierr); 1205*0d0321e0SJeremy L Thompson ierr = CeedOperatorGetSubList(op, &subOperators); CeedChkBackend(ierr); 1206*0d0321e0SJeremy L Thompson for (CeedInt i = 0; i < numSub; i++) { 1207*0d0321e0SJeremy L Thompson ierr = CeedOperatorAssembleDiagonalCore_Cuda(subOperators[i], assembled, 1208*0d0321e0SJeremy L Thompson request, pointBlock); CeedChkBackend(ierr); 1209*0d0321e0SJeremy L Thompson } 1210*0d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 1211*0d0321e0SJeremy L Thompson } 1212*0d0321e0SJeremy L Thompson 1213*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1214*0d0321e0SJeremy L Thompson // Assemble Linear Diagonal 1215*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1216*0d0321e0SJeremy L Thompson static int CeedOperatorLinearAssembleAddDiagonal_Cuda(CeedOperator op, 1217*0d0321e0SJeremy L Thompson CeedVector assembled, CeedRequest *request) { 1218*0d0321e0SJeremy L Thompson int ierr; 1219*0d0321e0SJeremy L Thompson bool isComposite; 1220*0d0321e0SJeremy L Thompson ierr = CeedOperatorIsComposite(op, &isComposite); CeedChkBackend(ierr); 1221*0d0321e0SJeremy L Thompson if (isComposite) { 1222*0d0321e0SJeremy L Thompson return CeedOperatorLinearAssembleAddDiagonalCompositeCore_Cuda(op, assembled, 1223*0d0321e0SJeremy L Thompson request, false); 1224*0d0321e0SJeremy L Thompson } else { 1225*0d0321e0SJeremy L Thompson return CeedOperatorAssembleDiagonalCore_Cuda(op, assembled, request, false); 1226*0d0321e0SJeremy L Thompson } 1227*0d0321e0SJeremy L Thompson } 1228*0d0321e0SJeremy L Thompson 1229*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1230*0d0321e0SJeremy L Thompson // Assemble Linear Point Block Diagonal 1231*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1232*0d0321e0SJeremy L Thompson static int CeedOperatorLinearAssembleAddPointBlockDiagonal_Cuda(CeedOperator op, 1233*0d0321e0SJeremy L Thompson CeedVector assembled, CeedRequest *request) { 1234*0d0321e0SJeremy L Thompson int ierr; 1235*0d0321e0SJeremy L Thompson bool isComposite; 1236*0d0321e0SJeremy L Thompson ierr = CeedOperatorIsComposite(op, &isComposite); CeedChkBackend(ierr); 1237*0d0321e0SJeremy L Thompson if (isComposite) { 1238*0d0321e0SJeremy L Thompson return CeedOperatorLinearAssembleAddDiagonalCompositeCore_Cuda(op, assembled, 1239*0d0321e0SJeremy L Thompson request, true); 1240*0d0321e0SJeremy L Thompson } else { 1241*0d0321e0SJeremy L Thompson return CeedOperatorAssembleDiagonalCore_Cuda(op, assembled, request, true); 1242*0d0321e0SJeremy L Thompson } 1243*0d0321e0SJeremy L Thompson } 1244*0d0321e0SJeremy L Thompson 1245*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1246*0d0321e0SJeremy L Thompson // Create operator 1247*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1248*0d0321e0SJeremy L Thompson int CeedOperatorCreate_Cuda(CeedOperator op) { 1249*0d0321e0SJeremy L Thompson int ierr; 1250*0d0321e0SJeremy L Thompson Ceed ceed; 1251*0d0321e0SJeremy L Thompson ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 1252*0d0321e0SJeremy L Thompson CeedOperator_Cuda *impl; 1253*0d0321e0SJeremy L Thompson 1254*0d0321e0SJeremy L Thompson ierr = CeedCalloc(1, &impl); CeedChkBackend(ierr); 1255*0d0321e0SJeremy L Thompson ierr = CeedOperatorSetData(op, impl); CeedChkBackend(ierr); 1256*0d0321e0SJeremy L Thompson 1257*0d0321e0SJeremy L Thompson ierr = CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction", 1258*0d0321e0SJeremy L Thompson CeedOperatorLinearAssembleQFunction_Cuda); 1259*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 1260*0d0321e0SJeremy L Thompson ierr = CeedSetBackendFunction(ceed, "Operator", op, 1261*0d0321e0SJeremy L Thompson "LinearAssembleQFunctionUpdate", 1262*0d0321e0SJeremy L Thompson CeedOperatorLinearAssembleQFunctionUpdate_Cuda); 1263*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 1264*0d0321e0SJeremy L Thompson ierr = CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", 1265*0d0321e0SJeremy L Thompson CeedOperatorLinearAssembleAddDiagonal_Cuda); 1266*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 1267*0d0321e0SJeremy L Thompson ierr = CeedSetBackendFunction(ceed, "Operator", op, 1268*0d0321e0SJeremy L Thompson "LinearAssembleAddPointBlockDiagonal", 1269*0d0321e0SJeremy L Thompson CeedOperatorLinearAssembleAddPointBlockDiagonal_Cuda); 1270*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 1271*0d0321e0SJeremy L Thompson ierr = CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", 1272*0d0321e0SJeremy L Thompson CeedOperatorApplyAdd_Cuda); CeedChkBackend(ierr); 1273*0d0321e0SJeremy L Thompson ierr = CeedSetBackendFunction(ceed, "Operator", op, "Destroy", 1274*0d0321e0SJeremy L Thompson CeedOperatorDestroy_Cuda); CeedChkBackend(ierr); 1275*0d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 1276*0d0321e0SJeremy L Thompson } 1277*0d0321e0SJeremy L Thompson 1278*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1279*0d0321e0SJeremy L Thompson // Composite Operator Create 1280*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1281*0d0321e0SJeremy L Thompson int CeedCompositeOperatorCreate_Cuda(CeedOperator op) { 1282*0d0321e0SJeremy L Thompson int ierr; 1283*0d0321e0SJeremy L Thompson Ceed ceed; 1284*0d0321e0SJeremy L Thompson ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 1285*0d0321e0SJeremy L Thompson 1286*0d0321e0SJeremy L Thompson ierr = CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", 1287*0d0321e0SJeremy L Thompson CeedOperatorLinearAssembleAddDiagonal_Cuda); 1288*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 1289*0d0321e0SJeremy L Thompson ierr = CeedSetBackendFunction(ceed, "Operator", op, 1290*0d0321e0SJeremy L Thompson "LinearAssembleAddPointBlockDiagonal", 1291*0d0321e0SJeremy L Thompson CeedOperatorLinearAssembleAddPointBlockDiagonal_Cuda); 1292*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 1293*0d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 1294*0d0321e0SJeremy L Thompson } 1295*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1296