1*241a4b83SYohann // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 2*241a4b83SYohann // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 3*241a4b83SYohann // All Rights reserved. See files LICENSE and NOTICE for details. 4*241a4b83SYohann // 5*241a4b83SYohann // This file is part of CEED, a collection of benchmarks, miniapps, software 6*241a4b83SYohann // libraries and APIs for efficient high-order finite element and spectral 7*241a4b83SYohann // element discretizations for exascale applications. For more information and 8*241a4b83SYohann // source code availability see http://github.com/ceed. 9*241a4b83SYohann // 10*241a4b83SYohann // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11*241a4b83SYohann // a collaborative effort of two U.S. Department of Energy organizations (Office 12*241a4b83SYohann // of Science and the National Nuclear Security Administration) responsible for 13*241a4b83SYohann // the planning and preparation of a capable exascale ecosystem, including 14*241a4b83SYohann // software, applications, hardware, advanced system engineering and early 15*241a4b83SYohann // testbed platforms, in support of the nation's exascale computing imperative. 16*241a4b83SYohann 17*241a4b83SYohann #include <ceed-backend.h> 18*241a4b83SYohann #include "ceed-cuda-gen.h" 19*241a4b83SYohann #include "ceed-cuda-gen-operator-build.h" 20*241a4b83SYohann #include "../cuda/ceed-cuda.h" 21*241a4b83SYohann 22*241a4b83SYohann static int CeedOperatorDestroy_Cuda_gen(CeedOperator op) { 23*241a4b83SYohann int ierr; 24*241a4b83SYohann CeedOperator_Cuda_gen *impl; 25*241a4b83SYohann ierr = CeedOperatorGetData(op, (void *)&impl); CeedChk(ierr); 26*241a4b83SYohann 27*241a4b83SYohann ierr = CeedFree(&impl); CeedChk(ierr); 28*241a4b83SYohann return 0; 29*241a4b83SYohann } 30*241a4b83SYohann 31*241a4b83SYohann static int CeedOperatorApply_Cuda_gen(CeedOperator op, CeedVector invec, 32*241a4b83SYohann CeedVector outvec, CeedRequest *request) { 33*241a4b83SYohann int ierr; 34*241a4b83SYohann Ceed ceed; 35*241a4b83SYohann ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 36*241a4b83SYohann CeedOperator_Cuda_gen *data; 37*241a4b83SYohann ierr = CeedOperatorGetData(op, (void *)&data); CeedChk(ierr); 38*241a4b83SYohann CeedQFunction qf; 39*241a4b83SYohann CeedQFunction_Cuda_gen *qf_data; 40*241a4b83SYohann ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr); 41*241a4b83SYohann ierr = CeedQFunctionGetData(qf, (void **)&qf_data); CeedChk(ierr); 42*241a4b83SYohann CeedInt nelem, numinputfields, numoutputfields; 43*241a4b83SYohann ierr = CeedOperatorGetNumElements(op, &nelem); CeedChk(ierr); 44*241a4b83SYohann ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields); 45*241a4b83SYohann CeedChk(ierr); 46*241a4b83SYohann CeedOperatorField *opinputfields, *opoutputfields; 47*241a4b83SYohann ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields); 48*241a4b83SYohann CeedChk(ierr); 49*241a4b83SYohann CeedQFunctionField *qfinputfields, *qfoutputfields; 50*241a4b83SYohann ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields); 51*241a4b83SYohann CeedChk(ierr); 52*241a4b83SYohann CeedEvalMode emode; 53*241a4b83SYohann CeedVector vec; 54*241a4b83SYohann 55*241a4b83SYohann //Creation of the operator 56*241a4b83SYohann ierr = CeedCudaGenOperatorBuild(op); CeedChk(ierr); 57*241a4b83SYohann 58*241a4b83SYohann // Zero lvecs 59*241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 60*241a4b83SYohann ierr = CeedOperatorFieldGetVector(opoutputfields[i], &vec); CeedChk(ierr); 61*241a4b83SYohann if (vec == CEED_VECTOR_ACTIVE) 62*241a4b83SYohann vec = outvec; 63*241a4b83SYohann ierr = CeedVectorSetValue(vec, 0.0); CeedChk(ierr); 64*241a4b83SYohann } 65*241a4b83SYohann 66*241a4b83SYohann // Input vectors 67*241a4b83SYohann for (CeedInt i = 0; i < numinputfields; i++) { 68*241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 69*241a4b83SYohann CeedChk(ierr); 70*241a4b83SYohann if (emode == CEED_EVAL_WEIGHT) { // Skip 71*241a4b83SYohann data->fields.in[i] = NULL; 72*241a4b83SYohann } else { 73*241a4b83SYohann // Get input vector 74*241a4b83SYohann ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChk(ierr); 75*241a4b83SYohann if (vec == CEED_VECTOR_ACTIVE) vec = invec; 76*241a4b83SYohann ierr = CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.in[i]); 77*241a4b83SYohann CeedChk(ierr); 78*241a4b83SYohann } 79*241a4b83SYohann } 80*241a4b83SYohann 81*241a4b83SYohann // Output vectors 82*241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 83*241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 84*241a4b83SYohann CeedChk(ierr); 85*241a4b83SYohann if (emode == CEED_EVAL_WEIGHT) { // Skip 86*241a4b83SYohann data->fields.out[i] = NULL; 87*241a4b83SYohann } else { 88*241a4b83SYohann // Get output vector 89*241a4b83SYohann ierr = CeedOperatorFieldGetVector(opoutputfields[i], &vec); CeedChk(ierr); 90*241a4b83SYohann if (vec == CEED_VECTOR_ACTIVE) vec = outvec; 91*241a4b83SYohann ierr = CeedVectorGetArray(vec, CEED_MEM_DEVICE, &data->fields.out[i]); 92*241a4b83SYohann CeedChk(ierr); 93*241a4b83SYohann } 94*241a4b83SYohann } 95*241a4b83SYohann 96*241a4b83SYohann // Copy the context 97*241a4b83SYohann size_t ctxsize; 98*241a4b83SYohann ierr = CeedQFunctionGetContextSize(qf, &ctxsize); CeedChk(ierr); 99*241a4b83SYohann if (ctxsize > 0) { 100*241a4b83SYohann if(!qf_data->d_c) { 101*241a4b83SYohann ierr = cudaMalloc(&qf_data->d_c, ctxsize); CeedChk_Cu(ceed, ierr); 102*241a4b83SYohann } 103*241a4b83SYohann void *ctx; 104*241a4b83SYohann ierr = CeedQFunctionGetInnerContext(qf, &ctx); CeedChk(ierr); 105*241a4b83SYohann ierr = cudaMemcpy(qf_data->d_c, ctx, ctxsize, cudaMemcpyHostToDevice); 106*241a4b83SYohann CeedChk_Cu(ceed, ierr); 107*241a4b83SYohann } 108*241a4b83SYohann 109*241a4b83SYohann // Apply operator 110*241a4b83SYohann void *opargs[] = {(void *) &nelem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W}; 111*241a4b83SYohann const CeedInt dim = data->dim; 112*241a4b83SYohann const CeedInt Q1d = data->Q1d; 113*241a4b83SYohann if (dim==1) { 114*241a4b83SYohann const CeedInt elemsPerBlock = 32; 115*241a4b83SYohann CeedInt grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem) 116*241a4b83SYohann ? 1 : 0 ); 117*241a4b83SYohann CeedInt sharedMem = elemsPerBlock*Q1d*sizeof(CeedScalar); 118*241a4b83SYohann ierr = CeedRunKernelDimSharedCuda(ceed, data->op, grid, Q1d, 1, elemsPerBlock, 119*241a4b83SYohann sharedMem, opargs); 120*241a4b83SYohann } else if (dim==2) { 121*241a4b83SYohann const CeedInt elemsPerBlock = Q1d<4? 16 : 2; 122*241a4b83SYohann CeedInt grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem) 123*241a4b83SYohann ? 1 : 0 ); 124*241a4b83SYohann CeedInt sharedMem = elemsPerBlock*Q1d*Q1d*sizeof(CeedScalar); 125*241a4b83SYohann ierr = CeedRunKernelDimSharedCuda(ceed, data->op, grid, Q1d, Q1d, elemsPerBlock, 126*241a4b83SYohann sharedMem, opargs); 127*241a4b83SYohann } else if (dim==3) { 128*241a4b83SYohann const CeedInt elemsPerBlock = Q1d<8? 4 : 1; 129*241a4b83SYohann CeedInt grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem) 130*241a4b83SYohann ? 1 : 0 ); 131*241a4b83SYohann CeedInt sharedMem = elemsPerBlock*Q1d*Q1d*sizeof(CeedScalar); 132*241a4b83SYohann ierr = CeedRunKernelDimSharedCuda(ceed, data->op, grid, Q1d, Q1d, elemsPerBlock, 133*241a4b83SYohann sharedMem, opargs); 134*241a4b83SYohann } 135*241a4b83SYohann CeedChk(ierr); 136*241a4b83SYohann 137*241a4b83SYohann // Restore input arrays 138*241a4b83SYohann for (CeedInt i = 0; i < numinputfields; i++) { 139*241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 140*241a4b83SYohann CeedChk(ierr); 141*241a4b83SYohann if (emode == CEED_EVAL_WEIGHT) { // Skip 142*241a4b83SYohann } else { 143*241a4b83SYohann ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChk(ierr); 144*241a4b83SYohann if (vec == CEED_VECTOR_ACTIVE) vec = invec; 145*241a4b83SYohann ierr = CeedVectorRestoreArrayRead(vec, &data->fields.in[i]); 146*241a4b83SYohann CeedChk(ierr); 147*241a4b83SYohann } 148*241a4b83SYohann } 149*241a4b83SYohann 150*241a4b83SYohann // Restore output arrays 151*241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 152*241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 153*241a4b83SYohann CeedChk(ierr); 154*241a4b83SYohann if (emode == CEED_EVAL_WEIGHT) { // Skip 155*241a4b83SYohann } else { 156*241a4b83SYohann ierr = CeedOperatorFieldGetVector(opoutputfields[i], &vec); CeedChk(ierr); 157*241a4b83SYohann if (vec == CEED_VECTOR_ACTIVE) vec = outvec; 158*241a4b83SYohann ierr = CeedVectorRestoreArray(vec, &data->fields.out[i]); 159*241a4b83SYohann CeedChk(ierr); 160*241a4b83SYohann } 161*241a4b83SYohann } 162*241a4b83SYohann 163*241a4b83SYohann return 0; 164*241a4b83SYohann } 165*241a4b83SYohann 166*241a4b83SYohann int CeedOperatorCreate_Cuda_gen(CeedOperator op) { 167*241a4b83SYohann int ierr; 168*241a4b83SYohann Ceed ceed; 169*241a4b83SYohann ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 170*241a4b83SYohann CeedOperator_Cuda_gen *impl; 171*241a4b83SYohann 172*241a4b83SYohann ierr = CeedCalloc(1, &impl); CeedChk(ierr); 173*241a4b83SYohann ierr = CeedOperatorSetData(op, (void *)&impl); 174*241a4b83SYohann 175*241a4b83SYohann ierr = CeedSetBackendFunction(ceed, "Operator", op, "Apply", 176*241a4b83SYohann CeedOperatorApply_Cuda_gen); CeedChk(ierr); 177*241a4b83SYohann ierr = CeedSetBackendFunction(ceed, "Operator", op, "Destroy", 178*241a4b83SYohann CeedOperatorDestroy_Cuda_gen); CeedChk(ierr); 179*241a4b83SYohann return 0; 180*241a4b83SYohann } 181*241a4b83SYohann 182*241a4b83SYohann int CeedCompositeOperatorCreate_Cuda_gen(CeedOperator op) { 183*241a4b83SYohann int ierr; 184*241a4b83SYohann Ceed ceed; 185*241a4b83SYohann ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 186*241a4b83SYohann return CeedError(ceed, 1, "Backend does not implement composite operators"); 187*241a4b83SYohann } 188