1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 3 // All Rights reserved. See files LICENSE and NOTICE for details. 4 // 5 // This file is part of CEED, a collection of benchmarks, miniapps, software 6 // libraries and APIs for efficient high-order finite element and spectral 7 // element discretizations for exascale applications. For more information and 8 // source code availability see http://github.com/ceed. 9 // 10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11 // a collaborative effort of two U.S. Department of Energy organizations (Office 12 // of Science and the National Nuclear Security Administration) responsible for 13 // the planning and preparation of a capable exascale ecosystem, including 14 // software, applications, hardware, advanced system engineering and early 15 // testbed platforms, in support of the nation's exascale computing imperative. 16 17 #include <ceed-backend.h> 18 #include "ceed-cuda-gen.h" 19 #include "ceed-cuda-gen-operator-build.h" 20 #include "../cuda/ceed-cuda.h" 21 22 static int CeedOperatorDestroy_Cuda_gen(CeedOperator op) { 23 int ierr; 24 CeedOperator_Cuda_gen *impl; 25 ierr = CeedOperatorGetData(op, (void *)&impl); CeedChk(ierr); 26 27 ierr = CeedFree(&impl); CeedChk(ierr); 28 return 0; 29 } 30 31 static int CeedOperatorApply_Cuda_gen(CeedOperator op, CeedVector invec, 32 CeedVector outvec, CeedRequest *request) { 33 int ierr; 34 Ceed ceed; 35 ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 36 CeedOperator_Cuda_gen *data; 37 ierr = CeedOperatorGetData(op, (void *)&data); CeedChk(ierr); 38 CeedQFunction qf; 39 CeedQFunction_Cuda_gen *qf_data; 40 ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr); 41 ierr = CeedQFunctionGetData(qf, (void **)&qf_data); CeedChk(ierr); 42 CeedInt nelem, numinputfields, numoutputfields; 43 ierr = CeedOperatorGetNumElements(op, &nelem); CeedChk(ierr); 44 ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields); 45 CeedChk(ierr); 46 CeedOperatorField *opinputfields, *opoutputfields; 47 ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields); 48 CeedChk(ierr); 49 CeedQFunctionField *qfinputfields, *qfoutputfields; 50 ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields); 51 CeedChk(ierr); 52 CeedEvalMode emode; 53 CeedVector vec; 54 55 //Creation of the operator 56 ierr = CeedCudaGenOperatorBuild(op); CeedChk(ierr); 57 58 // Zero lvecs 59 for (CeedInt i = 0; i < numoutputfields; i++) { 60 ierr = CeedOperatorFieldGetVector(opoutputfields[i], &vec); CeedChk(ierr); 61 if (vec == CEED_VECTOR_ACTIVE) 62 vec = outvec; 63 ierr = CeedVectorSetValue(vec, 0.0); CeedChk(ierr); 64 } 65 66 // Input vectors 67 for (CeedInt i = 0; i < numinputfields; i++) { 68 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 69 CeedChk(ierr); 70 if (emode == CEED_EVAL_WEIGHT) { // Skip 71 data->fields.in[i] = NULL; 72 } else { 73 // Get input vector 74 ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChk(ierr); 75 if (vec == CEED_VECTOR_ACTIVE) vec = invec; 76 ierr = CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.in[i]); 77 CeedChk(ierr); 78 } 79 } 80 81 // Output vectors 82 for (CeedInt i = 0; i < numoutputfields; i++) { 83 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 84 CeedChk(ierr); 85 if (emode == CEED_EVAL_WEIGHT) { // Skip 86 data->fields.out[i] = NULL; 87 } else { 88 // Get output vector 89 ierr = CeedOperatorFieldGetVector(opoutputfields[i], &vec); CeedChk(ierr); 90 if (vec == CEED_VECTOR_ACTIVE) vec = outvec; 91 ierr = CeedVectorGetArray(vec, CEED_MEM_DEVICE, &data->fields.out[i]); 92 CeedChk(ierr); 93 } 94 } 95 96 // Copy the context 97 size_t ctxsize; 98 ierr = CeedQFunctionGetContextSize(qf, &ctxsize); CeedChk(ierr); 99 if (ctxsize > 0) { 100 if(!qf_data->d_c) { 101 ierr = cudaMalloc(&qf_data->d_c, ctxsize); CeedChk_Cu(ceed, ierr); 102 } 103 void *ctx; 104 ierr = CeedQFunctionGetInnerContext(qf, &ctx); CeedChk(ierr); 105 ierr = cudaMemcpy(qf_data->d_c, ctx, ctxsize, cudaMemcpyHostToDevice); 106 CeedChk_Cu(ceed, ierr); 107 } 108 109 // Apply operator 110 void *opargs[] = {(void *) &nelem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W}; 111 const CeedInt dim = data->dim; 112 const CeedInt Q1d = data->Q1d; 113 if (dim==1) { 114 const CeedInt elemsPerBlock = 32; 115 CeedInt grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem) 116 ? 1 : 0 ); 117 CeedInt sharedMem = elemsPerBlock*Q1d*sizeof(CeedScalar); 118 ierr = CeedRunKernelDimSharedCuda(ceed, data->op, grid, Q1d, 1, elemsPerBlock, 119 sharedMem, opargs); 120 } else if (dim==2) { 121 const CeedInt elemsPerBlock = Q1d<4? 16 : 2; 122 CeedInt grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem) 123 ? 1 : 0 ); 124 CeedInt sharedMem = elemsPerBlock*Q1d*Q1d*sizeof(CeedScalar); 125 ierr = CeedRunKernelDimSharedCuda(ceed, data->op, grid, Q1d, Q1d, elemsPerBlock, 126 sharedMem, opargs); 127 } else if (dim==3) { 128 const CeedInt elemsPerBlock = Q1d<8? 4 : 1; 129 CeedInt grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem) 130 ? 1 : 0 ); 131 CeedInt sharedMem = elemsPerBlock*Q1d*Q1d*sizeof(CeedScalar); 132 ierr = CeedRunKernelDimSharedCuda(ceed, data->op, grid, Q1d, Q1d, elemsPerBlock, 133 sharedMem, opargs); 134 } 135 CeedChk(ierr); 136 137 // Restore input arrays 138 for (CeedInt i = 0; i < numinputfields; i++) { 139 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 140 CeedChk(ierr); 141 if (emode == CEED_EVAL_WEIGHT) { // Skip 142 } else { 143 ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChk(ierr); 144 if (vec == CEED_VECTOR_ACTIVE) vec = invec; 145 ierr = CeedVectorRestoreArrayRead(vec, &data->fields.in[i]); 146 CeedChk(ierr); 147 } 148 } 149 150 // Restore output arrays 151 for (CeedInt i = 0; i < numoutputfields; i++) { 152 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 153 CeedChk(ierr); 154 if (emode == CEED_EVAL_WEIGHT) { // Skip 155 } else { 156 ierr = CeedOperatorFieldGetVector(opoutputfields[i], &vec); CeedChk(ierr); 157 if (vec == CEED_VECTOR_ACTIVE) vec = outvec; 158 ierr = CeedVectorRestoreArray(vec, &data->fields.out[i]); 159 CeedChk(ierr); 160 } 161 } 162 163 return 0; 164 } 165 166 int CeedOperatorCreate_Cuda_gen(CeedOperator op) { 167 int ierr; 168 Ceed ceed; 169 ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 170 CeedOperator_Cuda_gen *impl; 171 172 ierr = CeedCalloc(1, &impl); CeedChk(ierr); 173 ierr = CeedOperatorSetData(op, (void *)&impl); 174 175 ierr = CeedSetBackendFunction(ceed, "Operator", op, "Apply", 176 CeedOperatorApply_Cuda_gen); CeedChk(ierr); 177 ierr = CeedSetBackendFunction(ceed, "Operator", op, "Destroy", 178 CeedOperatorDestroy_Cuda_gen); CeedChk(ierr); 179 return 0; 180 } 181 182 int CeedCompositeOperatorCreate_Cuda_gen(CeedOperator op) { 183 int ierr; 184 Ceed ceed; 185 ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 186 return CeedError(ceed, 1, "Backend does not implement composite operators"); 187 } 188