1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 3 // All Rights reserved. See files LICENSE and NOTICE for details. 4 // 5 // This file is part of CEED, a collection of benchmarks, miniapps, software 6 // libraries and APIs for efficient high-order finite element and spectral 7 // element discretizations for exascale applications. For more information and 8 // source code availability see http://github.com/ceed. 9 // 10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11 // a collaborative effort of two U.S. Department of Energy organizations (Office 12 // of Science and the National Nuclear Security Administration) responsible for 13 // the planning and preparation of a capable exascale ecosystem, including 14 // software, applications, hardware, advanced system engineering and early 15 // testbed platforms, in support of the nation's exascale computing imperative. 16 17 #include "ceed-cuda-gen.h" 18 #include "ceed-cuda-gen-operator-build.h" 19 20 //------------------------------------------------------------------------------ 21 // Destroy operator 22 //------------------------------------------------------------------------------ 23 static int CeedOperatorDestroy_Cuda_gen(CeedOperator op) { 24 int ierr; 25 CeedOperator_Cuda_gen *impl; 26 ierr = CeedOperatorGetData(op, (void *)&impl); CeedChk(ierr); 27 ierr = CeedFree(&impl); CeedChk(ierr); 28 return 0; 29 } 30 31 //------------------------------------------------------------------------------ 32 // Apply and add to output 33 //------------------------------------------------------------------------------ 34 static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector invec, 35 CeedVector outvec, CeedRequest *request) { 36 int ierr; 37 Ceed ceed; 38 ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 39 CeedOperator_Cuda_gen *data; 40 ierr = CeedOperatorGetData(op, (void *)&data); CeedChk(ierr); 41 CeedQFunction qf; 42 CeedQFunction_Cuda_gen *qf_data; 43 ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr); 44 ierr = CeedQFunctionGetData(qf, (void **)&qf_data); CeedChk(ierr); 45 CeedInt nelem, numinputfields, numoutputfields; 46 ierr = CeedOperatorGetNumElements(op, &nelem); CeedChk(ierr); 47 ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields); 48 CeedChk(ierr); 49 CeedOperatorField *opinputfields, *opoutputfields; 50 ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields); 51 CeedChk(ierr); 52 CeedQFunctionField *qfinputfields, *qfoutputfields; 53 ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields); 54 CeedChk(ierr); 55 CeedEvalMode emode; 56 CeedVector vec, outvecs[16] = {}; 57 58 //Creation of the operator 59 ierr = CeedCudaGenOperatorBuild(op); CeedChk(ierr); 60 61 // Input vectors 62 for (CeedInt i = 0; i < numinputfields; i++) { 63 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 64 CeedChk(ierr); 65 if (emode == CEED_EVAL_WEIGHT) { // Skip 66 data->fields.in[i] = NULL; 67 } else { 68 // Get input vector 69 ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChk(ierr); 70 if (vec == CEED_VECTOR_ACTIVE) vec = invec; 71 ierr = CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.in[i]); 72 CeedChk(ierr); 73 } 74 } 75 76 // Output vectors 77 for (CeedInt i = 0; i < numoutputfields; i++) { 78 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 79 CeedChk(ierr); 80 if (emode == CEED_EVAL_WEIGHT) { // Skip 81 data->fields.out[i] = NULL; 82 } else { 83 // Get output vector 84 ierr = CeedOperatorFieldGetVector(opoutputfields[i], &vec); CeedChk(ierr); 85 if (vec == CEED_VECTOR_ACTIVE) vec = outvec; 86 outvecs[i] = vec; 87 // Check for multiple output modes 88 CeedInt index = -1; 89 for (CeedInt j = 0; j < i; j++) { 90 if (vec == outvecs[j]) { 91 index = j; 92 break; 93 } 94 } 95 if (index == -1) { 96 ierr = CeedVectorGetArray(vec, CEED_MEM_DEVICE, &data->fields.out[i]); 97 CeedChk(ierr); 98 } else { 99 data->fields.out[i] = data->fields.out[index]; 100 } 101 } 102 } 103 104 // Copy the context 105 size_t ctxsize; 106 ierr = CeedQFunctionGetContextSize(qf, &ctxsize); CeedChk(ierr); 107 if (ctxsize > 0) { 108 if (!qf_data->d_c) { 109 ierr = cudaMalloc(&qf_data->d_c, ctxsize); CeedChk_Cu(ceed, ierr); 110 } 111 void *ctx; 112 ierr = CeedQFunctionGetInnerContext(qf, &ctx); CeedChk(ierr); 113 ierr = cudaMemcpy(qf_data->d_c, ctx, ctxsize, cudaMemcpyHostToDevice); 114 CeedChk_Cu(ceed, ierr); 115 } 116 117 // Apply operator 118 void *opargs[] = {(void *) &nelem, &qf_data->d_c, &data->indices, 119 &data->fields, &data->B, &data->G, &data->W 120 }; 121 const CeedInt dim = data->dim; 122 const CeedInt Q1d = data->Q1d; 123 if (dim==1) { 124 const CeedInt elemsPerBlock = 32; 125 CeedInt grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem) 126 ? 1 : 0 ); 127 CeedInt sharedMem = elemsPerBlock*Q1d*sizeof(CeedScalar); 128 ierr = CeedRunKernelDimSharedCuda(ceed, data->op, grid, Q1d, 1, elemsPerBlock, 129 sharedMem, opargs); 130 } else if (dim==2) { 131 const CeedInt elemsPerBlock = Q1d<4? 16 : 2; 132 CeedInt grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem) 133 ? 1 : 0 ); 134 CeedInt sharedMem = elemsPerBlock*Q1d*Q1d*sizeof(CeedScalar); 135 ierr = CeedRunKernelDimSharedCuda(ceed, data->op, grid, Q1d, Q1d, 136 elemsPerBlock, sharedMem, opargs); 137 } else if (dim==3) { 138 const CeedInt elemsPerBlock = Q1d<6? 4 : (Q1d<8? 2 : 1); 139 CeedInt grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem) 140 ? 1 : 0 ); 141 CeedInt sharedMem = elemsPerBlock*Q1d*Q1d*sizeof(CeedScalar); 142 ierr = CeedRunKernelDimSharedCuda(ceed, data->op, grid, Q1d, Q1d, 143 elemsPerBlock, sharedMem, opargs); 144 } 145 CeedChk(ierr); 146 147 // Restore input arrays 148 for (CeedInt i = 0; i < numinputfields; i++) { 149 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 150 CeedChk(ierr); 151 if (emode == CEED_EVAL_WEIGHT) { // Skip 152 } else { 153 ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChk(ierr); 154 if (vec == CEED_VECTOR_ACTIVE) vec = invec; 155 ierr = CeedVectorRestoreArrayRead(vec, &data->fields.in[i]); 156 CeedChk(ierr); 157 } 158 } 159 160 // Restore output arrays 161 for (CeedInt i = 0; i < numoutputfields; i++) { 162 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 163 CeedChk(ierr); 164 if (emode == CEED_EVAL_WEIGHT) { // Skip 165 } else { 166 ierr = CeedOperatorFieldGetVector(opoutputfields[i], &vec); CeedChk(ierr); 167 if (vec == CEED_VECTOR_ACTIVE) vec = outvec; 168 // Check for multiple output modes 169 CeedInt index = -1; 170 for (CeedInt j = 0; j < i; j++) { 171 if (vec == outvecs[j]) { 172 index = j; 173 break; 174 } 175 } 176 if (index == -1) { 177 ierr = CeedVectorRestoreArray(vec, &data->fields.out[i]); 178 CeedChk(ierr); 179 } 180 } 181 } 182 return 0; 183 } 184 185 //------------------------------------------------------------------------------ 186 // Assemble linear diagonal not supported 187 //------------------------------------------------------------------------------ 188 static int CeedOperatorLinearAssembleDiagonal_Cuda(CeedOperator op) { 189 int ierr; 190 Ceed ceed; 191 ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 192 return CeedError(ceed, 1, 193 "Backend does not implement Operator diagonal assembly"); 194 } 195 196 //------------------------------------------------------------------------------ 197 // Assemble linear point block diagonal not supported 198 //------------------------------------------------------------------------------ 199 static int CeedOperatorLinearAssemblePointBlockDiagonal_Cuda(CeedOperator op) { 200 int ierr; 201 Ceed ceed; 202 ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 203 return CeedError(ceed, 1, 204 "Backend does not implement Operator point block diagonal assembly"); 205 } 206 207 //------------------------------------------------------------------------------ 208 // Create FDM element inverse not supported 209 //------------------------------------------------------------------------------ 210 static int CeedOperatorCreateFDMElementInverse_Cuda(CeedOperator op) { 211 int ierr; 212 Ceed ceed; 213 ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 214 return CeedError(ceed, 1, "Backend does not implement FDM inverse creation"); 215 } 216 217 //------------------------------------------------------------------------------ 218 // Create operator 219 //------------------------------------------------------------------------------ 220 int CeedOperatorCreate_Cuda_gen(CeedOperator op) { 221 int ierr; 222 Ceed ceed; 223 ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 224 CeedOperator_Cuda_gen *impl; 225 226 ierr = CeedCalloc(1, &impl); CeedChk(ierr); 227 ierr = CeedOperatorSetData(op, (void *)&impl); CeedChk(ierr); 228 229 ierr = CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleDiagonal", 230 CeedOperatorLinearAssembleDiagonal_Cuda); 231 CeedChk(ierr); 232 ierr = CeedSetBackendFunction(ceed, "Operator", op, 233 "LinearAssemblePointBlockDiagonal", 234 CeedOperatorLinearAssemblePointBlockDiagonal_Cuda); 235 CeedChk(ierr); 236 ierr = CeedSetBackendFunction(ceed, "Operator", op, "CreateFDMElementInverse", 237 CeedOperatorCreateFDMElementInverse_Cuda); 238 CeedChk(ierr); 239 ierr = CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", 240 CeedOperatorApplyAdd_Cuda_gen); CeedChk(ierr); 241 ierr = CeedSetBackendFunction(ceed, "Operator", op, "Destroy", 242 CeedOperatorDestroy_Cuda_gen); CeedChk(ierr); 243 return 0; 244 } 245 //------------------------------------------------------------------------------ 246