1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 3 // All Rights reserved. See files LICENSE and NOTICE for details. 4 // 5 // This file is part of CEED, a collection of benchmarks, miniapps, software 6 // libraries and APIs for efficient high-order finite element and spectral 7 // element discretizations for exascale applications. For more information and 8 // source code availability see http://github.com/ceed. 9 // 10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11 // a collaborative effort of two U.S. Department of Energy organizations (Office 12 // of Science and the National Nuclear Security Administration) responsible for 13 // the planning and preparation of a capable exascale ecosystem, including 14 // software, applications, hardware, advanced system engineering and early 15 // testbed platforms, in support of the nation's exascale computing imperative. 16 17 #include <ceed-backend.h> 18 #include "ceed-cuda-gen.h" 19 #include "ceed-cuda-gen-operator-build.h" 20 #include "../cuda/ceed-cuda.h" 21 22 //------------------------------------------------------------------------------ 23 // Destroy operator 24 //------------------------------------------------------------------------------ 25 static int CeedOperatorDestroy_Cuda_gen(CeedOperator op) { 26 int ierr; 27 CeedOperator_Cuda_gen *impl; 28 ierr = CeedOperatorGetData(op, (void *)&impl); CeedChk(ierr); 29 ierr = CeedFree(&impl); CeedChk(ierr); 30 return 0; 31 } 32 33 //------------------------------------------------------------------------------ 34 // Apply and add to output 35 //------------------------------------------------------------------------------ 36 static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector invec, 37 CeedVector outvec, CeedRequest *request) { 38 int ierr; 39 Ceed ceed; 40 ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 41 CeedOperator_Cuda_gen *data; 42 ierr = CeedOperatorGetData(op, (void *)&data); CeedChk(ierr); 43 CeedQFunction qf; 44 CeedQFunction_Cuda_gen *qf_data; 45 ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr); 46 ierr = CeedQFunctionGetData(qf, (void **)&qf_data); CeedChk(ierr); 47 CeedInt nelem, numinputfields, numoutputfields; 48 ierr = CeedOperatorGetNumElements(op, &nelem); CeedChk(ierr); 49 ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields); 50 CeedChk(ierr); 51 CeedOperatorField *opinputfields, *opoutputfields; 52 ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields); 53 CeedChk(ierr); 54 CeedQFunctionField *qfinputfields, *qfoutputfields; 55 ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields); 56 CeedChk(ierr); 57 CeedEvalMode emode; 58 CeedVector vec, outvecs[16] = {}; 59 60 //Creation of the operator 61 ierr = CeedCudaGenOperatorBuild(op); CeedChk(ierr); 62 63 // Input vectors 64 for (CeedInt i = 0; i < numinputfields; i++) { 65 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 66 CeedChk(ierr); 67 if (emode == CEED_EVAL_WEIGHT) { // Skip 68 data->fields.in[i] = NULL; 69 } else { 70 // Get input vector 71 ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChk(ierr); 72 if (vec == CEED_VECTOR_ACTIVE) vec = invec; 73 ierr = CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.in[i]); 74 CeedChk(ierr); 75 } 76 } 77 78 // Output vectors 79 for (CeedInt i = 0; i < numoutputfields; i++) { 80 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 81 CeedChk(ierr); 82 if (emode == CEED_EVAL_WEIGHT) { // Skip 83 data->fields.out[i] = NULL; 84 } else { 85 // Get output vector 86 ierr = CeedOperatorFieldGetVector(opoutputfields[i], &vec); CeedChk(ierr); 87 if (vec == CEED_VECTOR_ACTIVE) vec = outvec; 88 outvecs[i] = vec; 89 // Check for multiple output modes 90 CeedInt index = -1; 91 for (CeedInt j = 0; j < i; j++) { 92 if (vec == outvecs[j]) { 93 index = j; 94 break; 95 } 96 } 97 if (index == -1) { 98 ierr = CeedVectorGetArray(vec, CEED_MEM_DEVICE, &data->fields.out[i]); 99 CeedChk(ierr); 100 } else { 101 data->fields.out[i] = data->fields.out[index]; 102 } 103 } 104 } 105 106 // Copy the context 107 size_t ctxsize; 108 ierr = CeedQFunctionGetContextSize(qf, &ctxsize); CeedChk(ierr); 109 if (ctxsize > 0) { 110 if (!qf_data->d_c) { 111 ierr = cudaMalloc(&qf_data->d_c, ctxsize); CeedChk_Cu(ceed, ierr); 112 } 113 void *ctx; 114 ierr = CeedQFunctionGetInnerContext(qf, &ctx); CeedChk(ierr); 115 ierr = cudaMemcpy(qf_data->d_c, ctx, ctxsize, cudaMemcpyHostToDevice); 116 CeedChk_Cu(ceed, ierr); 117 } 118 119 // Apply operator 120 void *opargs[] = {(void *) &nelem, &qf_data->d_c, &data->indices, 121 &data->fields, &data->B, &data->G, &data->W 122 }; 123 const CeedInt dim = data->dim; 124 const CeedInt Q1d = data->Q1d; 125 if (dim==1) { 126 const CeedInt elemsPerBlock = 32; 127 CeedInt grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem) 128 ? 1 : 0 ); 129 CeedInt sharedMem = elemsPerBlock*Q1d*sizeof(CeedScalar); 130 ierr = CeedRunKernelDimSharedCuda(ceed, data->op, grid, Q1d, 1, elemsPerBlock, 131 sharedMem, opargs); 132 } else if (dim==2) { 133 const CeedInt elemsPerBlock = Q1d<4? 16 : 2; 134 CeedInt grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem) 135 ? 1 : 0 ); 136 CeedInt sharedMem = elemsPerBlock*Q1d*Q1d*sizeof(CeedScalar); 137 ierr = CeedRunKernelDimSharedCuda(ceed, data->op, grid, Q1d, Q1d, 138 elemsPerBlock, sharedMem, opargs); 139 } else if (dim==3) { 140 const CeedInt elemsPerBlock = Q1d<6? 4 : (Q1d<8? 2 : 1); 141 CeedInt grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem) 142 ? 1 : 0 ); 143 CeedInt sharedMem = elemsPerBlock*Q1d*Q1d*sizeof(CeedScalar); 144 ierr = CeedRunKernelDimSharedCuda(ceed, data->op, grid, Q1d, Q1d, 145 elemsPerBlock, sharedMem, opargs); 146 } 147 CeedChk(ierr); 148 149 // Restore input arrays 150 for (CeedInt i = 0; i < numinputfields; i++) { 151 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 152 CeedChk(ierr); 153 if (emode == CEED_EVAL_WEIGHT) { // Skip 154 } else { 155 ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChk(ierr); 156 if (vec == CEED_VECTOR_ACTIVE) vec = invec; 157 ierr = CeedVectorRestoreArrayRead(vec, &data->fields.in[i]); 158 CeedChk(ierr); 159 } 160 } 161 162 // Restore output arrays 163 for (CeedInt i = 0; i < numoutputfields; i++) { 164 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 165 CeedChk(ierr); 166 if (emode == CEED_EVAL_WEIGHT) { // Skip 167 } else { 168 ierr = CeedOperatorFieldGetVector(opoutputfields[i], &vec); CeedChk(ierr); 169 if (vec == CEED_VECTOR_ACTIVE) vec = outvec; 170 // Check for multiple output modes 171 CeedInt index = -1; 172 for (CeedInt j = 0; j < i; j++) { 173 if (vec == outvecs[j]) { 174 index = j; 175 break; 176 } 177 } 178 if (index == -1) { 179 ierr = CeedVectorRestoreArray(vec, &data->fields.out[i]); 180 CeedChk(ierr); 181 } 182 } 183 } 184 return 0; 185 } 186 187 //------------------------------------------------------------------------------ 188 // Assemble linear QFunction not supported 189 //------------------------------------------------------------------------------ 190 static int CeedOperatorAssembleLinearQFunction_Cuda(CeedOperator op) { 191 int ierr; 192 Ceed ceed; 193 ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 194 return CeedError(ceed, 1, "Backend does not implement QFunction assembly"); 195 } 196 197 //------------------------------------------------------------------------------ 198 // Assemble linear diagonal not supported 199 //------------------------------------------------------------------------------ 200 static int CeedOperatorAssembleLinearDiagonal_Cuda(CeedOperator op) { 201 int ierr; 202 Ceed ceed; 203 ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 204 return CeedError(ceed, 1, 205 "Backend does not implement Operator diagonal assembly"); 206 } 207 208 //------------------------------------------------------------------------------ 209 // Create FDM element inverse not supported 210 //------------------------------------------------------------------------------ 211 static int CeedOperatorCreateFDMElementInverse_Cuda(CeedOperator op) { 212 int ierr; 213 Ceed ceed; 214 ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 215 return CeedError(ceed, 1, "Backend does not implement FDM inverse creation"); 216 } 217 218 //------------------------------------------------------------------------------ 219 // Create operator 220 //------------------------------------------------------------------------------ 221 int CeedOperatorCreate_Cuda_gen(CeedOperator op) { 222 int ierr; 223 Ceed ceed; 224 ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 225 CeedOperator_Cuda_gen *impl; 226 227 ierr = CeedCalloc(1, &impl); CeedChk(ierr); 228 ierr = CeedOperatorSetData(op, (void *)&impl); CeedChk(ierr); 229 230 ierr = CeedSetBackendFunction(ceed, "Operator", op, "AssembleLinearQFunction", 231 CeedOperatorAssembleLinearQFunction_Cuda); 232 CeedChk(ierr); 233 ierr = CeedSetBackendFunction(ceed, "Operator", op, "AssembleLinearDiagonal", 234 CeedOperatorAssembleLinearDiagonal_Cuda); 235 CeedChk(ierr); 236 ierr = CeedSetBackendFunction(ceed, "Operator", op, "CreateFDMElementInverse", 237 CeedOperatorCreateFDMElementInverse_Cuda); 238 CeedChk(ierr); 239 ierr = CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", 240 CeedOperatorApplyAdd_Cuda_gen); CeedChk(ierr); 241 ierr = CeedSetBackendFunction(ceed, "Operator", op, "Destroy", 242 CeedOperatorDestroy_Cuda_gen); CeedChk(ierr); 243 return 0; 244 } 245 //------------------------------------------------------------------------------ 246