1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 3 // All Rights reserved. See files LICENSE and NOTICE for details. 4 // 5 // This file is part of CEED, a collection of benchmarks, miniapps, software 6 // libraries and APIs for efficient high-order finite element and spectral 7 // element discretizations for exascale applications. For more information and 8 // source code availability see http://github.com/ceed. 9 // 10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11 // a collaborative effort of two U.S. Department of Energy organizations (Office 12 // of Science and the National Nuclear Security Administration) responsible for 13 // the planning and preparation of a capable exascale ecosystem, including 14 // software, applications, hardware, advanced system engineering and early 15 // testbed platforms, in support of the nation's exascale computing imperative. 16 17 #include <ceed-backend.h> 18 #include "ceed-cuda-gen.h" 19 #include "ceed-cuda-gen-operator-build.h" 20 #include "../cuda/ceed-cuda.h" 21 22 static int CeedOperatorDestroy_Cuda_gen(CeedOperator op) { 23 int ierr; 24 CeedOperator_Cuda_gen *impl; 25 ierr = CeedOperatorGetData(op, (void *)&impl); CeedChk(ierr); 26 ierr = CeedFree(&impl); CeedChk(ierr); 27 return 0; 28 } 29 30 static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector invec, 31 CeedVector outvec, CeedRequest *request) { 32 int ierr; 33 Ceed ceed; 34 ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 35 CeedOperator_Cuda_gen *data; 36 ierr = CeedOperatorGetData(op, (void *)&data); CeedChk(ierr); 37 CeedQFunction qf; 38 CeedQFunction_Cuda_gen *qf_data; 39 ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr); 40 ierr = CeedQFunctionGetData(qf, (void **)&qf_data); CeedChk(ierr); 41 CeedInt nelem, numinputfields, numoutputfields; 42 ierr = CeedOperatorGetNumElements(op, &nelem); CeedChk(ierr); 43 ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields); 44 CeedChk(ierr); 45 CeedOperatorField *opinputfields, *opoutputfields; 46 ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields); 47 CeedChk(ierr); 48 CeedQFunctionField *qfinputfields, *qfoutputfields; 49 ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields); 50 CeedChk(ierr); 51 CeedEvalMode emode; 52 CeedVector vec, outvecs[16] = {}; 53 54 //Creation of the operator 55 ierr = CeedCudaGenOperatorBuild(op); CeedChk(ierr); 56 57 // Input vectors 58 for (CeedInt i = 0; i < numinputfields; i++) { 59 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 60 CeedChk(ierr); 61 if (emode == CEED_EVAL_WEIGHT) { // Skip 62 data->fields.in[i] = NULL; 63 } else { 64 // Get input vector 65 ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChk(ierr); 66 if (vec == CEED_VECTOR_ACTIVE) vec = invec; 67 ierr = CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.in[i]); 68 CeedChk(ierr); 69 } 70 } 71 72 // Output vectors 73 for (CeedInt i = 0; i < numoutputfields; i++) { 74 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 75 CeedChk(ierr); 76 if (emode == CEED_EVAL_WEIGHT) { // Skip 77 data->fields.out[i] = NULL; 78 } else { 79 // Get output vector 80 ierr = CeedOperatorFieldGetVector(opoutputfields[i], &vec); CeedChk(ierr); 81 if (vec == CEED_VECTOR_ACTIVE) vec = outvec; 82 outvecs[i] = vec; 83 // Check for multiple output modes 84 CeedInt index = -1; 85 for (CeedInt j = 0; j < i; j++) { 86 if (vec == outvecs[j]) { 87 index = j; 88 break; 89 } 90 } 91 if (index == -1) { 92 ierr = CeedVectorGetArray(vec, CEED_MEM_DEVICE, &data->fields.out[i]); 93 CeedChk(ierr); 94 } else { 95 data->fields.out[i] = data->fields.out[index]; 96 } 97 } 98 } 99 100 // Copy the context 101 size_t ctxsize; 102 ierr = CeedQFunctionGetContextSize(qf, &ctxsize); CeedChk(ierr); 103 if (ctxsize > 0) { 104 if (!qf_data->d_c) { 105 ierr = cudaMalloc(&qf_data->d_c, ctxsize); CeedChk_Cu(ceed, ierr); 106 } 107 void *ctx; 108 ierr = CeedQFunctionGetInnerContext(qf, &ctx); CeedChk(ierr); 109 ierr = cudaMemcpy(qf_data->d_c, ctx, ctxsize, cudaMemcpyHostToDevice); 110 CeedChk_Cu(ceed, ierr); 111 } 112 113 // Apply operator 114 void *opargs[] = {(void *) &nelem, &qf_data->d_c, &data->indices, 115 &data->fields, &data->B, &data->G, &data->W 116 }; 117 const CeedInt dim = data->dim; 118 const CeedInt Q1d = data->Q1d; 119 if (dim==1) { 120 const CeedInt elemsPerBlock = 32; 121 CeedInt grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem) 122 ? 1 : 0 ); 123 CeedInt sharedMem = elemsPerBlock*Q1d*sizeof(CeedScalar); 124 ierr = CeedRunKernelDimSharedCuda(ceed, data->op, grid, Q1d, 1, elemsPerBlock, 125 sharedMem, opargs); 126 } else if (dim==2) { 127 const CeedInt elemsPerBlock = Q1d<4? 16 : 2; 128 CeedInt grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem) 129 ? 1 : 0 ); 130 CeedInt sharedMem = elemsPerBlock*Q1d*Q1d*sizeof(CeedScalar); 131 ierr = CeedRunKernelDimSharedCuda(ceed, data->op, grid, Q1d, Q1d, 132 elemsPerBlock, sharedMem, opargs); 133 } else if (dim==3) { 134 const CeedInt elemsPerBlock = Q1d<6? 4 : (Q1d<8? 2 : 1); 135 CeedInt grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem) 136 ? 1 : 0 ); 137 CeedInt sharedMem = elemsPerBlock*Q1d*Q1d*sizeof(CeedScalar); 138 ierr = CeedRunKernelDimSharedCuda(ceed, data->op, grid, Q1d, Q1d, 139 elemsPerBlock, sharedMem, opargs); 140 } 141 CeedChk(ierr); 142 143 // Restore input arrays 144 for (CeedInt i = 0; i < numinputfields; i++) { 145 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 146 CeedChk(ierr); 147 if (emode == CEED_EVAL_WEIGHT) { // Skip 148 } else { 149 ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChk(ierr); 150 if (vec == CEED_VECTOR_ACTIVE) vec = invec; 151 ierr = CeedVectorRestoreArrayRead(vec, &data->fields.in[i]); 152 CeedChk(ierr); 153 } 154 } 155 156 // Restore output arrays 157 for (CeedInt i = 0; i < numoutputfields; i++) { 158 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 159 CeedChk(ierr); 160 if (emode == CEED_EVAL_WEIGHT) { // Skip 161 } else { 162 ierr = CeedOperatorFieldGetVector(opoutputfields[i], &vec); CeedChk(ierr); 163 if (vec == CEED_VECTOR_ACTIVE) vec = outvec; 164 // Check for multiple output modes 165 CeedInt index = -1; 166 for (CeedInt j = 0; j < i; j++) { 167 if (vec == outvecs[j]) { 168 index = j; 169 break; 170 } 171 } 172 if (index == -1) { 173 ierr = CeedVectorRestoreArray(vec, &data->fields.out[i]); 174 CeedChk(ierr); 175 } 176 } 177 } 178 179 return 0; 180 } 181 182 static int CeedOperatorAssembleLinearQFunction_Cuda(CeedOperator op) { 183 int ierr; 184 Ceed ceed; 185 ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 186 return CeedError(ceed, 1, "Backend does not implement QFunction assembly"); 187 } 188 189 static int CeedOperatorAssembleLinearDiagonal_Cuda(CeedOperator op) { 190 int ierr; 191 Ceed ceed; 192 ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 193 return CeedError(ceed, 1, 194 "Backend does not implement Operator diagonal assembly"); 195 } 196 197 static int CeedOperatorCreateFDMElementInverse_Cuda(CeedOperator op) { 198 int ierr; 199 Ceed ceed; 200 ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 201 return CeedError(ceed, 1, "Backend does not implement FDM inverse creation"); 202 } 203 204 int CeedOperatorCreate_Cuda_gen(CeedOperator op) { 205 int ierr; 206 Ceed ceed; 207 ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 208 CeedOperator_Cuda_gen *impl; 209 210 ierr = CeedCalloc(1, &impl); CeedChk(ierr); 211 ierr = CeedOperatorSetData(op, (void *)&impl); 212 213 ierr = CeedSetBackendFunction(ceed, "Operator", op, "AssembleLinearQFunction", 214 CeedOperatorAssembleLinearQFunction_Cuda); 215 CeedChk(ierr); 216 ierr = CeedSetBackendFunction(ceed, "Operator", op, "AssembleLinearDiagonal", 217 CeedOperatorAssembleLinearDiagonal_Cuda); 218 CeedChk(ierr); 219 ierr = CeedSetBackendFunction(ceed, "Operator", op, "CreateFDMElementInverse", 220 CeedOperatorCreateFDMElementInverse_Cuda); 221 CeedChk(ierr); 222 ierr = CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", 223 CeedOperatorApplyAdd_Cuda_gen); CeedChk(ierr); 224 ierr = CeedSetBackendFunction(ceed, "Operator", op, "Destroy", 225 CeedOperatorDestroy_Cuda_gen); CeedChk(ierr); 226 return 0; 227 } 228