1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 3 // All Rights reserved. See files LICENSE and NOTICE for details. 4 // 5 // This file is part of CEED, a collection of benchmarks, miniapps, software 6 // libraries and APIs for efficient high-order finite element and spectral 7 // element discretizations for exascale applications. For more information and 8 // source code availability see http://github.com/ceed. 9 // 10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11 // a collaborative effort of two U.S. Department of Energy organizations (Office 12 // of Science and the National Nuclear Security Administration) responsible for 13 // the planning and preparation of a capable exascale ecosystem, including 14 // software, applications, hardware, advanced system engineering and early 15 // testbed platforms, in support of the nation's exascale computing imperative. 16 17 #include <ceed-backend.h> 18 #include "ceed-cuda-gen.h" 19 #include "ceed-cuda-gen-operator-build.h" 20 #include "../cuda/ceed-cuda.h" 21 22 static int CeedOperatorDestroy_Cuda_gen(CeedOperator op) { 23 int ierr; 24 Ceed ceed; 25 ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 26 CeedOperator_Cuda_gen *impl; 27 ierr = CeedOperatorGetData(op, (void *)&impl); CeedChk(ierr); 28 ierr = CeedFree(&impl); CeedChk(ierr); 29 return 0; 30 } 31 32 static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector invec, 33 CeedVector outvec, CeedRequest *request) { 34 int ierr; 35 Ceed ceed; 36 ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 37 CeedOperator_Cuda_gen *data; 38 ierr = CeedOperatorGetData(op, (void *)&data); CeedChk(ierr); 39 CeedQFunction qf; 40 CeedQFunction_Cuda_gen *qf_data; 41 ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr); 42 ierr = CeedQFunctionGetData(qf, (void **)&qf_data); CeedChk(ierr); 43 CeedInt nelem, numinputfields, numoutputfields; 44 ierr = CeedOperatorGetNumElements(op, &nelem); CeedChk(ierr); 45 ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields); 46 CeedChk(ierr); 47 CeedOperatorField *opinputfields, *opoutputfields; 48 ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields); 49 CeedChk(ierr); 50 CeedQFunctionField *qfinputfields, *qfoutputfields; 51 ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields); 52 CeedChk(ierr); 53 CeedEvalMode emode; 54 CeedVector vec; 55 56 //Creation of the operator 57 ierr = CeedCudaGenOperatorBuild(op); CeedChk(ierr); 58 59 // Input vectors 60 for (CeedInt i = 0; i < numinputfields; i++) { 61 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 62 CeedChk(ierr); 63 if (emode == CEED_EVAL_WEIGHT) { // Skip 64 data->fields.in[i] = NULL; 65 } else { 66 // Get input vector 67 ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChk(ierr); 68 if (vec == CEED_VECTOR_ACTIVE) vec = invec; 69 ierr = CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.in[i]); 70 CeedChk(ierr); 71 } 72 } 73 74 // Output vectors 75 for (CeedInt i = 0; i < numoutputfields; i++) { 76 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 77 CeedChk(ierr); 78 if (emode == CEED_EVAL_WEIGHT) { // Skip 79 data->fields.out[i] = NULL; 80 } else { 81 // Get output vector 82 ierr = CeedOperatorFieldGetVector(opoutputfields[i], &vec); CeedChk(ierr); 83 if (vec == CEED_VECTOR_ACTIVE) vec = outvec; 84 ierr = CeedVectorGetArray(vec, CEED_MEM_DEVICE, &data->fields.out[i]); 85 CeedChk(ierr); 86 } 87 } 88 89 // Copy the context 90 size_t ctxsize; 91 ierr = CeedQFunctionGetContextSize(qf, &ctxsize); CeedChk(ierr); 92 if (ctxsize > 0) { 93 if (!qf_data->d_c) { 94 ierr = cudaMalloc(&qf_data->d_c, ctxsize); CeedChk_Cu(ceed, ierr); 95 } 96 void *ctx; 97 ierr = CeedQFunctionGetInnerContext(qf, &ctx); CeedChk(ierr); 98 ierr = cudaMemcpy(qf_data->d_c, ctx, ctxsize, cudaMemcpyHostToDevice); 99 CeedChk_Cu(ceed, ierr); 100 } 101 102 // Apply operator 103 void *opargs[] = {(void *) &nelem, &qf_data->d_c, &data->indices, 104 &data->fields, &data->B, &data->G, &data->W 105 }; 106 const CeedInt dim = data->dim; 107 const CeedInt Q1d = data->Q1d; 108 if (dim==1) { 109 const CeedInt elemsPerBlock = 32; 110 CeedInt grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem) 111 ? 1 : 0 ); 112 CeedInt sharedMem = elemsPerBlock*Q1d*sizeof(CeedScalar); 113 ierr = CeedRunKernelDimSharedCuda(ceed, data->op, grid, Q1d, 1, elemsPerBlock, 114 sharedMem, opargs); 115 } else if (dim==2) { 116 const CeedInt elemsPerBlock = Q1d<4? 16 : 2; 117 CeedInt grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem) 118 ? 1 : 0 ); 119 CeedInt sharedMem = elemsPerBlock*Q1d*Q1d*sizeof(CeedScalar); 120 ierr = CeedRunKernelDimSharedCuda(ceed, data->op, grid, Q1d, Q1d, 121 elemsPerBlock, sharedMem, opargs); 122 } else if (dim==3) { 123 const CeedInt elemsPerBlock = Q1d<6? 4 : (Q1d<8? 2 : 1); 124 CeedInt grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem) 125 ? 1 : 0 ); 126 CeedInt sharedMem = elemsPerBlock*Q1d*Q1d*sizeof(CeedScalar); 127 ierr = CeedRunKernelDimSharedCuda(ceed, data->op, grid, Q1d, Q1d, 128 elemsPerBlock, sharedMem, opargs); 129 } 130 CeedChk(ierr); 131 132 // Restore input arrays 133 for (CeedInt i = 0; i < numinputfields; i++) { 134 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 135 CeedChk(ierr); 136 if (emode == CEED_EVAL_WEIGHT) { // Skip 137 } else { 138 ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChk(ierr); 139 if (vec == CEED_VECTOR_ACTIVE) vec = invec; 140 ierr = CeedVectorRestoreArrayRead(vec, &data->fields.in[i]); 141 CeedChk(ierr); 142 } 143 } 144 145 // Restore output arrays 146 for (CeedInt i = 0; i < numoutputfields; i++) { 147 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 148 CeedChk(ierr); 149 if (emode == CEED_EVAL_WEIGHT) { // Skip 150 } else { 151 ierr = CeedOperatorFieldGetVector(opoutputfields[i], &vec); CeedChk(ierr); 152 if (vec == CEED_VECTOR_ACTIVE) vec = outvec; 153 ierr = CeedVectorRestoreArray(vec, &data->fields.out[i]); 154 CeedChk(ierr); 155 } 156 } 157 158 return 0; 159 } 160 161 static int CeedOperatorAssembleLinearQFunction_Cuda(CeedOperator op) { 162 int ierr; 163 Ceed ceed; 164 ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 165 return CeedError(ceed, 1, "Backend does not implement QFunction assembly"); 166 } 167 168 int CeedOperatorCreate_Cuda_gen(CeedOperator op) { 169 int ierr; 170 Ceed ceed; 171 ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 172 CeedOperator_Cuda_gen *impl; 173 174 ierr = CeedCalloc(1, &impl); CeedChk(ierr); 175 ierr = CeedOperatorSetData(op, (void *)&impl); 176 177 ierr = CeedSetBackendFunction(ceed, "Operator", op, "AssembleLinearQFunction", 178 CeedOperatorAssembleLinearQFunction_Cuda); 179 CeedChk(ierr); 180 ierr = CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", 181 CeedOperatorApplyAdd_Cuda_gen); CeedChk(ierr); 182 ierr = CeedSetBackendFunction(ceed, "Operator", op, "Destroy", 183 CeedOperatorDestroy_Cuda_gen); CeedChk(ierr); 184 return 0; 185 } 186