1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 3 // All Rights reserved. See files LICENSE and NOTICE for details. 4 // 5 // This file is part of CEED, a collection of benchmarks, miniapps, software 6 // libraries and APIs for efficient high-order finite element and spectral 7 // element discretizations for exascale applications. For more information and 8 // source code availability see http://github.com/ceed. 9 // 10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11 // a collaborative effort of two U.S. Department of Energy organizations (Office 12 // of Science and the National Nuclear Security Administration) responsible for 13 // the planning and preparation of a capable exascale ecosystem, including 14 // software, applications, hardware, advanced system engineering and early 15 // testbed platforms, in support of the nation's exascale computing imperative. 16 17 #include <ceed-backend.h> 18 #include "ceed-cuda-gen.h" 19 #include "ceed-cuda-gen-operator-build.h" 20 #include "../cuda/ceed-cuda.h" 21 22 static int CeedOperatorDestroy_Cuda_gen(CeedOperator op) { 23 int ierr; 24 Ceed ceed; 25 ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 26 CeedOperator_Cuda_gen *impl; 27 ierr = CeedOperatorGetData(op, (void *)&impl); CeedChk(ierr); 28 29 for (int i=0; i<16; i++) { 30 ierr = cudaFree(impl->strides.in[i]); CeedChk_Cu(ceed, ierr); 31 ierr = cudaFree(impl->strides.out[i]); CeedChk_Cu(ceed, ierr); 32 } 33 34 ierr = CeedFree(&impl); CeedChk(ierr); 35 return 0; 36 } 37 38 static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector invec, 39 CeedVector outvec, CeedRequest *request) { 40 int ierr; 41 Ceed ceed; 42 ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 43 CeedOperator_Cuda_gen *data; 44 ierr = CeedOperatorGetData(op, (void *)&data); CeedChk(ierr); 45 CeedQFunction qf; 46 CeedQFunction_Cuda_gen *qf_data; 47 ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr); 48 ierr = CeedQFunctionGetData(qf, (void **)&qf_data); CeedChk(ierr); 49 CeedInt nelem, numinputfields, numoutputfields; 50 ierr = CeedOperatorGetNumElements(op, &nelem); CeedChk(ierr); 51 ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields); 52 CeedChk(ierr); 53 CeedOperatorField *opinputfields, *opoutputfields; 54 ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields); 55 CeedChk(ierr); 56 CeedQFunctionField *qfinputfields, *qfoutputfields; 57 ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields); 58 CeedChk(ierr); 59 CeedEvalMode emode; 60 CeedVector vec; 61 62 //Creation of the operator 63 ierr = CeedCudaGenOperatorBuild(op); CeedChk(ierr); 64 65 // Input vectors 66 for (CeedInt i = 0; i < numinputfields; i++) { 67 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 68 CeedChk(ierr); 69 if (emode == CEED_EVAL_WEIGHT) { // Skip 70 data->fields.in[i] = NULL; 71 } else { 72 // Get input vector 73 ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChk(ierr); 74 if (vec == CEED_VECTOR_ACTIVE) vec = invec; 75 ierr = CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.in[i]); 76 CeedChk(ierr); 77 } 78 } 79 80 // Output vectors 81 for (CeedInt i = 0; i < numoutputfields; i++) { 82 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 83 CeedChk(ierr); 84 if (emode == CEED_EVAL_WEIGHT) { // Skip 85 data->fields.out[i] = NULL; 86 } else { 87 // Get output vector 88 ierr = CeedOperatorFieldGetVector(opoutputfields[i], &vec); CeedChk(ierr); 89 if (vec == CEED_VECTOR_ACTIVE) vec = outvec; 90 ierr = CeedVectorGetArray(vec, CEED_MEM_DEVICE, &data->fields.out[i]); 91 CeedChk(ierr); 92 } 93 } 94 95 // Copy the context 96 size_t ctxsize; 97 ierr = CeedQFunctionGetContextSize(qf, &ctxsize); CeedChk(ierr); 98 if (ctxsize > 0) { 99 if (!qf_data->d_c) { 100 ierr = cudaMalloc(&qf_data->d_c, ctxsize); CeedChk_Cu(ceed, ierr); 101 } 102 void *ctx; 103 ierr = CeedQFunctionGetInnerContext(qf, &ctx); CeedChk(ierr); 104 ierr = cudaMemcpy(qf_data->d_c, ctx, ctxsize, cudaMemcpyHostToDevice); 105 CeedChk_Cu(ceed, ierr); 106 } 107 108 // Apply operator 109 void *opargs[] = {(void *) &nelem, &qf_data->d_c, &data->indices, 110 &data->strides, &data->fields, &data->B, &data->G, &data->W 111 }; 112 const CeedInt dim = data->dim; 113 const CeedInt Q1d = data->Q1d; 114 if (dim==1) { 115 const CeedInt elemsPerBlock = 32; 116 CeedInt grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem) 117 ? 1 : 0 ); 118 CeedInt sharedMem = elemsPerBlock*Q1d*sizeof(CeedScalar); 119 ierr = CeedRunKernelDimSharedCuda(ceed, data->op, grid, Q1d, 1, elemsPerBlock, 120 sharedMem, opargs); 121 } else if (dim==2) { 122 const CeedInt elemsPerBlock = Q1d<4? 16 : 2; 123 CeedInt grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem) 124 ? 1 : 0 ); 125 CeedInt sharedMem = elemsPerBlock*Q1d*Q1d*sizeof(CeedScalar); 126 ierr = CeedRunKernelDimSharedCuda(ceed, data->op, grid, Q1d, Q1d, 127 elemsPerBlock, sharedMem, opargs); 128 } else if (dim==3) { 129 const CeedInt elemsPerBlock = Q1d<6? 4 : (Q1d<8? 2 : 1); 130 CeedInt grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem) 131 ? 1 : 0 ); 132 CeedInt sharedMem = elemsPerBlock*Q1d*Q1d*sizeof(CeedScalar); 133 ierr = CeedRunKernelDimSharedCuda(ceed, data->op, grid, Q1d, Q1d, 134 elemsPerBlock, sharedMem, opargs); 135 } 136 CeedChk(ierr); 137 138 // Restore input arrays 139 for (CeedInt i = 0; i < numinputfields; i++) { 140 ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 141 CeedChk(ierr); 142 if (emode == CEED_EVAL_WEIGHT) { // Skip 143 } else { 144 ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChk(ierr); 145 if (vec == CEED_VECTOR_ACTIVE) vec = invec; 146 ierr = CeedVectorRestoreArrayRead(vec, &data->fields.in[i]); 147 CeedChk(ierr); 148 } 149 } 150 151 // Restore output arrays 152 for (CeedInt i = 0; i < numoutputfields; i++) { 153 ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 154 CeedChk(ierr); 155 if (emode == CEED_EVAL_WEIGHT) { // Skip 156 } else { 157 ierr = CeedOperatorFieldGetVector(opoutputfields[i], &vec); CeedChk(ierr); 158 if (vec == CEED_VECTOR_ACTIVE) vec = outvec; 159 ierr = CeedVectorRestoreArray(vec, &data->fields.out[i]); 160 CeedChk(ierr); 161 } 162 } 163 164 return 0; 165 } 166 167 static int CeedOperatorAssembleLinearQFunction_Cuda(CeedOperator op) { 168 int ierr; 169 Ceed ceed; 170 ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 171 return CeedError(ceed, 1, "Backend does not implement QFunction assembly"); 172 } 173 174 int CeedOperatorCreate_Cuda_gen(CeedOperator op) { 175 int ierr; 176 Ceed ceed; 177 ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 178 CeedOperator_Cuda_gen *impl; 179 180 ierr = CeedCalloc(1, &impl); CeedChk(ierr); 181 ierr = CeedOperatorSetData(op, (void *)&impl); 182 183 ierr = CeedSetBackendFunction(ceed, "Operator", op, "AssembleLinearQFunction", 184 CeedOperatorAssembleLinearQFunction_Cuda); 185 CeedChk(ierr); 186 ierr = CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", 187 CeedOperatorApplyAdd_Cuda_gen); CeedChk(ierr); 188 ierr = CeedSetBackendFunction(ceed, "Operator", op, "Destroy", 189 CeedOperatorDestroy_Cuda_gen); CeedChk(ierr); 190 return 0; 191 } 192