1 #include <petsc/private/petscfeimpl.h> /*I "petscfe.h" I*/ 2 3 #if defined(PETSC_HAVE_OPENCL) 4 5 static PetscErrorCode PetscFEDestroy_OpenCL(PetscFE fem) 6 { 7 PetscFE_OpenCL *ocl = (PetscFE_OpenCL *)fem->data; 8 9 PetscFunctionBegin; 10 PetscCall(clReleaseCommandQueue(ocl->queue_id)); 11 ocl->queue_id = 0; 12 PetscCall(clReleaseContext(ocl->ctx_id)); 13 ocl->ctx_id = 0; 14 PetscCall(PetscFree(ocl)); 15 PetscFunctionReturn(PETSC_SUCCESS); 16 } 17 18 #define PetscCallSTR(err) \ 19 do { \ 20 PetscCall(err); \ 21 string_tail += count; \ 22 PetscCheck(string_tail != end_of_buffer, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Buffer overflow"); \ 23 } while (0) 24 enum { 25 LAPLACIAN = 0, 26 ELASTICITY = 1 27 }; 28 29 /* NOTE: This is now broken for vector problems. Must redo loops to respect vector basis elements */ 30 /* dim Number of spatial dimensions: 2 */ 31 /* N_b Number of basis functions: generated */ 32 /* N_{bt} Number of total basis functions: N_b * N_{comp} */ 33 /* N_q Number of quadrature points: generated */ 34 /* N_{bs} Number of block cells LCM(N_b, N_q) */ 35 /* N_{bst} Number of block cell components LCM(N_{bt}, N_q) */ 36 /* N_{bl} Number of concurrent blocks generated */ 37 /* N_t Number of threads: N_{bl} * N_{bs} */ 38 /* N_{cbc} Number of concurrent basis cells: N_{bl} * N_q */ 39 /* N_{cqc} Number of concurrent quadrature cells: N_{bl} * N_b */ 40 /* N_{sbc} Number of serial basis cells: N_{bs} / N_q */ 41 /* N_{sqc} Number of serial quadrature cells: N_{bs} / N_b */ 42 /* N_{cb} Number of serial cell batches: input */ 43 /* N_c Number of total cells: N_{cb}*N_{t}/N_{comp} */ 44 static PetscErrorCode PetscFEOpenCLGenerateIntegrationCode(PetscFE fem, char **string_buffer, PetscInt buffer_length, PetscBool useAux, PetscInt N_bl) 45 { 46 PetscFE_OpenCL *ocl = (PetscFE_OpenCL *)fem->data; 47 PetscQuadrature q; 48 char *string_tail = *string_buffer; 49 char *end_of_buffer = *string_buffer + buffer_length; 50 char float_str[] = "float", double_str[] = "double"; 51 char *numeric_str = &(float_str[0]); 52 PetscInt op = ocl->op; 53 PetscBool useField = PETSC_FALSE; 54 PetscBool useFieldDer = PETSC_TRUE; 55 PetscBool useFieldAux = useAux; 56 PetscBool useFieldDerAux = PETSC_FALSE; 57 PetscBool useF0 = PETSC_TRUE; 58 PetscBool useF1 = PETSC_TRUE; 59 const PetscReal *points, *weights; 60 PetscTabulation T; 61 PetscInt dim, qNc, N_b, N_c, N_q, N_t, p, d, b, c; 62 size_t count; 63 64 PetscFunctionBegin; 65 PetscCall(PetscFEGetSpatialDimension(fem, &dim)); 66 PetscCall(PetscFEGetDimension(fem, &N_b)); 67 PetscCall(PetscFEGetNumComponents(fem, &N_c)); 68 PetscCall(PetscFEGetQuadrature(fem, &q)); 69 PetscCall(PetscQuadratureGetData(q, NULL, &qNc, &N_q, &points, &weights)); 70 PetscCheck(qNc == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only supports scalar quadrature, not %" PetscInt_FMT " components", qNc); 71 N_t = N_b * N_c * N_q * N_bl; 72 /* Enable device extension for double precision */ 73 if (ocl->realType == PETSC_DOUBLE) { 74 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 75 "#if defined(cl_khr_fp64)\n" 76 "# pragma OPENCL EXTENSION cl_khr_fp64: enable\n" 77 "#elif defined(cl_amd_fp64)\n" 78 "# pragma OPENCL EXTENSION cl_amd_fp64: enable\n" 79 "#endif\n", 80 &count)); 81 numeric_str = &(double_str[0]); 82 } 83 /* Kernel API */ 84 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 85 "\n" 86 "__kernel void integrateElementQuadrature(int N_cb, __global %s *coefficients, __global %s *coefficientsAux, __global %s *jacobianInverses, __global %s *jacobianDeterminants, __global %s *elemVec)\n" 87 "{\n", 88 &count, numeric_str, numeric_str, numeric_str, numeric_str, numeric_str)); 89 /* Quadrature */ 90 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 91 " /* Quadrature points\n" 92 " - (x1,y1,x2,y2,...) */\n" 93 " const %s points[%d] = {\n", 94 &count, numeric_str, N_q * dim)); 95 for (p = 0; p < N_q; ++p) { 96 for (d = 0; d < dim; ++d) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g,\n", &count, points[p * dim + d])); 97 } 98 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count)); 99 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 100 " /* Quadrature weights\n" 101 " - (v1,v2,...) */\n" 102 " const %s weights[%d] = {\n", 103 &count, numeric_str, N_q)); 104 for (p = 0; p < N_q; ++p) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g,\n", &count, weights[p])); 105 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count)); 106 /* Basis Functions */ 107 PetscCall(PetscFEGetCellTabulation(fem, 1, &T)); 108 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 109 " /* Nodal basis function evaluations\n" 110 " - basis component is fastest varying, the basis function, then point */\n" 111 " const %s Basis[%d] = {\n", 112 &count, numeric_str, N_q * N_b * N_c)); 113 for (p = 0; p < N_q; ++p) { 114 for (b = 0; b < N_b; ++b) { 115 for (c = 0; c < N_c; ++c) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g,\n", &count, T->T[0][(p * N_b + b) * N_c + c])); 116 } 117 } 118 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count)); 119 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 120 "\n" 121 " /* Nodal basis function derivative evaluations,\n" 122 " - derivative direction is fastest varying, then basis component, then basis function, then point */\n" 123 " const %s%d BasisDerivatives[%d] = {\n", 124 &count, numeric_str, dim, N_q * N_b * N_c)); 125 for (p = 0; p < N_q; ++p) { 126 for (b = 0; b < N_b; ++b) { 127 for (c = 0; c < N_c; ++c) { 128 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "(%s%d)(", &count, numeric_str, dim)); 129 for (d = 0; d < dim; ++d) { 130 if (d > 0) { 131 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, ", %g", &count, T->T[1][((p * N_b + b) * dim + d) * N_c + c])); 132 } else { 133 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g", &count, T->T[1][((p * N_b + b) * dim + d) * N_c + c])); 134 } 135 } 136 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "),\n", &count)); 137 } 138 } 139 } 140 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count)); 141 /* Sizes */ 142 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 143 " const int dim = %d; // The spatial dimension\n" 144 " const int N_bl = %d; // The number of concurrent blocks\n" 145 " const int N_b = %d; // The number of basis functions\n" 146 " const int N_comp = %d; // The number of basis function components\n" 147 " const int N_bt = N_b*N_comp; // The total number of scalar basis functions\n" 148 " const int N_q = %d; // The number of quadrature points\n" 149 " const int N_bst = N_bt*N_q; // The block size, LCM(N_b*N_comp, N_q), Notice that a block is not processed simultaneously\n" 150 " const int N_t = N_bst*N_bl; // The number of threads, N_bst * N_bl\n" 151 " const int N_bc = N_t/N_comp; // The number of cells per batch (N_b*N_q*N_bl)\n" 152 " const int N_sbc = N_bst / (N_q * N_comp);\n" 153 " const int N_sqc = N_bst / N_bt;\n" 154 " /*const int N_c = N_cb * N_bc;*/\n" 155 "\n" 156 " /* Calculated indices */\n" 157 " /*const int tidx = get_local_id(0) + get_local_size(0)*get_local_id(1);*/\n" 158 " const int tidx = get_local_id(0);\n" 159 " const int blidx = tidx / N_bst; // Block number for this thread\n" 160 " const int bidx = tidx %% N_bt; // Basis function mapped to this thread\n" 161 " const int cidx = tidx %% N_comp; // Basis component mapped to this thread\n" 162 " const int qidx = tidx %% N_q; // Quadrature point mapped to this thread\n" 163 " const int blbidx = tidx %% N_q + blidx*N_q; // Cell mapped to this thread in the basis phase\n" 164 " const int blqidx = tidx %% N_b + blidx*N_b; // Cell mapped to this thread in the quadrature phase\n" 165 " const int gidx = get_group_id(1)*get_num_groups(0) + get_group_id(0);\n" 166 " const int Goffset = gidx*N_cb*N_bc;\n", 167 &count, dim, N_bl, N_b, N_c, N_q)); 168 /* Local memory */ 169 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 170 "\n" 171 " /* Quadrature data */\n" 172 " %s w; // $w_q$, Quadrature weight at $x_q$\n" 173 " __local %s phi_i[%d]; //[N_bt*N_q]; // $\\phi_i(x_q)$, Value of the basis function $i$ at $x_q$\n" 174 " __local %s%d phiDer_i[%d]; //[N_bt*N_q]; // $\\frac{\\partial\\phi_i(x_q)}{\\partial x_d}$, Value of the derivative of basis function $i$ in direction $x_d$ at $x_q$\n" 175 " /* Geometric data */\n" 176 " __local %s detJ[%d]; //[N_t]; // $|J(x_q)|$, Jacobian determinant at $x_q$\n" 177 " __local %s invJ[%d];//[N_t*dim*dim]; // $J^{-1}(x_q)$, Jacobian inverse at $x_q$\n", 178 &count, numeric_str, numeric_str, N_b * N_c * N_q, numeric_str, dim, N_b * N_c * N_q, numeric_str, N_t, numeric_str, N_t * dim * dim)); 179 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 180 " /* FEM data */\n" 181 " __local %s u_i[%d]; //[N_t*N_bt]; // Coefficients $u_i$ of the field $u|_{\\mathcal{T}} = \\sum_i u_i \\phi_i$\n", 182 &count, numeric_str, N_t * N_b * N_c)); 183 if (useAux) { 184 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " __local %s a_i[%d]; //[N_t]; // Coefficients $a_i$ of the auxiliary field $a|_{\\mathcal{T}} = \\sum_i a_i \\phi^R_i$\n", &count, numeric_str, N_t)); 185 } 186 if (useF0) { 187 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 188 " /* Intermediate calculations */\n" 189 " __local %s f_0[%d]; //[N_t*N_sqc]; // $f_0(u(x_q), \\nabla u(x_q)) |J(x_q)| w_q$\n", 190 &count, numeric_str, N_t * N_q)); 191 } 192 if (useF1) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " __local %s%d f_1[%d]; //[N_t*N_sqc]; // $f_1(u(x_q), \\nabla u(x_q)) |J(x_q)| w_q$\n", &count, numeric_str, dim, N_t * N_q)); 193 /* TODO: If using elasticity, put in mu/lambda coefficients */ 194 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 195 " /* Output data */\n" 196 " %s e_i; // Coefficient $e_i$ of the residual\n\n", 197 &count, numeric_str)); 198 /* One-time loads */ 199 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 200 " /* These should be generated inline */\n" 201 " /* Load quadrature weights */\n" 202 " w = weights[qidx];\n" 203 " /* Load basis tabulation \\phi_i for this cell */\n" 204 " if (tidx < N_bt*N_q) {\n" 205 " phi_i[tidx] = Basis[tidx];\n" 206 " phiDer_i[tidx] = BasisDerivatives[tidx];\n" 207 " }\n\n", 208 &count)); 209 /* Batch loads */ 210 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 211 " for (int batch = 0; batch < N_cb; ++batch) {\n" 212 " /* Load geometry */\n" 213 " detJ[tidx] = jacobianDeterminants[Goffset+batch*N_bc+tidx];\n" 214 " for (int n = 0; n < dim*dim; ++n) {\n" 215 " const int offset = n*N_t;\n" 216 " invJ[offset+tidx] = jacobianInverses[(Goffset+batch*N_bc)*dim*dim+offset+tidx];\n" 217 " }\n" 218 " /* Load coefficients u_i for this cell */\n" 219 " for (int n = 0; n < N_bt; ++n) {\n" 220 " const int offset = n*N_t;\n" 221 " u_i[offset+tidx] = coefficients[(Goffset*N_bt)+batch*N_t*N_b+offset+tidx];\n" 222 " }\n", 223 &count)); 224 if (useAux) { 225 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 226 " /* Load coefficients a_i for this cell */\n" 227 " /* TODO: This should not be N_t here, it should be N_bc*N_comp_aux */\n" 228 " a_i[tidx] = coefficientsAux[Goffset+batch*N_t+tidx];\n", 229 &count)); 230 } 231 /* Quadrature phase */ 232 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 233 " barrier(CLK_LOCAL_MEM_FENCE);\n" 234 "\n" 235 " /* Map coefficients to values at quadrature points */\n" 236 " for (int c = 0; c < N_sqc; ++c) {\n" 237 " const int cell = c*N_bl*N_b + blqidx;\n" 238 " const int fidx = (cell*N_q + qidx)*N_comp + cidx;\n", 239 &count)); 240 if (useField) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " %s u[%d]; //[N_comp]; // $u(x_q)$, Value of the field at $x_q$\n", &count, numeric_str, N_c)); 241 if (useFieldDer) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " %s%d gradU[%d]; //[N_comp]; // $\\nabla u(x_q)$, Value of the field gradient at $x_q$\n", &count, numeric_str, dim, N_c)); 242 if (useFieldAux) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " %s a[%d]; //[1]; // $a(x_q)$, Value of the auxiliary fields at $x_q$\n", &count, numeric_str, 1)); 243 if (useFieldDerAux) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " %s%d gradA[%d]; //[1]; // $\\nabla a(x_q)$, Value of the auxiliary field gradient at $x_q$\n", &count, numeric_str, dim, 1)); 244 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 245 "\n" 246 " for (int comp = 0; comp < N_comp; ++comp) {\n", 247 &count)); 248 if (useField) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " u[comp] = 0.0;\n", &count)); 249 if (useFieldDer) { 250 switch (dim) { 251 case 1: 252 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradU[comp].x = 0.0;\n", &count)); 253 break; 254 case 2: 255 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradU[comp].x = 0.0; gradU[comp].y = 0.0;\n", &count)); 256 break; 257 case 3: 258 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradU[comp].x = 0.0; gradU[comp].y = 0.0; gradU[comp].z = 0.0;\n", &count)); 259 break; 260 } 261 } 262 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " }\n", &count)); 263 if (useFieldAux) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " a[0] = 0.0;\n", &count)); 264 if (useFieldDerAux) { 265 switch (dim) { 266 case 1: 267 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradA[0].x = 0.0;\n", &count)); 268 break; 269 case 2: 270 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradA[0].x = 0.0; gradA[0].y = 0.0;\n", &count)); 271 break; 272 case 3: 273 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradA[0].x = 0.0; gradA[0].y = 0.0; gradA[0].z = 0.0;\n", &count)); 274 break; 275 } 276 } 277 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 278 " /* Get field and derivatives at this quadrature point */\n" 279 " for (int i = 0; i < N_b; ++i) {\n" 280 " for (int comp = 0; comp < N_comp; ++comp) {\n" 281 " const int b = i*N_comp+comp;\n" 282 " const int pidx = qidx*N_bt + b;\n" 283 " const int uidx = cell*N_bt + b;\n" 284 " %s%d realSpaceDer;\n\n", 285 &count, numeric_str, dim)); 286 if (useField) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " u[comp] += u_i[uidx]*phi_i[pidx];\n", &count)); 287 if (useFieldDer) { 288 switch (dim) { 289 case 2: 290 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 291 " realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y;\n" 292 " gradU[comp].x += u_i[uidx]*realSpaceDer.x;\n" 293 " realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y;\n" 294 " gradU[comp].y += u_i[uidx]*realSpaceDer.y;\n", 295 &count)); 296 break; 297 case 3: 298 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 299 " realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+0]*phiDer_i[pidx].z;\n" 300 " gradU[comp].x += u_i[uidx]*realSpaceDer.x;\n" 301 " realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+1]*phiDer_i[pidx].z;\n" 302 " gradU[comp].y += u_i[uidx]*realSpaceDer.y;\n" 303 " realSpaceDer.z = invJ[cell*dim*dim+0*dim+2]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+2]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+2]*phiDer_i[pidx].z;\n" 304 " gradU[comp].z += u_i[uidx]*realSpaceDer.z;\n", 305 &count)); 306 break; 307 } 308 } 309 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 310 " }\n" 311 " }\n", 312 &count)); 313 if (useFieldAux) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " a[0] += a_i[cell];\n", &count)); 314 /* Calculate residual at quadrature points: Should be generated by an weak form egine */ 315 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " /* Process values at quadrature points */\n", &count)); 316 switch (op) { 317 case LAPLACIAN: 318 if (useF0) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_0[fidx] = 4.0;\n", &count)); 319 if (useF1) { 320 if (useAux) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_1[fidx] = a[0]*gradU[cidx];\n", &count)); 321 else PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_1[fidx] = gradU[cidx];\n", &count)); 322 } 323 break; 324 case ELASTICITY: 325 if (useF0) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_0[fidx] = 4.0;\n", &count)); 326 if (useF1) { 327 switch (dim) { 328 case 2: 329 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 330 " switch (cidx) {\n" 331 " case 0:\n" 332 " f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[0].x + gradU[0].x);\n" 333 " f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[0].y + gradU[1].x);\n" 334 " break;\n" 335 " case 1:\n" 336 " f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[1].x + gradU[0].y);\n" 337 " f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[1].y + gradU[1].y);\n" 338 " }\n", 339 &count)); 340 break; 341 case 3: 342 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 343 " switch (cidx) {\n" 344 " case 0:\n" 345 " f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[0].x + gradU[0].x);\n" 346 " f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[0].y + gradU[1].x);\n" 347 " f_1[fidx].z = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[0].z + gradU[2].x);\n" 348 " break;\n" 349 " case 1:\n" 350 " f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[1].x + gradU[0].y);\n" 351 " f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[1].y + gradU[1].y);\n" 352 " f_1[fidx].z = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[1].y + gradU[2].y);\n" 353 " break;\n" 354 " case 2:\n" 355 " f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[2].x + gradU[0].z);\n" 356 " f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[2].y + gradU[1].z);\n" 357 " f_1[fidx].z = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[2].y + gradU[2].z);\n" 358 " }\n", 359 &count)); 360 break; 361 } 362 } 363 break; 364 default: 365 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "PDE operator %d is not supported", op); 366 } 367 if (useF0) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_0[fidx] *= detJ[cell]*w;\n", &count)); 368 if (useF1) { 369 switch (dim) { 370 case 1: 371 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_1[fidx].x *= detJ[cell]*w;\n", &count)); 372 break; 373 case 2: 374 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_1[fidx].x *= detJ[cell]*w; f_1[fidx].y *= detJ[cell]*w;\n", &count)); 375 break; 376 case 3: 377 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_1[fidx].x *= detJ[cell]*w; f_1[fidx].y *= detJ[cell]*w; f_1[fidx].z *= detJ[cell]*w;\n", &count)); 378 break; 379 } 380 } 381 /* Thread transpose */ 382 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 383 " }\n\n" 384 " /* ==== TRANSPOSE THREADS ==== */\n" 385 " barrier(CLK_LOCAL_MEM_FENCE);\n\n", 386 &count)); 387 /* Basis phase */ 388 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 389 " /* Map values at quadrature points to coefficients */\n" 390 " for (int c = 0; c < N_sbc; ++c) {\n" 391 " const int cell = c*N_bl*N_q + blbidx; /* Cell number in batch */\n" 392 "\n" 393 " e_i = 0.0;\n" 394 " for (int q = 0; q < N_q; ++q) {\n" 395 " const int pidx = q*N_bt + bidx;\n" 396 " const int fidx = (cell*N_q + q)*N_comp + cidx;\n" 397 " %s%d realSpaceDer;\n\n", 398 &count, numeric_str, dim)); 399 400 if (useF0) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " e_i += phi_i[pidx]*f_0[fidx];\n", &count)); 401 if (useF1) { 402 switch (dim) { 403 case 2: 404 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 405 " realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y;\n" 406 " e_i += realSpaceDer.x*f_1[fidx].x;\n" 407 " realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y;\n" 408 " e_i += realSpaceDer.y*f_1[fidx].y;\n", 409 &count)); 410 break; 411 case 3: 412 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 413 " realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+0]*phiDer_i[pidx].z;\n" 414 " e_i += realSpaceDer.x*f_1[fidx].x;\n" 415 " realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+1]*phiDer_i[pidx].z;\n" 416 " e_i += realSpaceDer.y*f_1[fidx].y;\n" 417 " realSpaceDer.z = invJ[cell*dim*dim+0*dim+2]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+2]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+2]*phiDer_i[pidx].z;\n" 418 " e_i += realSpaceDer.z*f_1[fidx].z;\n", 419 &count)); 420 break; 421 } 422 } 423 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 424 " }\n" 425 " /* Write element vector for N_{cbc} cells at a time */\n" 426 " elemVec[(Goffset + batch*N_bc + c*N_bl*N_q)*N_bt + tidx] = e_i;\n" 427 " }\n" 428 " /* ==== Could do one write per batch ==== */\n" 429 " }\n" 430 " return;\n" 431 "}\n", 432 &count)); 433 PetscFunctionReturn(PETSC_SUCCESS); 434 } 435 436 static PetscErrorCode PetscFEOpenCLGetIntegrationKernel(PetscFE fem, PetscBool useAux, cl_program *ocl_prog, cl_kernel *ocl_kernel) 437 { 438 PetscFE_OpenCL *ocl = (PetscFE_OpenCL *)fem->data; 439 PetscInt dim, N_bl; 440 PetscBool flg; 441 char *buffer; 442 size_t len; 443 char errMsg[8192]; 444 cl_int err; 445 446 PetscFunctionBegin; 447 PetscCall(PetscFEGetSpatialDimension(fem, &dim)); 448 PetscCall(PetscMalloc1(8192, &buffer)); 449 PetscCall(PetscFEGetTileSizes(fem, NULL, &N_bl, NULL, NULL)); 450 PetscCall(PetscFEOpenCLGenerateIntegrationCode(fem, &buffer, 8192, useAux, N_bl)); 451 PetscCall(PetscOptionsHasName(((PetscObject)fem)->options, ((PetscObject)fem)->prefix, "-petscfe_opencl_kernel_print", &flg)); 452 if (flg) PetscCall(PetscPrintf(PetscObjectComm((PetscObject)fem), "OpenCL FE Integration Kernel:\n%s\n", buffer)); 453 PetscCall(PetscStrlen(buffer, &len)); 454 *ocl_prog = clCreateProgramWithSource(ocl->ctx_id, 1, (const char **)&buffer, &len, &err); 455 PetscCall(err); 456 err = clBuildProgram(*ocl_prog, 0, NULL, NULL, NULL, NULL); 457 if (err != CL_SUCCESS) { 458 err = clGetProgramBuildInfo(*ocl_prog, ocl->dev_id, CL_PROGRAM_BUILD_LOG, 8192 * sizeof(char), &errMsg, NULL); 459 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Build failed! Log:\n %s", errMsg); 460 } 461 PetscCall(PetscFree(buffer)); 462 *ocl_kernel = clCreateKernel(*ocl_prog, "integrateElementQuadrature", &err); 463 PetscFunctionReturn(PETSC_SUCCESS); 464 } 465 466 static PetscErrorCode PetscFEOpenCLCalculateGrid(PetscFE fem, PetscInt N, PetscInt blockSize, size_t *x, size_t *y, size_t *z) 467 { 468 const PetscInt Nblocks = N / blockSize; 469 470 PetscFunctionBegin; 471 PetscCheck(!(N % blockSize), PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid block size %d for %d elements", blockSize, N); 472 *z = 1; 473 *y = 1; 474 for (*x = (size_t)(PetscSqrtReal(Nblocks) + 0.5); *x > 0; --*x) { 475 *y = Nblocks / *x; 476 if (*x * *y == (size_t)Nblocks) break; 477 } 478 PetscCheck(*x * *y == (size_t)Nblocks, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Could not find partition for %" PetscInt_FMT " with block size %" PetscInt_FMT, N, blockSize); 479 PetscFunctionReturn(PETSC_SUCCESS); 480 } 481 482 static PetscErrorCode PetscFEOpenCLLogResidual(PetscFE fem, PetscLogDouble time, PetscLogDouble flops) 483 { 484 PetscFE_OpenCL *ocl = (PetscFE_OpenCL *)fem->data; 485 PetscStageLog stageLog; 486 PetscEventPerfLog eventLog = NULL; 487 int stage; 488 489 PetscFunctionBegin; 490 PetscCall(PetscLogGetStageLog(&stageLog)); 491 PetscCall(PetscStageLogGetCurrent(stageLog, &stage)); 492 PetscCall(PetscStageLogGetEventPerfLog(stageLog, stage, &eventLog)); 493 /* Log performance info */ 494 eventLog->eventInfo[ocl->residualEvent].count++; 495 eventLog->eventInfo[ocl->residualEvent].time += time; 496 eventLog->eventInfo[ocl->residualEvent].flops += flops; 497 PetscFunctionReturn(PETSC_SUCCESS); 498 } 499 500 static PetscErrorCode PetscFEIntegrateResidual_OpenCL(PetscDS prob, PetscFormKey key, PetscInt Ne, PetscFEGeom *cgeom, const PetscScalar coefficients[], const PetscScalar coefficients_t[], PetscDS probAux, const PetscScalar coefficientsAux[], PetscReal t, PetscScalar elemVec[]) 501 { 502 /* Nbc = batchSize */ 503 PetscFE fem; 504 PetscFE_OpenCL *ocl; 505 PetscPointFunc f0_func; 506 PetscPointFunc f1_func; 507 PetscQuadrature q; 508 PetscInt dim, qNc; 509 PetscInt N_b; /* The number of basis functions */ 510 PetscInt N_comp; /* The number of basis function components */ 511 PetscInt N_bt; /* The total number of scalar basis functions */ 512 PetscInt N_q; /* The number of quadrature points */ 513 PetscInt N_bst; /* The block size, LCM(N_bt, N_q), Notice that a block is not process simultaneously */ 514 PetscInt N_t; /* The number of threads, N_bst * N_bl */ 515 PetscInt N_bl; /* The number of blocks */ 516 PetscInt N_bc; /* The batch size, N_bl*N_q*N_b */ 517 PetscInt N_cb; /* The number of batches */ 518 const PetscInt field = key.field; 519 PetscInt numFlops, f0Flops = 0, f1Flops = 0; 520 PetscBool useAux = probAux ? PETSC_TRUE : PETSC_FALSE; 521 PetscBool useField = PETSC_FALSE; 522 PetscBool useFieldDer = PETSC_TRUE; 523 PetscBool useF0 = PETSC_TRUE; 524 PetscBool useF1 = PETSC_TRUE; 525 /* OpenCL variables */ 526 cl_program ocl_prog; 527 cl_kernel ocl_kernel; 528 cl_event ocl_ev; /* The event for tracking kernel execution */ 529 cl_ulong ns_start; /* Nanoseconds counter on GPU at kernel start */ 530 cl_ulong ns_end; /* Nanoseconds counter on GPU at kernel stop */ 531 cl_mem o_jacobianInverses, o_jacobianDeterminants; 532 cl_mem o_coefficients, o_coefficientsAux, o_elemVec; 533 float *f_coeff = NULL, *f_coeffAux = NULL, *f_invJ = NULL, *f_detJ = NULL; 534 double *d_coeff = NULL, *d_coeffAux = NULL, *d_invJ = NULL, *d_detJ = NULL; 535 PetscReal *r_invJ = NULL, *r_detJ = NULL; 536 void *oclCoeff, *oclCoeffAux, *oclInvJ, *oclDetJ; 537 size_t local_work_size[3], global_work_size[3]; 538 size_t realSize, x, y, z; 539 const PetscReal *points, *weights; 540 int err; 541 542 PetscFunctionBegin; 543 PetscCall(PetscDSGetDiscretization(prob, field, (PetscObject *)&fem)); 544 ocl = (PetscFE_OpenCL *)fem->data; 545 if (!Ne) { 546 PetscCall(PetscFEOpenCLLogResidual(fem, 0.0, 0.0)); 547 PetscFunctionReturn(PETSC_SUCCESS); 548 } 549 PetscCall(PetscFEGetSpatialDimension(fem, &dim)); 550 PetscCall(PetscFEGetQuadrature(fem, &q)); 551 PetscCall(PetscQuadratureGetData(q, NULL, &qNc, &N_q, &points, &weights)); 552 PetscCheck(qNc == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only supports scalar quadrature, not %" PetscInt_FMT " components", qNc); 553 PetscCall(PetscFEGetDimension(fem, &N_b)); 554 PetscCall(PetscFEGetNumComponents(fem, &N_comp)); 555 PetscCall(PetscDSGetResidual(prob, field, &f0_func, &f1_func)); 556 PetscCall(PetscFEGetTileSizes(fem, NULL, &N_bl, &N_bc, &N_cb)); 557 N_bt = N_b * N_comp; 558 N_bst = N_bt * N_q; 559 N_t = N_bst * N_bl; 560 PetscCheck(N_bc * N_comp == N_t, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Number of threads %d should be %d * %d", N_t, N_bc, N_comp); 561 /* Calculate layout */ 562 if (Ne % (N_cb * N_bc)) { /* Remainder cells */ 563 PetscCall(PetscFEIntegrateResidual_Basic(prob, key, Ne, cgeom, coefficients, coefficients_t, probAux, coefficientsAux, t, elemVec)); 564 PetscFunctionReturn(PETSC_SUCCESS); 565 } 566 PetscCall(PetscFEOpenCLCalculateGrid(fem, Ne, N_cb * N_bc, &x, &y, &z)); 567 local_work_size[0] = N_bc * N_comp; 568 local_work_size[1] = 1; 569 local_work_size[2] = 1; 570 global_work_size[0] = x * local_work_size[0]; 571 global_work_size[1] = y * local_work_size[1]; 572 global_work_size[2] = z * local_work_size[2]; 573 PetscCall(PetscInfo(fem, "GPU layout grid(%zu,%zu,%zu) block(%zu,%zu,%zu) with %d batches\n", x, y, z, local_work_size[0], local_work_size[1], local_work_size[2], N_cb)); 574 PetscCall(PetscInfo(fem, " N_t: %d, N_cb: %d\n", N_t, N_cb)); 575 /* Generate code */ 576 if (probAux) { 577 PetscSpace P; 578 PetscInt NfAux, order, f; 579 580 PetscCall(PetscDSGetNumFields(probAux, &NfAux)); 581 for (f = 0; f < NfAux; ++f) { 582 PetscFE feAux; 583 584 PetscCall(PetscDSGetDiscretization(probAux, f, (PetscObject *)&feAux)); 585 PetscCall(PetscFEGetBasisSpace(feAux, &P)); 586 PetscCall(PetscSpaceGetDegree(P, &order, NULL)); 587 PetscCheck(order <= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Can only handle P0 coefficient fields"); 588 } 589 } 590 PetscCall(PetscFEOpenCLGetIntegrationKernel(fem, useAux, &ocl_prog, &ocl_kernel)); 591 /* Create buffers on the device and send data over */ 592 PetscCall(PetscDataTypeGetSize(ocl->realType, &realSize)); 593 PetscCheck(cgeom->numPoints <= 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only support affine geometry for OpenCL integration right now"); 594 if (sizeof(PetscReal) != realSize) { 595 switch (ocl->realType) { 596 case PETSC_FLOAT: { 597 PetscInt c, b, d; 598 599 PetscCall(PetscMalloc4(Ne * N_bt, &f_coeff, Ne, &f_coeffAux, Ne * dim * dim, &f_invJ, Ne, &f_detJ)); 600 for (c = 0; c < Ne; ++c) { 601 f_detJ[c] = (float)cgeom->detJ[c]; 602 for (d = 0; d < dim * dim; ++d) f_invJ[c * dim * dim + d] = (float)cgeom->invJ[c * dim * dim + d]; 603 for (b = 0; b < N_bt; ++b) f_coeff[c * N_bt + b] = (float)coefficients[c * N_bt + b]; 604 } 605 if (coefficientsAux) { /* Assume P0 */ 606 for (c = 0; c < Ne; ++c) f_coeffAux[c] = (float)coefficientsAux[c]; 607 } 608 oclCoeff = (void *)f_coeff; 609 if (coefficientsAux) { 610 oclCoeffAux = (void *)f_coeffAux; 611 } else { 612 oclCoeffAux = NULL; 613 } 614 oclInvJ = (void *)f_invJ; 615 oclDetJ = (void *)f_detJ; 616 } break; 617 case PETSC_DOUBLE: { 618 PetscInt c, b, d; 619 620 PetscCall(PetscMalloc4(Ne * N_bt, &d_coeff, Ne, &d_coeffAux, Ne * dim * dim, &d_invJ, Ne, &d_detJ)); 621 for (c = 0; c < Ne; ++c) { 622 d_detJ[c] = (double)cgeom->detJ[c]; 623 for (d = 0; d < dim * dim; ++d) d_invJ[c * dim * dim + d] = (double)cgeom->invJ[c * dim * dim + d]; 624 for (b = 0; b < N_bt; ++b) d_coeff[c * N_bt + b] = (double)coefficients[c * N_bt + b]; 625 } 626 if (coefficientsAux) { /* Assume P0 */ 627 for (c = 0; c < Ne; ++c) d_coeffAux[c] = (double)coefficientsAux[c]; 628 } 629 oclCoeff = (void *)d_coeff; 630 if (coefficientsAux) { 631 oclCoeffAux = (void *)d_coeffAux; 632 } else { 633 oclCoeffAux = NULL; 634 } 635 oclInvJ = (void *)d_invJ; 636 oclDetJ = (void *)d_detJ; 637 } break; 638 default: 639 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unsupported PETSc type %d", ocl->realType); 640 } 641 } else { 642 PetscInt c, d; 643 644 PetscCall(PetscMalloc2(Ne * dim * dim, &r_invJ, Ne, &r_detJ)); 645 for (c = 0; c < Ne; ++c) { 646 r_detJ[c] = cgeom->detJ[c]; 647 for (d = 0; d < dim * dim; ++d) r_invJ[c * dim * dim + d] = cgeom->invJ[c * dim * dim + d]; 648 } 649 oclCoeff = (void *)coefficients; 650 oclCoeffAux = (void *)coefficientsAux; 651 oclInvJ = (void *)r_invJ; 652 oclDetJ = (void *)r_detJ; 653 } 654 o_coefficients = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne * N_bt * realSize, oclCoeff, &err); 655 if (coefficientsAux) { 656 o_coefficientsAux = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne * realSize, oclCoeffAux, &err); 657 } else { 658 o_coefficientsAux = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY, Ne * realSize, oclCoeffAux, &err); 659 } 660 o_jacobianInverses = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne * dim * dim * realSize, oclInvJ, &err); 661 o_jacobianDeterminants = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne * realSize, oclDetJ, &err); 662 o_elemVec = clCreateBuffer(ocl->ctx_id, CL_MEM_WRITE_ONLY, Ne * N_bt * realSize, NULL, &err); 663 /* Kernel launch */ 664 PetscCall(clSetKernelArg(ocl_kernel, 0, sizeof(cl_int), (void *)&N_cb)); 665 PetscCall(clSetKernelArg(ocl_kernel, 1, sizeof(cl_mem), (void *)&o_coefficients)); 666 PetscCall(clSetKernelArg(ocl_kernel, 2, sizeof(cl_mem), (void *)&o_coefficientsAux)); 667 PetscCall(clSetKernelArg(ocl_kernel, 3, sizeof(cl_mem), (void *)&o_jacobianInverses)); 668 PetscCall(clSetKernelArg(ocl_kernel, 4, sizeof(cl_mem), (void *)&o_jacobianDeterminants)); 669 PetscCall(clSetKernelArg(ocl_kernel, 5, sizeof(cl_mem), (void *)&o_elemVec)); 670 PetscCall(clEnqueueNDRangeKernel(ocl->queue_id, ocl_kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &ocl_ev)); 671 /* Read data back from device */ 672 if (sizeof(PetscReal) != realSize) { 673 switch (ocl->realType) { 674 case PETSC_FLOAT: { 675 float *elem; 676 PetscInt c, b; 677 678 PetscCall(PetscFree4(f_coeff, f_coeffAux, f_invJ, f_detJ)); 679 PetscCall(PetscMalloc1(Ne * N_bt, &elem)); 680 PetscCall(clEnqueueReadBuffer(ocl->queue_id, o_elemVec, CL_TRUE, 0, Ne * N_bt * realSize, elem, 0, NULL, NULL)); 681 for (c = 0; c < Ne; ++c) { 682 for (b = 0; b < N_bt; ++b) elemVec[c * N_bt + b] = (PetscScalar)elem[c * N_bt + b]; 683 } 684 PetscCall(PetscFree(elem)); 685 } break; 686 case PETSC_DOUBLE: { 687 double *elem; 688 PetscInt c, b; 689 690 PetscCall(PetscFree4(d_coeff, d_coeffAux, d_invJ, d_detJ)); 691 PetscCall(PetscMalloc1(Ne * N_bt, &elem)); 692 PetscCall(clEnqueueReadBuffer(ocl->queue_id, o_elemVec, CL_TRUE, 0, Ne * N_bt * realSize, elem, 0, NULL, NULL)); 693 for (c = 0; c < Ne; ++c) { 694 for (b = 0; b < N_bt; ++b) elemVec[c * N_bt + b] = (PetscScalar)elem[c * N_bt + b]; 695 } 696 PetscCall(PetscFree(elem)); 697 } break; 698 default: 699 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unsupported PETSc type %d", ocl->realType); 700 } 701 } else { 702 PetscCall(PetscFree2(r_invJ, r_detJ)); 703 PetscCall(clEnqueueReadBuffer(ocl->queue_id, o_elemVec, CL_TRUE, 0, Ne * N_bt * realSize, elemVec, 0, NULL, NULL)); 704 } 705 /* Log performance */ 706 PetscCall(clGetEventProfilingInfo(ocl_ev, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &ns_start, NULL)); 707 PetscCall(clGetEventProfilingInfo(ocl_ev, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &ns_end, NULL)); 708 f0Flops = 0; 709 switch (ocl->op) { 710 case LAPLACIAN: 711 f1Flops = useAux ? dim : 0; 712 break; 713 case ELASTICITY: 714 f1Flops = 2 * dim * dim; 715 break; 716 } 717 numFlops = Ne * (N_q * (N_b * N_comp * ((useField ? 2 : 0) + (useFieldDer ? 2 * dim * (dim + 1) : 0)) 718 /*+ 719 N_ba*N_compa*((useFieldAux ? 2 : 0) + (useFieldDerAux ? 2*dim*(dim + 1) : 0))*/ 720 + N_comp * ((useF0 ? f0Flops + 2 : 0) + (useF1 ? f1Flops + 2 * dim : 0))) + 721 N_b * ((useF0 ? 2 : 0) + (useF1 ? 2 * dim * (dim + 1) : 0))); 722 PetscCall(PetscFEOpenCLLogResidual(fem, (ns_end - ns_start) * 1.0e-9, numFlops)); 723 /* Cleanup */ 724 PetscCall(clReleaseMemObject(o_coefficients)); 725 PetscCall(clReleaseMemObject(o_coefficientsAux)); 726 PetscCall(clReleaseMemObject(o_jacobianInverses)); 727 PetscCall(clReleaseMemObject(o_jacobianDeterminants)); 728 PetscCall(clReleaseMemObject(o_elemVec)); 729 PetscCall(clReleaseKernel(ocl_kernel)); 730 PetscCall(clReleaseProgram(ocl_prog)); 731 PetscFunctionReturn(PETSC_SUCCESS); 732 } 733 734 PETSC_INTERN PetscErrorCode PetscFESetUp_Basic(PetscFE); 735 PETSC_INTERN PetscErrorCode PetscFECreateTabulation_Basic(PetscFE, PetscInt, const PetscReal[], PetscInt, PetscTabulation); 736 737 static PetscErrorCode PetscFEInitialize_OpenCL(PetscFE fem) 738 { 739 PetscFunctionBegin; 740 fem->ops->setfromoptions = NULL; 741 fem->ops->setup = PetscFESetUp_Basic; 742 fem->ops->view = NULL; 743 fem->ops->destroy = PetscFEDestroy_OpenCL; 744 fem->ops->getdimension = PetscFEGetDimension_Basic; 745 fem->ops->createtabulation = PetscFECreateTabulation_Basic; 746 fem->ops->integrateresidual = PetscFEIntegrateResidual_OpenCL; 747 fem->ops->integratebdresidual = NULL /* PetscFEIntegrateBdResidual_OpenCL */; 748 fem->ops->integratejacobianaction = NULL /* PetscFEIntegrateJacobianAction_OpenCL */; 749 fem->ops->integratejacobian = PetscFEIntegrateJacobian_Basic; 750 PetscFunctionReturn(PETSC_SUCCESS); 751 } 752 753 /*MC 754 PETSCFEOPENCL = "opencl" - A `PetscFEType` that integrates using a vectorized OpenCL implementation 755 756 Level: intermediate 757 758 .seealso: `PetscFEType`, `PetscFECreate()`, `PetscFESetType()` 759 M*/ 760 761 PETSC_EXTERN PetscErrorCode PetscFECreate_OpenCL(PetscFE fem) 762 { 763 PetscFE_OpenCL *ocl; 764 cl_uint num_platforms; 765 cl_platform_id platform_ids[42]; 766 cl_uint num_devices; 767 cl_device_id device_ids[42]; 768 cl_int err; 769 770 PetscFunctionBegin; 771 PetscValidHeaderSpecific(fem, PETSCFE_CLASSID, 1); 772 PetscCall(PetscNew(&ocl)); 773 fem->data = ocl; 774 775 /* Init Platform */ 776 PetscCall(clGetPlatformIDs(42, platform_ids, &num_platforms)); 777 PetscCheck(num_platforms, PetscObjectComm((PetscObject)fem), PETSC_ERR_SUP, "No OpenCL platform found."); 778 ocl->pf_id = platform_ids[0]; 779 /* Init Device */ 780 PetscCall(clGetDeviceIDs(ocl->pf_id, CL_DEVICE_TYPE_ALL, 42, device_ids, &num_devices)); 781 PetscCheck(num_devices, PetscObjectComm((PetscObject)fem), PETSC_ERR_SUP, "No OpenCL device found."); 782 ocl->dev_id = device_ids[0]; 783 /* Create context with one command queue */ 784 ocl->ctx_id = clCreateContext(0, 1, &(ocl->dev_id), NULL, NULL, &err); 785 PetscCall(err); 786 ocl->queue_id = clCreateCommandQueue(ocl->ctx_id, ocl->dev_id, CL_QUEUE_PROFILING_ENABLE, &err); 787 PetscCall(err); 788 /* Types */ 789 ocl->realType = PETSC_FLOAT; 790 /* Register events */ 791 PetscCall(PetscLogEventRegister("OpenCL FEResidual", PETSCFE_CLASSID, &ocl->residualEvent)); 792 /* Equation handling */ 793 ocl->op = LAPLACIAN; 794 795 PetscCall(PetscFEInitialize_OpenCL(fem)); 796 PetscFunctionReturn(PETSC_SUCCESS); 797 } 798 799 /*@ 800 PetscFEOpenCLSetRealType - Set the scalar type for running on the OpenCL accelerator 801 802 Input Parameters: 803 + fem - The `PetscFE` 804 - realType - The scalar type 805 806 Level: developer 807 808 .seealso: `PetscFE`, `PetscFEOpenCLGetRealType()` 809 @*/ 810 PetscErrorCode PetscFEOpenCLSetRealType(PetscFE fem, PetscDataType realType) 811 { 812 PetscFE_OpenCL *ocl = (PetscFE_OpenCL *)fem->data; 813 814 PetscFunctionBegin; 815 PetscValidHeaderSpecific(fem, PETSCFE_CLASSID, 1); 816 ocl->realType = realType; 817 PetscFunctionReturn(PETSC_SUCCESS); 818 } 819 820 /*@ 821 PetscFEOpenCLGetRealType - Get the scalar type for running on the OpenCL accelerator 822 823 Input Parameter: 824 . fem - The `PetscFE` 825 826 Output Parameter: 827 . realType - The scalar type 828 829 Level: developer 830 831 .seealso: `PetscFE`, `PetscFEOpenCLSetRealType()` 832 @*/ 833 PetscErrorCode PetscFEOpenCLGetRealType(PetscFE fem, PetscDataType *realType) 834 { 835 PetscFE_OpenCL *ocl = (PetscFE_OpenCL *)fem->data; 836 837 PetscFunctionBegin; 838 PetscValidHeaderSpecific(fem, PETSCFE_CLASSID, 1); 839 PetscAssertPointer(realType, 2); 840 *realType = ocl->realType; 841 PetscFunctionReturn(PETSC_SUCCESS); 842 } 843 844 #endif /* PETSC_HAVE_OPENCL */ 845