1 #include <petsc/private/petscfeimpl.h> /*I "petscfe.h" I*/ 2 #include <petsc/private/loghandlerimpl.h> 3 #include <../src/sys/logging/handler/impls/default/logdefault.h> 4 5 #if defined(PETSC_HAVE_OPENCL) 6 7 static PetscErrorCode PetscFEDestroy_OpenCL(PetscFE fem) 8 { 9 PetscFE_OpenCL *ocl = (PetscFE_OpenCL *)fem->data; 10 11 PetscFunctionBegin; 12 PetscCall(clReleaseCommandQueue(ocl->queue_id)); 13 ocl->queue_id = 0; 14 PetscCall(clReleaseContext(ocl->ctx_id)); 15 ocl->ctx_id = 0; 16 PetscCall(PetscFree(ocl)); 17 PetscFunctionReturn(PETSC_SUCCESS); 18 } 19 20 #define PetscCallSTR(err) \ 21 do { \ 22 PetscCall(err); \ 23 string_tail += count; \ 24 PetscCheck(string_tail != end_of_buffer, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Buffer overflow"); \ 25 } while (0) 26 enum { 27 LAPLACIAN = 0, 28 ELASTICITY = 1 29 }; 30 31 /* NOTE: This is now broken for vector problems. Must redo loops to respect vector basis elements */ 32 /* dim Number of spatial dimensions: 2 */ 33 /* N_b Number of basis functions: generated */ 34 /* N_{bt} Number of total basis functions: N_b * N_{comp} */ 35 /* N_q Number of quadrature points: generated */ 36 /* N_{bs} Number of block cells LCM(N_b, N_q) */ 37 /* N_{bst} Number of block cell components LCM(N_{bt}, N_q) */ 38 /* N_{bl} Number of concurrent blocks generated */ 39 /* N_t Number of threads: N_{bl} * N_{bs} */ 40 /* N_{cbc} Number of concurrent basis cells: N_{bl} * N_q */ 41 /* N_{cqc} Number of concurrent quadrature cells: N_{bl} * N_b */ 42 /* N_{sbc} Number of serial basis cells: N_{bs} / N_q */ 43 /* N_{sqc} Number of serial quadrature cells: N_{bs} / N_b */ 44 /* N_{cb} Number of serial cell batches: input */ 45 /* N_c Number of total cells: N_{cb}*N_{t}/N_{comp} */ 46 static PetscErrorCode PetscFEOpenCLGenerateIntegrationCode(PetscFE fem, char **string_buffer, PetscInt buffer_length, PetscBool useAux, PetscInt N_bl) 47 { 48 PetscFE_OpenCL *ocl = (PetscFE_OpenCL *)fem->data; 49 PetscQuadrature q; 50 char *string_tail = *string_buffer; 51 char *end_of_buffer = *string_buffer + buffer_length; 52 char float_str[] = "float", double_str[] = "double"; 53 char *numeric_str = &float_str[0]; 54 PetscInt op = ocl->op; 55 PetscBool useField = PETSC_FALSE; 56 PetscBool useFieldDer = PETSC_TRUE; 57 PetscBool useFieldAux = useAux; 58 PetscBool useFieldDerAux = PETSC_FALSE; 59 PetscBool useF0 = PETSC_TRUE; 60 PetscBool useF1 = PETSC_TRUE; 61 const PetscReal *points, *weights; 62 PetscTabulation T; 63 PetscInt dim, qNc, N_b, N_c, N_q, N_t, p, d, b, c; 64 size_t count; 65 66 PetscFunctionBegin; 67 PetscCall(PetscFEGetSpatialDimension(fem, &dim)); 68 PetscCall(PetscFEGetDimension(fem, &N_b)); 69 PetscCall(PetscFEGetNumComponents(fem, &N_c)); 70 PetscCall(PetscFEGetQuadrature(fem, &q)); 71 PetscCall(PetscQuadratureGetData(q, NULL, &qNc, &N_q, &points, &weights)); 72 PetscCheck(qNc == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only supports scalar quadrature, not %" PetscInt_FMT " components", qNc); 73 N_t = N_b * N_c * N_q * N_bl; 74 /* Enable device extension for double precision */ 75 if (ocl->realType == PETSC_DOUBLE) { 76 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 77 "#if defined(cl_khr_fp64)\n" 78 "# pragma OPENCL EXTENSION cl_khr_fp64: enable\n" 79 "#elif defined(cl_amd_fp64)\n" 80 "# pragma OPENCL EXTENSION cl_amd_fp64: enable\n" 81 "#endif\n", 82 &count)); 83 numeric_str = &double_str[0]; 84 } 85 /* Kernel API */ 86 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 87 "\n" 88 "__kernel void integrateElementQuadrature(int N_cb, __global %s *coefficients, __global %s *coefficientsAux, __global %s *jacobianInverses, __global %s *jacobianDeterminants, __global %s *elemVec)\n" 89 "{\n", 90 &count, numeric_str, numeric_str, numeric_str, numeric_str, numeric_str)); 91 /* Quadrature */ 92 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 93 " /* Quadrature points\n" 94 " - (x1,y1,x2,y2,...) */\n" 95 " const %s points[%d] = {\n", 96 &count, numeric_str, N_q * dim)); 97 for (p = 0; p < N_q; ++p) { 98 for (d = 0; d < dim; ++d) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g,\n", &count, points[p * dim + d])); 99 } 100 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count)); 101 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 102 " /* Quadrature weights\n" 103 " - (v1,v2,...) */\n" 104 " const %s weights[%d] = {\n", 105 &count, numeric_str, N_q)); 106 for (p = 0; p < N_q; ++p) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g,\n", &count, weights[p])); 107 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count)); 108 /* Basis Functions */ 109 PetscCall(PetscFEGetCellTabulation(fem, 1, &T)); 110 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 111 " /* Nodal basis function evaluations\n" 112 " - basis component is fastest varying, the basis function, then point */\n" 113 " const %s Basis[%d] = {\n", 114 &count, numeric_str, N_q * N_b * N_c)); 115 for (p = 0; p < N_q; ++p) { 116 for (b = 0; b < N_b; ++b) { 117 for (c = 0; c < N_c; ++c) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g,\n", &count, T->T[0][(p * N_b + b) * N_c + c])); 118 } 119 } 120 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count)); 121 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 122 "\n" 123 " /* Nodal basis function derivative evaluations,\n" 124 " - derivative direction is fastest varying, then basis component, then basis function, then point */\n" 125 " const %s%d BasisDerivatives[%d] = {\n", 126 &count, numeric_str, dim, N_q * N_b * N_c)); 127 for (p = 0; p < N_q; ++p) { 128 for (b = 0; b < N_b; ++b) { 129 for (c = 0; c < N_c; ++c) { 130 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "(%s%d)(", &count, numeric_str, dim)); 131 for (d = 0; d < dim; ++d) { 132 if (d > 0) { 133 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, ", %g", &count, T->T[1][((p * N_b + b) * dim + d) * N_c + c])); 134 } else { 135 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g", &count, T->T[1][((p * N_b + b) * dim + d) * N_c + c])); 136 } 137 } 138 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "),\n", &count)); 139 } 140 } 141 } 142 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count)); 143 /* Sizes */ 144 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 145 " const int dim = %d; // The spatial dimension\n" 146 " const int N_bl = %d; // The number of concurrent blocks\n" 147 " const int N_b = %d; // The number of basis functions\n" 148 " const int N_comp = %d; // The number of basis function components\n" 149 " const int N_bt = N_b*N_comp; // The total number of scalar basis functions\n" 150 " const int N_q = %d; // The number of quadrature points\n" 151 " const int N_bst = N_bt*N_q; // The block size, LCM(N_b*N_comp, N_q), Notice that a block is not processed simultaneously\n" 152 " const int N_t = N_bst*N_bl; // The number of threads, N_bst * N_bl\n" 153 " const int N_bc = N_t/N_comp; // The number of cells per batch (N_b*N_q*N_bl)\n" 154 " const int N_sbc = N_bst / (N_q * N_comp);\n" 155 " const int N_sqc = N_bst / N_bt;\n" 156 " /*const int N_c = N_cb * N_bc;*/\n" 157 "\n" 158 " /* Calculated indices */\n" 159 " /*const int tidx = get_local_id(0) + get_local_size(0)*get_local_id(1);*/\n" 160 " const int tidx = get_local_id(0);\n" 161 " const int blidx = tidx / N_bst; // Block number for this thread\n" 162 " const int bidx = tidx %% N_bt; // Basis function mapped to this thread\n" 163 " const int cidx = tidx %% N_comp; // Basis component mapped to this thread\n" 164 " const int qidx = tidx %% N_q; // Quadrature point mapped to this thread\n" 165 " const int blbidx = tidx %% N_q + blidx*N_q; // Cell mapped to this thread in the basis phase\n" 166 " const int blqidx = tidx %% N_b + blidx*N_b; // Cell mapped to this thread in the quadrature phase\n" 167 " const int gidx = get_group_id(1)*get_num_groups(0) + get_group_id(0);\n" 168 " const int Goffset = gidx*N_cb*N_bc;\n", 169 &count, dim, N_bl, N_b, N_c, N_q)); 170 /* Local memory */ 171 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 172 "\n" 173 " /* Quadrature data */\n" 174 " %s w; // $w_q$, Quadrature weight at $x_q$\n" 175 " __local %s phi_i[%d]; //[N_bt*N_q]; // $\\phi_i(x_q)$, Value of the basis function $i$ at $x_q$\n" 176 " __local %s%d phiDer_i[%d]; //[N_bt*N_q]; // $\\frac{\\partial\\phi_i(x_q)}{\\partial x_d}$, Value of the derivative of basis function $i$ in direction $x_d$ at $x_q$\n" 177 " /* Geometric data */\n" 178 " __local %s detJ[%d]; //[N_t]; // $|J(x_q)|$, Jacobian determinant at $x_q$\n" 179 " __local %s invJ[%d];//[N_t*dim*dim]; // $J^{-1}(x_q)$, Jacobian inverse at $x_q$\n", 180 &count, numeric_str, numeric_str, N_b * N_c * N_q, numeric_str, dim, N_b * N_c * N_q, numeric_str, N_t, numeric_str, N_t * dim * dim)); 181 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 182 " /* FEM data */\n" 183 " __local %s u_i[%d]; //[N_t*N_bt]; // Coefficients $u_i$ of the field $u|_{\\mathcal{T}} = \\sum_i u_i \\phi_i$\n", 184 &count, numeric_str, N_t * N_b * N_c)); 185 if (useAux) { 186 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " __local %s a_i[%d]; //[N_t]; // Coefficients $a_i$ of the auxiliary field $a|_{\\mathcal{T}} = \\sum_i a_i \\phi^R_i$\n", &count, numeric_str, N_t)); 187 } 188 if (useF0) { 189 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 190 " /* Intermediate calculations */\n" 191 " __local %s f_0[%d]; //[N_t*N_sqc]; // $f_0(u(x_q), \\nabla u(x_q)) |J(x_q)| w_q$\n", 192 &count, numeric_str, N_t * N_q)); 193 } 194 if (useF1) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " __local %s%d f_1[%d]; //[N_t*N_sqc]; // $f_1(u(x_q), \\nabla u(x_q)) |J(x_q)| w_q$\n", &count, numeric_str, dim, N_t * N_q)); 195 /* TODO: If using elasticity, put in mu/lambda coefficients */ 196 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 197 " /* Output data */\n" 198 " %s e_i; // Coefficient $e_i$ of the residual\n\n", 199 &count, numeric_str)); 200 /* One-time loads */ 201 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 202 " /* These should be generated inline */\n" 203 " /* Load quadrature weights */\n" 204 " w = weights[qidx];\n" 205 " /* Load basis tabulation \\phi_i for this cell */\n" 206 " if (tidx < N_bt*N_q) {\n" 207 " phi_i[tidx] = Basis[tidx];\n" 208 " phiDer_i[tidx] = BasisDerivatives[tidx];\n" 209 " }\n\n", 210 &count)); 211 /* Batch loads */ 212 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 213 " for (int batch = 0; batch < N_cb; ++batch) {\n" 214 " /* Load geometry */\n" 215 " detJ[tidx] = jacobianDeterminants[Goffset+batch*N_bc+tidx];\n" 216 " for (int n = 0; n < dim*dim; ++n) {\n" 217 " const int offset = n*N_t;\n" 218 " invJ[offset+tidx] = jacobianInverses[(Goffset+batch*N_bc)*dim*dim+offset+tidx];\n" 219 " }\n" 220 " /* Load coefficients u_i for this cell */\n" 221 " for (int n = 0; n < N_bt; ++n) {\n" 222 " const int offset = n*N_t;\n" 223 " u_i[offset+tidx] = coefficients[(Goffset*N_bt)+batch*N_t*N_b+offset+tidx];\n" 224 " }\n", 225 &count)); 226 if (useAux) { 227 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 228 " /* Load coefficients a_i for this cell */\n" 229 " /* TODO: This should not be N_t here, it should be N_bc*N_comp_aux */\n" 230 " a_i[tidx] = coefficientsAux[Goffset+batch*N_t+tidx];\n", 231 &count)); 232 } 233 /* Quadrature phase */ 234 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 235 " barrier(CLK_LOCAL_MEM_FENCE);\n" 236 "\n" 237 " /* Map coefficients to values at quadrature points */\n" 238 " for (int c = 0; c < N_sqc; ++c) {\n" 239 " const int cell = c*N_bl*N_b + blqidx;\n" 240 " const int fidx = (cell*N_q + qidx)*N_comp + cidx;\n", 241 &count)); 242 if (useField) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " %s u[%d]; //[N_comp]; // $u(x_q)$, Value of the field at $x_q$\n", &count, numeric_str, N_c)); 243 if (useFieldDer) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " %s%d gradU[%d]; //[N_comp]; // $\\nabla u(x_q)$, Value of the field gradient at $x_q$\n", &count, numeric_str, dim, N_c)); 244 if (useFieldAux) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " %s a[%d]; //[1]; // $a(x_q)$, Value of the auxiliary fields at $x_q$\n", &count, numeric_str, 1)); 245 if (useFieldDerAux) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " %s%d gradA[%d]; //[1]; // $\\nabla a(x_q)$, Value of the auxiliary field gradient at $x_q$\n", &count, numeric_str, dim, 1)); 246 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 247 "\n" 248 " for (int comp = 0; comp < N_comp; ++comp) {\n", 249 &count)); 250 if (useField) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " u[comp] = 0.0;\n", &count)); 251 if (useFieldDer) { 252 switch (dim) { 253 case 1: 254 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradU[comp].x = 0.0;\n", &count)); 255 break; 256 case 2: 257 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradU[comp].x = 0.0; gradU[comp].y = 0.0;\n", &count)); 258 break; 259 case 3: 260 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradU[comp].x = 0.0; gradU[comp].y = 0.0; gradU[comp].z = 0.0;\n", &count)); 261 break; 262 } 263 } 264 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " }\n", &count)); 265 if (useFieldAux) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " a[0] = 0.0;\n", &count)); 266 if (useFieldDerAux) { 267 switch (dim) { 268 case 1: 269 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradA[0].x = 0.0;\n", &count)); 270 break; 271 case 2: 272 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradA[0].x = 0.0; gradA[0].y = 0.0;\n", &count)); 273 break; 274 case 3: 275 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradA[0].x = 0.0; gradA[0].y = 0.0; gradA[0].z = 0.0;\n", &count)); 276 break; 277 } 278 } 279 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 280 " /* Get field and derivatives at this quadrature point */\n" 281 " for (int i = 0; i < N_b; ++i) {\n" 282 " for (int comp = 0; comp < N_comp; ++comp) {\n" 283 " const int b = i*N_comp+comp;\n" 284 " const int pidx = qidx*N_bt + b;\n" 285 " const int uidx = cell*N_bt + b;\n" 286 " %s%d realSpaceDer;\n\n", 287 &count, numeric_str, dim)); 288 if (useField) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " u[comp] += u_i[uidx]*phi_i[pidx];\n", &count)); 289 if (useFieldDer) { 290 switch (dim) { 291 case 2: 292 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 293 " realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y;\n" 294 " gradU[comp].x += u_i[uidx]*realSpaceDer.x;\n" 295 " realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y;\n" 296 " gradU[comp].y += u_i[uidx]*realSpaceDer.y;\n", 297 &count)); 298 break; 299 case 3: 300 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 301 " realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+0]*phiDer_i[pidx].z;\n" 302 " gradU[comp].x += u_i[uidx]*realSpaceDer.x;\n" 303 " realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+1]*phiDer_i[pidx].z;\n" 304 " gradU[comp].y += u_i[uidx]*realSpaceDer.y;\n" 305 " realSpaceDer.z = invJ[cell*dim*dim+0*dim+2]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+2]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+2]*phiDer_i[pidx].z;\n" 306 " gradU[comp].z += u_i[uidx]*realSpaceDer.z;\n", 307 &count)); 308 break; 309 } 310 } 311 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 312 " }\n" 313 " }\n", 314 &count)); 315 if (useFieldAux) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " a[0] += a_i[cell];\n", &count)); 316 /* Calculate residual at quadrature points: Should be generated by an weak form engine */ 317 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " /* Process values at quadrature points */\n", &count)); 318 switch (op) { 319 case LAPLACIAN: 320 if (useF0) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_0[fidx] = 4.0;\n", &count)); 321 if (useF1) { 322 if (useAux) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_1[fidx] = a[0]*gradU[cidx];\n", &count)); 323 else PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_1[fidx] = gradU[cidx];\n", &count)); 324 } 325 break; 326 case ELASTICITY: 327 if (useF0) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_0[fidx] = 4.0;\n", &count)); 328 if (useF1) { 329 switch (dim) { 330 case 2: 331 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 332 " switch (cidx) {\n" 333 " case 0:\n" 334 " f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[0].x + gradU[0].x);\n" 335 " f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[0].y + gradU[1].x);\n" 336 " break;\n" 337 " case 1:\n" 338 " f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[1].x + gradU[0].y);\n" 339 " f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[1].y + gradU[1].y);\n" 340 " }\n", 341 &count)); 342 break; 343 case 3: 344 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 345 " switch (cidx) {\n" 346 " case 0:\n" 347 " f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[0].x + gradU[0].x);\n" 348 " f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[0].y + gradU[1].x);\n" 349 " f_1[fidx].z = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[0].z + gradU[2].x);\n" 350 " break;\n" 351 " case 1:\n" 352 " f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[1].x + gradU[0].y);\n" 353 " f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[1].y + gradU[1].y);\n" 354 " f_1[fidx].z = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[1].y + gradU[2].y);\n" 355 " break;\n" 356 " case 2:\n" 357 " f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[2].x + gradU[0].z);\n" 358 " f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[2].y + gradU[1].z);\n" 359 " f_1[fidx].z = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[2].y + gradU[2].z);\n" 360 " }\n", 361 &count)); 362 break; 363 } 364 } 365 break; 366 default: 367 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "PDE operator %d is not supported", op); 368 } 369 if (useF0) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_0[fidx] *= detJ[cell]*w;\n", &count)); 370 if (useF1) { 371 switch (dim) { 372 case 1: 373 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_1[fidx].x *= detJ[cell]*w;\n", &count)); 374 break; 375 case 2: 376 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_1[fidx].x *= detJ[cell]*w; f_1[fidx].y *= detJ[cell]*w;\n", &count)); 377 break; 378 case 3: 379 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_1[fidx].x *= detJ[cell]*w; f_1[fidx].y *= detJ[cell]*w; f_1[fidx].z *= detJ[cell]*w;\n", &count)); 380 break; 381 } 382 } 383 /* Thread transpose */ 384 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 385 " }\n\n" 386 " /* ==== TRANSPOSE THREADS ==== */\n" 387 " barrier(CLK_LOCAL_MEM_FENCE);\n\n", 388 &count)); 389 /* Basis phase */ 390 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 391 " /* Map values at quadrature points to coefficients */\n" 392 " for (int c = 0; c < N_sbc; ++c) {\n" 393 " const int cell = c*N_bl*N_q + blbidx; /* Cell number in batch */\n" 394 "\n" 395 " e_i = 0.0;\n" 396 " for (int q = 0; q < N_q; ++q) {\n" 397 " const int pidx = q*N_bt + bidx;\n" 398 " const int fidx = (cell*N_q + q)*N_comp + cidx;\n" 399 " %s%d realSpaceDer;\n\n", 400 &count, numeric_str, dim)); 401 402 if (useF0) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " e_i += phi_i[pidx]*f_0[fidx];\n", &count)); 403 if (useF1) { 404 switch (dim) { 405 case 2: 406 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 407 " realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y;\n" 408 " e_i += realSpaceDer.x*f_1[fidx].x;\n" 409 " realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y;\n" 410 " e_i += realSpaceDer.y*f_1[fidx].y;\n", 411 &count)); 412 break; 413 case 3: 414 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 415 " realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+0]*phiDer_i[pidx].z;\n" 416 " e_i += realSpaceDer.x*f_1[fidx].x;\n" 417 " realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+1]*phiDer_i[pidx].z;\n" 418 " e_i += realSpaceDer.y*f_1[fidx].y;\n" 419 " realSpaceDer.z = invJ[cell*dim*dim+0*dim+2]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+2]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+2]*phiDer_i[pidx].z;\n" 420 " e_i += realSpaceDer.z*f_1[fidx].z;\n", 421 &count)); 422 break; 423 } 424 } 425 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 426 " }\n" 427 " /* Write element vector for N_{cbc} cells at a time */\n" 428 " elemVec[(Goffset + batch*N_bc + c*N_bl*N_q)*N_bt + tidx] = e_i;\n" 429 " }\n" 430 " /* ==== Could do one write per batch ==== */\n" 431 " }\n" 432 " return;\n" 433 "}\n", 434 &count)); 435 PetscFunctionReturn(PETSC_SUCCESS); 436 } 437 438 static PetscErrorCode PetscFEOpenCLGetIntegrationKernel(PetscFE fem, PetscBool useAux, cl_program *ocl_prog, cl_kernel *ocl_kernel) 439 { 440 PetscFE_OpenCL *ocl = (PetscFE_OpenCL *)fem->data; 441 PetscInt dim, N_bl; 442 PetscBool flg; 443 char *buffer; 444 size_t len; 445 char errMsg[8192]; 446 cl_int err; 447 448 PetscFunctionBegin; 449 PetscCall(PetscFEGetSpatialDimension(fem, &dim)); 450 PetscCall(PetscMalloc1(8192, &buffer)); 451 PetscCall(PetscFEGetTileSizes(fem, NULL, &N_bl, NULL, NULL)); 452 PetscCall(PetscFEOpenCLGenerateIntegrationCode(fem, &buffer, 8192, useAux, N_bl)); 453 PetscCall(PetscOptionsHasName(((PetscObject)fem)->options, ((PetscObject)fem)->prefix, "-petscfe_opencl_kernel_print", &flg)); 454 if (flg) PetscCall(PetscPrintf(PetscObjectComm((PetscObject)fem), "OpenCL FE Integration Kernel:\n%s\n", buffer)); 455 PetscCall(PetscStrlen(buffer, &len)); 456 *ocl_prog = clCreateProgramWithSource(ocl->ctx_id, 1, (const char **)&buffer, &len, &err); 457 PetscCall(err); 458 err = clBuildProgram(*ocl_prog, 0, NULL, NULL, NULL, NULL); 459 if (err != CL_SUCCESS) { 460 err = clGetProgramBuildInfo(*ocl_prog, ocl->dev_id, CL_PROGRAM_BUILD_LOG, 8192 * sizeof(char), &errMsg, NULL); 461 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Build failed! Log:\n %s", errMsg); 462 } 463 PetscCall(PetscFree(buffer)); 464 *ocl_kernel = clCreateKernel(*ocl_prog, "integrateElementQuadrature", &err); 465 PetscFunctionReturn(PETSC_SUCCESS); 466 } 467 468 static PetscErrorCode PetscFEOpenCLCalculateGrid(PetscFE fem, PetscInt N, PetscInt blockSize, size_t *x, size_t *y, size_t *z) 469 { 470 const PetscInt Nblocks = N / blockSize; 471 472 PetscFunctionBegin; 473 PetscCheck(!(N % blockSize), PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid block size %d for %d elements", blockSize, N); 474 *z = 1; 475 *y = 1; 476 for (*x = (size_t)(PetscSqrtReal(Nblocks) + 0.5); *x > 0; --*x) { 477 *y = Nblocks / *x; 478 if (*x * *y == (size_t)Nblocks) break; 479 } 480 PetscCheck(*x * *y == (size_t)Nblocks, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Could not find partition for %" PetscInt_FMT " with block size %" PetscInt_FMT, N, blockSize); 481 PetscFunctionReturn(PETSC_SUCCESS); 482 } 483 484 static PetscErrorCode PetscFEOpenCLLogResidual(PetscFE fem, PetscLogDouble time, PetscLogDouble flops) 485 { 486 PetscLogHandler h; 487 488 PetscFunctionBegin; 489 PetscCall(PetscLogGetDefaultHandler(&h)); 490 if (h) { 491 PetscEventPerfInfo *eventInfo; 492 PetscFE_OpenCL *ocl = (PetscFE_OpenCL *)fem->data; 493 494 PetscCall(PetscLogHandlerGetEventPerfInfo(h, PETSC_DEFAULT, ocl->residualEvent, &eventInfo)); 495 eventInfo->count++; 496 eventInfo->time += time; 497 eventInfo->flops += flops; 498 } 499 PetscFunctionReturn(PETSC_SUCCESS); 500 } 501 502 static PetscErrorCode PetscFEIntegrateResidual_OpenCL(PetscDS prob, PetscFormKey key, PetscInt Ne, PetscFEGeom *cgeom, const PetscScalar coefficients[], const PetscScalar coefficients_t[], PetscDS probAux, const PetscScalar coefficientsAux[], PetscReal t, PetscScalar elemVec[]) 503 { 504 /* Nbc = batchSize */ 505 PetscFE fem; 506 PetscFE_OpenCL *ocl; 507 PetscPointFn *f0_func; 508 PetscPointFn *f1_func; 509 PetscQuadrature q; 510 PetscInt dim, qNc; 511 PetscInt N_b; /* The number of basis functions */ 512 PetscInt N_comp; /* The number of basis function components */ 513 PetscInt N_bt; /* The total number of scalar basis functions */ 514 PetscInt N_q; /* The number of quadrature points */ 515 PetscInt N_bst; /* The block size, LCM(N_bt, N_q), Notice that a block is not process simultaneously */ 516 PetscInt N_t; /* The number of threads, N_bst * N_bl */ 517 PetscInt N_bl; /* The number of blocks */ 518 PetscInt N_bc; /* The batch size, N_bl*N_q*N_b */ 519 PetscInt N_cb; /* The number of batches */ 520 const PetscInt field = key.field; 521 PetscInt numFlops, f0Flops = 0, f1Flops = 0; 522 PetscBool useAux = probAux ? PETSC_TRUE : PETSC_FALSE; 523 PetscBool useField = PETSC_FALSE; 524 PetscBool useFieldDer = PETSC_TRUE; 525 PetscBool useF0 = PETSC_TRUE; 526 PetscBool useF1 = PETSC_TRUE; 527 /* OpenCL variables */ 528 cl_program ocl_prog; 529 cl_kernel ocl_kernel; 530 cl_event ocl_ev; /* The event for tracking kernel execution */ 531 cl_ulong ns_start; /* Nanoseconds counter on GPU at kernel start */ 532 cl_ulong ns_end; /* Nanoseconds counter on GPU at kernel stop */ 533 cl_mem o_jacobianInverses, o_jacobianDeterminants; 534 cl_mem o_coefficients, o_coefficientsAux, o_elemVec; 535 float *f_coeff = NULL, *f_coeffAux = NULL, *f_invJ = NULL, *f_detJ = NULL; 536 double *d_coeff = NULL, *d_coeffAux = NULL, *d_invJ = NULL, *d_detJ = NULL; 537 PetscReal *r_invJ = NULL, *r_detJ = NULL; 538 void *oclCoeff, *oclCoeffAux, *oclInvJ, *oclDetJ; 539 size_t local_work_size[3], global_work_size[3]; 540 size_t realSize, x, y, z; 541 const PetscReal *points, *weights; 542 int err; 543 544 PetscFunctionBegin; 545 PetscCall(PetscDSGetDiscretization(prob, field, (PetscObject *)&fem)); 546 ocl = (PetscFE_OpenCL *)fem->data; 547 if (!Ne) { 548 PetscCall(PetscFEOpenCLLogResidual(fem, 0.0, 0.0)); 549 PetscFunctionReturn(PETSC_SUCCESS); 550 } 551 PetscCall(PetscFEGetSpatialDimension(fem, &dim)); 552 PetscCall(PetscFEGetQuadrature(fem, &q)); 553 PetscCall(PetscQuadratureGetData(q, NULL, &qNc, &N_q, &points, &weights)); 554 PetscCheck(qNc == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only supports scalar quadrature, not %" PetscInt_FMT " components", qNc); 555 PetscCall(PetscFEGetDimension(fem, &N_b)); 556 PetscCall(PetscFEGetNumComponents(fem, &N_comp)); 557 PetscCall(PetscDSGetResidual(prob, field, &f0_func, &f1_func)); 558 PetscCall(PetscFEGetTileSizes(fem, NULL, &N_bl, &N_bc, &N_cb)); 559 N_bt = N_b * N_comp; 560 N_bst = N_bt * N_q; 561 N_t = N_bst * N_bl; 562 PetscCheck(N_bc * N_comp == N_t, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Number of threads %d should be %d * %d", N_t, N_bc, N_comp); 563 /* Calculate layout */ 564 if (Ne % (N_cb * N_bc)) { /* Remainder cells */ 565 PetscCall(PetscFEIntegrateResidual_Basic(prob, key, Ne, cgeom, coefficients, coefficients_t, probAux, coefficientsAux, t, elemVec)); 566 PetscFunctionReturn(PETSC_SUCCESS); 567 } 568 PetscCall(PetscFEOpenCLCalculateGrid(fem, Ne, N_cb * N_bc, &x, &y, &z)); 569 local_work_size[0] = N_bc * N_comp; 570 local_work_size[1] = 1; 571 local_work_size[2] = 1; 572 global_work_size[0] = x * local_work_size[0]; 573 global_work_size[1] = y * local_work_size[1]; 574 global_work_size[2] = z * local_work_size[2]; 575 PetscCall(PetscInfo(fem, "GPU layout grid(%zu,%zu,%zu) block(%zu,%zu,%zu) with %d batches\n", x, y, z, local_work_size[0], local_work_size[1], local_work_size[2], N_cb)); 576 PetscCall(PetscInfo(fem, " N_t: %d, N_cb: %d\n", N_t, N_cb)); 577 /* Generate code */ 578 if (probAux) { 579 PetscSpace P; 580 PetscInt NfAux, order, f; 581 582 PetscCall(PetscDSGetNumFields(probAux, &NfAux)); 583 for (f = 0; f < NfAux; ++f) { 584 PetscFE feAux; 585 586 PetscCall(PetscDSGetDiscretization(probAux, f, (PetscObject *)&feAux)); 587 PetscCall(PetscFEGetBasisSpace(feAux, &P)); 588 PetscCall(PetscSpaceGetDegree(P, &order, NULL)); 589 PetscCheck(order <= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Can only handle P0 coefficient fields"); 590 } 591 } 592 PetscCall(PetscFEOpenCLGetIntegrationKernel(fem, useAux, &ocl_prog, &ocl_kernel)); 593 /* Create buffers on the device and send data over */ 594 PetscCall(PetscDataTypeGetSize(ocl->realType, &realSize)); 595 PetscCheck(cgeom->numPoints <= 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only support affine geometry for OpenCL integration right now"); 596 if (sizeof(PetscReal) != realSize) { 597 switch (ocl->realType) { 598 case PETSC_FLOAT: { 599 PetscInt c, b, d; 600 601 PetscCall(PetscMalloc4(Ne * N_bt, &f_coeff, Ne, &f_coeffAux, Ne * dim * dim, &f_invJ, Ne, &f_detJ)); 602 for (c = 0; c < Ne; ++c) { 603 f_detJ[c] = (float)cgeom->detJ[c]; 604 for (d = 0; d < dim * dim; ++d) f_invJ[c * dim * dim + d] = (float)cgeom->invJ[c * dim * dim + d]; 605 for (b = 0; b < N_bt; ++b) f_coeff[c * N_bt + b] = (float)coefficients[c * N_bt + b]; 606 } 607 if (coefficientsAux) { /* Assume P0 */ 608 for (c = 0; c < Ne; ++c) f_coeffAux[c] = (float)coefficientsAux[c]; 609 } 610 oclCoeff = (void *)f_coeff; 611 if (coefficientsAux) { 612 oclCoeffAux = (void *)f_coeffAux; 613 } else { 614 oclCoeffAux = NULL; 615 } 616 oclInvJ = (void *)f_invJ; 617 oclDetJ = (void *)f_detJ; 618 } break; 619 case PETSC_DOUBLE: { 620 PetscInt c, b, d; 621 622 PetscCall(PetscMalloc4(Ne * N_bt, &d_coeff, Ne, &d_coeffAux, Ne * dim * dim, &d_invJ, Ne, &d_detJ)); 623 for (c = 0; c < Ne; ++c) { 624 d_detJ[c] = (double)cgeom->detJ[c]; 625 for (d = 0; d < dim * dim; ++d) d_invJ[c * dim * dim + d] = (double)cgeom->invJ[c * dim * dim + d]; 626 for (b = 0; b < N_bt; ++b) d_coeff[c * N_bt + b] = (double)coefficients[c * N_bt + b]; 627 } 628 if (coefficientsAux) { /* Assume P0 */ 629 for (c = 0; c < Ne; ++c) d_coeffAux[c] = (double)coefficientsAux[c]; 630 } 631 oclCoeff = (void *)d_coeff; 632 if (coefficientsAux) { 633 oclCoeffAux = (void *)d_coeffAux; 634 } else { 635 oclCoeffAux = NULL; 636 } 637 oclInvJ = (void *)d_invJ; 638 oclDetJ = (void *)d_detJ; 639 } break; 640 default: 641 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unsupported PETSc type %d", ocl->realType); 642 } 643 } else { 644 PetscInt c, d; 645 646 PetscCall(PetscMalloc2(Ne * dim * dim, &r_invJ, Ne, &r_detJ)); 647 for (c = 0; c < Ne; ++c) { 648 r_detJ[c] = cgeom->detJ[c]; 649 for (d = 0; d < dim * dim; ++d) r_invJ[c * dim * dim + d] = cgeom->invJ[c * dim * dim + d]; 650 } 651 oclCoeff = (void *)coefficients; 652 oclCoeffAux = (void *)coefficientsAux; 653 oclInvJ = (void *)r_invJ; 654 oclDetJ = (void *)r_detJ; 655 } 656 o_coefficients = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne * N_bt * realSize, oclCoeff, &err); 657 if (coefficientsAux) { 658 o_coefficientsAux = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne * realSize, oclCoeffAux, &err); 659 } else { 660 o_coefficientsAux = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY, Ne * realSize, oclCoeffAux, &err); 661 } 662 o_jacobianInverses = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne * dim * dim * realSize, oclInvJ, &err); 663 o_jacobianDeterminants = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne * realSize, oclDetJ, &err); 664 o_elemVec = clCreateBuffer(ocl->ctx_id, CL_MEM_WRITE_ONLY, Ne * N_bt * realSize, NULL, &err); 665 /* Kernel launch */ 666 PetscCall(clSetKernelArg(ocl_kernel, 0, sizeof(cl_int), (void *)&N_cb)); 667 PetscCall(clSetKernelArg(ocl_kernel, 1, sizeof(cl_mem), (void *)&o_coefficients)); 668 PetscCall(clSetKernelArg(ocl_kernel, 2, sizeof(cl_mem), (void *)&o_coefficientsAux)); 669 PetscCall(clSetKernelArg(ocl_kernel, 3, sizeof(cl_mem), (void *)&o_jacobianInverses)); 670 PetscCall(clSetKernelArg(ocl_kernel, 4, sizeof(cl_mem), (void *)&o_jacobianDeterminants)); 671 PetscCall(clSetKernelArg(ocl_kernel, 5, sizeof(cl_mem), (void *)&o_elemVec)); 672 PetscCall(clEnqueueNDRangeKernel(ocl->queue_id, ocl_kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &ocl_ev)); 673 /* Read data back from device */ 674 if (sizeof(PetscReal) != realSize) { 675 switch (ocl->realType) { 676 case PETSC_FLOAT: { 677 float *elem; 678 PetscInt c, b; 679 680 PetscCall(PetscFree4(f_coeff, f_coeffAux, f_invJ, f_detJ)); 681 PetscCall(PetscMalloc1(Ne * N_bt, &elem)); 682 PetscCall(clEnqueueReadBuffer(ocl->queue_id, o_elemVec, CL_TRUE, 0, Ne * N_bt * realSize, elem, 0, NULL, NULL)); 683 for (c = 0; c < Ne; ++c) { 684 for (b = 0; b < N_bt; ++b) elemVec[c * N_bt + b] = (PetscScalar)elem[c * N_bt + b]; 685 } 686 PetscCall(PetscFree(elem)); 687 } break; 688 case PETSC_DOUBLE: { 689 double *elem; 690 PetscInt c, b; 691 692 PetscCall(PetscFree4(d_coeff, d_coeffAux, d_invJ, d_detJ)); 693 PetscCall(PetscMalloc1(Ne * N_bt, &elem)); 694 PetscCall(clEnqueueReadBuffer(ocl->queue_id, o_elemVec, CL_TRUE, 0, Ne * N_bt * realSize, elem, 0, NULL, NULL)); 695 for (c = 0; c < Ne; ++c) { 696 for (b = 0; b < N_bt; ++b) elemVec[c * N_bt + b] = (PetscScalar)elem[c * N_bt + b]; 697 } 698 PetscCall(PetscFree(elem)); 699 } break; 700 default: 701 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unsupported PETSc type %d", ocl->realType); 702 } 703 } else { 704 PetscCall(PetscFree2(r_invJ, r_detJ)); 705 PetscCall(clEnqueueReadBuffer(ocl->queue_id, o_elemVec, CL_TRUE, 0, Ne * N_bt * realSize, elemVec, 0, NULL, NULL)); 706 } 707 /* Log performance */ 708 PetscCall(clGetEventProfilingInfo(ocl_ev, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &ns_start, NULL)); 709 PetscCall(clGetEventProfilingInfo(ocl_ev, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &ns_end, NULL)); 710 f0Flops = 0; 711 switch (ocl->op) { 712 case LAPLACIAN: 713 f1Flops = useAux ? dim : 0; 714 break; 715 case ELASTICITY: 716 f1Flops = 2 * dim * dim; 717 break; 718 } 719 numFlops = Ne * (N_q * (N_b * N_comp * ((useField ? 2 : 0) + (useFieldDer ? 2 * dim * (dim + 1) : 0)) 720 /*+ 721 N_ba*N_compa*((useFieldAux ? 2 : 0) + (useFieldDerAux ? 2*dim*(dim + 1) : 0))*/ 722 + N_comp * ((useF0 ? f0Flops + 2 : 0) + (useF1 ? f1Flops + 2 * dim : 0))) + 723 N_b * ((useF0 ? 2 : 0) + (useF1 ? 2 * dim * (dim + 1) : 0))); 724 PetscCall(PetscFEOpenCLLogResidual(fem, (ns_end - ns_start) * 1.0e-9, numFlops)); 725 /* Cleanup */ 726 PetscCall(clReleaseMemObject(o_coefficients)); 727 PetscCall(clReleaseMemObject(o_coefficientsAux)); 728 PetscCall(clReleaseMemObject(o_jacobianInverses)); 729 PetscCall(clReleaseMemObject(o_jacobianDeterminants)); 730 PetscCall(clReleaseMemObject(o_elemVec)); 731 PetscCall(clReleaseKernel(ocl_kernel)); 732 PetscCall(clReleaseProgram(ocl_prog)); 733 PetscFunctionReturn(PETSC_SUCCESS); 734 } 735 736 PETSC_INTERN PetscErrorCode PetscFESetUp_Basic(PetscFE); 737 PETSC_INTERN PetscErrorCode PetscFEComputeTabulation_Basic(PetscFE, PetscInt, const PetscReal[], PetscInt, PetscTabulation); 738 739 static PetscErrorCode PetscFEInitialize_OpenCL(PetscFE fem) 740 { 741 PetscFunctionBegin; 742 fem->ops->setfromoptions = NULL; 743 fem->ops->setup = PetscFESetUp_Basic; 744 fem->ops->view = NULL; 745 fem->ops->destroy = PetscFEDestroy_OpenCL; 746 fem->ops->getdimension = PetscFEGetDimension_Basic; 747 fem->ops->computetabulation = PetscFEComputeTabulation_Basic; 748 fem->ops->integrateresidual = PetscFEIntegrateResidual_OpenCL; 749 fem->ops->integratebdresidual = NULL /* PetscFEIntegrateBdResidual_OpenCL */; 750 fem->ops->integratejacobianaction = NULL /* PetscFEIntegrateJacobianAction_OpenCL */; 751 fem->ops->integratejacobian = PetscFEIntegrateJacobian_Basic; 752 PetscFunctionReturn(PETSC_SUCCESS); 753 } 754 755 /*MC 756 PETSCFEOPENCL = "opencl" - A `PetscFEType` that integrates using a vectorized OpenCL implementation 757 758 Level: intermediate 759 760 .seealso: `PetscFEType`, `PetscFECreate()`, `PetscFESetType()` 761 M*/ 762 763 PETSC_EXTERN PetscErrorCode PetscFECreate_OpenCL(PetscFE fem) 764 { 765 PetscFE_OpenCL *ocl; 766 cl_uint num_platforms; 767 cl_platform_id platform_ids[42]; 768 cl_uint num_devices; 769 cl_device_id device_ids[42]; 770 cl_int err; 771 772 PetscFunctionBegin; 773 PetscValidHeaderSpecific(fem, PETSCFE_CLASSID, 1); 774 PetscCall(PetscNew(&ocl)); 775 fem->data = ocl; 776 777 /* Init Platform */ 778 PetscCall(clGetPlatformIDs(42, platform_ids, &num_platforms)); 779 PetscCheck(num_platforms, PetscObjectComm((PetscObject)fem), PETSC_ERR_SUP, "No OpenCL platform found."); 780 ocl->pf_id = platform_ids[0]; 781 /* Init Device */ 782 PetscCall(clGetDeviceIDs(ocl->pf_id, CL_DEVICE_TYPE_ALL, 42, device_ids, &num_devices)); 783 PetscCheck(num_devices, PetscObjectComm((PetscObject)fem), PETSC_ERR_SUP, "No OpenCL device found."); 784 ocl->dev_id = device_ids[0]; 785 /* Create context with one command queue */ 786 ocl->ctx_id = clCreateContext(0, 1, &ocl->dev_id, NULL, NULL, &err); 787 PetscCall(err); 788 ocl->queue_id = clCreateCommandQueue(ocl->ctx_id, ocl->dev_id, CL_QUEUE_PROFILING_ENABLE, &err); 789 PetscCall(err); 790 /* Types */ 791 ocl->realType = PETSC_FLOAT; 792 /* Register events */ 793 PetscCall(PetscLogEventRegister("OpenCL FEResidual", PETSCFE_CLASSID, &ocl->residualEvent)); 794 /* Equation handling */ 795 ocl->op = LAPLACIAN; 796 797 PetscCall(PetscFEInitialize_OpenCL(fem)); 798 PetscFunctionReturn(PETSC_SUCCESS); 799 } 800 801 /*@ 802 PetscFEOpenCLSetRealType - Set the scalar type for running on the OpenCL accelerator 803 804 Input Parameters: 805 + fem - The `PetscFE` 806 - realType - The scalar type 807 808 Level: developer 809 810 .seealso: `PetscFE`, `PetscFEOpenCLGetRealType()` 811 @*/ 812 PetscErrorCode PetscFEOpenCLSetRealType(PetscFE fem, PetscDataType realType) 813 { 814 PetscFE_OpenCL *ocl = (PetscFE_OpenCL *)fem->data; 815 816 PetscFunctionBegin; 817 PetscValidHeaderSpecific(fem, PETSCFE_CLASSID, 1); 818 ocl->realType = realType; 819 PetscFunctionReturn(PETSC_SUCCESS); 820 } 821 822 /*@ 823 PetscFEOpenCLGetRealType - Get the scalar type for running on the OpenCL accelerator 824 825 Input Parameter: 826 . fem - The `PetscFE` 827 828 Output Parameter: 829 . realType - The scalar type 830 831 Level: developer 832 833 .seealso: `PetscFE`, `PetscFEOpenCLSetRealType()` 834 @*/ 835 PetscErrorCode PetscFEOpenCLGetRealType(PetscFE fem, PetscDataType *realType) 836 { 837 PetscFE_OpenCL *ocl = (PetscFE_OpenCL *)fem->data; 838 839 PetscFunctionBegin; 840 PetscValidHeaderSpecific(fem, PETSCFE_CLASSID, 1); 841 PetscAssertPointer(realType, 2); 842 *realType = ocl->realType; 843 PetscFunctionReturn(PETSC_SUCCESS); 844 } 845 846 #endif /* PETSC_HAVE_OPENCL */ 847