1 #include <petsc/private/petscfeimpl.h> /*I "petscfe.h" I*/ 2 3 #if defined(PETSC_HAVE_OPENCL) 4 5 static PetscErrorCode PetscFEDestroy_OpenCL(PetscFE fem) 6 { 7 PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data; 8 9 PetscFunctionBegin; 10 PetscCall(clReleaseCommandQueue(ocl->queue_id)); 11 ocl->queue_id = 0; 12 PetscCall(clReleaseContext(ocl->ctx_id)); 13 ocl->ctx_id = 0; 14 PetscCall(PetscFree(ocl)); 15 PetscFunctionReturn(0); 16 } 17 18 #define PetscCallSTR(err) do {PetscCall(err); string_tail += count; PetscCheck(string_tail != end_of_buffer,PETSC_COMM_SELF, PETSC_ERR_PLIB,"Buffer overflow");} while (0) 19 enum {LAPLACIAN = 0, ELASTICITY = 1}; 20 21 /* NOTE: This is now broken for vector problems. Must redo loops to respect vector basis elements */ 22 /* dim Number of spatial dimensions: 2 */ 23 /* N_b Number of basis functions: generated */ 24 /* N_{bt} Number of total basis functions: N_b * N_{comp} */ 25 /* N_q Number of quadrature points: generated */ 26 /* N_{bs} Number of block cells LCM(N_b, N_q) */ 27 /* N_{bst} Number of block cell components LCM(N_{bt}, N_q) */ 28 /* N_{bl} Number of concurrent blocks generated */ 29 /* N_t Number of threads: N_{bl} * N_{bs} */ 30 /* N_{cbc} Number of concurrent basis cells: N_{bl} * N_q */ 31 /* N_{cqc} Number of concurrent quadrature cells: N_{bl} * N_b */ 32 /* N_{sbc} Number of serial basis cells: N_{bs} / N_q */ 33 /* N_{sqc} Number of serial quadrature cells: N_{bs} / N_b */ 34 /* N_{cb} Number of serial cell batches: input */ 35 /* N_c Number of total cells: N_{cb}*N_{t}/N_{comp} */ 36 static PetscErrorCode PetscFEOpenCLGenerateIntegrationCode(PetscFE fem, char **string_buffer, PetscInt buffer_length, PetscBool useAux, PetscInt N_bl) 37 { 38 PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data; 39 PetscQuadrature q; 40 char *string_tail = *string_buffer; 41 char *end_of_buffer = *string_buffer + buffer_length; 42 char float_str[] = "float", double_str[] = "double"; 43 char *numeric_str = &(float_str[0]); 44 PetscInt op = ocl->op; 45 PetscBool useField = PETSC_FALSE; 46 PetscBool useFieldDer = PETSC_TRUE; 47 PetscBool useFieldAux = useAux; 48 PetscBool useFieldDerAux= PETSC_FALSE; 49 PetscBool useF0 = PETSC_TRUE; 50 PetscBool useF1 = PETSC_TRUE; 51 const PetscReal *points, *weights; 52 PetscTabulation T; 53 PetscInt dim, qNc, N_b, N_c, N_q, N_t, p, d, b, c; 54 size_t count; 55 56 PetscFunctionBegin; 57 PetscCall(PetscFEGetSpatialDimension(fem, &dim)); 58 PetscCall(PetscFEGetDimension(fem, &N_b)); 59 PetscCall(PetscFEGetNumComponents(fem, &N_c)); 60 PetscCall(PetscFEGetQuadrature(fem, &q)); 61 PetscCall(PetscQuadratureGetData(q, NULL, &qNc, &N_q, &points, &weights)); 62 PetscCheck(qNc == 1,PETSC_COMM_SELF, PETSC_ERR_SUP, "Only supports scalar quadrature, not %" PetscInt_FMT " components", qNc); 63 N_t = N_b * N_c * N_q * N_bl; 64 /* Enable device extension for double precision */ 65 if (ocl->realType == PETSC_DOUBLE) { 66 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 67 "#if defined(cl_khr_fp64)\n" 68 "# pragma OPENCL EXTENSION cl_khr_fp64: enable\n" 69 "#elif defined(cl_amd_fp64)\n" 70 "# pragma OPENCL EXTENSION cl_amd_fp64: enable\n" 71 "#endif\n", 72 &count)); 73 numeric_str = &(double_str[0]); 74 } 75 /* Kernel API */ 76 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 77 "\n" 78 "__kernel void integrateElementQuadrature(int N_cb, __global %s *coefficients, __global %s *coefficientsAux, __global %s *jacobianInverses, __global %s *jacobianDeterminants, __global %s *elemVec)\n" 79 "{\n", 80 &count, numeric_str, numeric_str, numeric_str, numeric_str, numeric_str)); 81 /* Quadrature */ 82 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 83 " /* Quadrature points\n" 84 " - (x1,y1,x2,y2,...) */\n" 85 " const %s points[%d] = {\n", 86 &count, numeric_str, N_q*dim)); 87 for (p = 0; p < N_q; ++p) { 88 for (d = 0; d < dim; ++d) { 89 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g,\n", &count, points[p*dim+d])); 90 } 91 } 92 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count)); 93 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 94 " /* Quadrature weights\n" 95 " - (v1,v2,...) */\n" 96 " const %s weights[%d] = {\n", 97 &count, numeric_str, N_q)); 98 for (p = 0; p < N_q; ++p) { 99 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g,\n", &count, weights[p])); 100 } 101 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count)); 102 /* Basis Functions */ 103 PetscCall(PetscFEGetCellTabulation(fem, 1, &T)); 104 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 105 " /* Nodal basis function evaluations\n" 106 " - basis component is fastest varying, the basis function, then point */\n" 107 " const %s Basis[%d] = {\n", 108 &count, numeric_str, N_q*N_b*N_c)); 109 for (p = 0; p < N_q; ++p) { 110 for (b = 0; b < N_b; ++b) { 111 for (c = 0; c < N_c; ++c) { 112 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g,\n", &count, T->T[0][(p*N_b + b)*N_c + c])); 113 } 114 } 115 } 116 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count)); 117 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 118 "\n" 119 " /* Nodal basis function derivative evaluations,\n" 120 " - derivative direction is fastest varying, then basis component, then basis function, then point */\n" 121 " const %s%d BasisDerivatives[%d] = {\n", 122 &count, numeric_str, dim, N_q*N_b*N_c)); 123 for (p = 0; p < N_q; ++p) { 124 for (b = 0; b < N_b; ++b) { 125 for (c = 0; c < N_c; ++c) { 126 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "(%s%d)(", &count, numeric_str, dim)); 127 for (d = 0; d < dim; ++d) { 128 if (d > 0) { 129 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, ", %g", &count, T->T[1][((p*N_b + b)*dim + d)*N_c + c])); 130 } else { 131 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g", &count, T->T[1][((p*N_b + b)*dim + d)*N_c + c])); 132 } 133 } 134 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "),\n", &count)); 135 } 136 } 137 } 138 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count)); 139 /* Sizes */ 140 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 141 " const int dim = %d; // The spatial dimension\n" 142 " const int N_bl = %d; // The number of concurrent blocks\n" 143 " const int N_b = %d; // The number of basis functions\n" 144 " const int N_comp = %d; // The number of basis function components\n" 145 " const int N_bt = N_b*N_comp; // The total number of scalar basis functions\n" 146 " const int N_q = %d; // The number of quadrature points\n" 147 " const int N_bst = N_bt*N_q; // The block size, LCM(N_b*N_comp, N_q), Notice that a block is not processed simultaneously\n" 148 " const int N_t = N_bst*N_bl; // The number of threads, N_bst * N_bl\n" 149 " const int N_bc = N_t/N_comp; // The number of cells per batch (N_b*N_q*N_bl)\n" 150 " const int N_sbc = N_bst / (N_q * N_comp);\n" 151 " const int N_sqc = N_bst / N_bt;\n" 152 " /*const int N_c = N_cb * N_bc;*/\n" 153 "\n" 154 " /* Calculated indices */\n" 155 " /*const int tidx = get_local_id(0) + get_local_size(0)*get_local_id(1);*/\n" 156 " const int tidx = get_local_id(0);\n" 157 " const int blidx = tidx / N_bst; // Block number for this thread\n" 158 " const int bidx = tidx %% N_bt; // Basis function mapped to this thread\n" 159 " const int cidx = tidx %% N_comp; // Basis component mapped to this thread\n" 160 " const int qidx = tidx %% N_q; // Quadrature point mapped to this thread\n" 161 " const int blbidx = tidx %% N_q + blidx*N_q; // Cell mapped to this thread in the basis phase\n" 162 " const int blqidx = tidx %% N_b + blidx*N_b; // Cell mapped to this thread in the quadrature phase\n" 163 " const int gidx = get_group_id(1)*get_num_groups(0) + get_group_id(0);\n" 164 " const int Goffset = gidx*N_cb*N_bc;\n", 165 &count, dim, N_bl, N_b, N_c, N_q)); 166 /* Local memory */ 167 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 168 "\n" 169 " /* Quadrature data */\n" 170 " %s w; // $w_q$, Quadrature weight at $x_q$\n" 171 " __local %s phi_i[%d]; //[N_bt*N_q]; // $\\phi_i(x_q)$, Value of the basis function $i$ at $x_q$\n" 172 " __local %s%d phiDer_i[%d]; //[N_bt*N_q]; // $\\frac{\\partial\\phi_i(x_q)}{\\partial x_d}$, Value of the derivative of basis function $i$ in direction $x_d$ at $x_q$\n" 173 " /* Geometric data */\n" 174 " __local %s detJ[%d]; //[N_t]; // $|J(x_q)|$, Jacobian determinant at $x_q$\n" 175 " __local %s invJ[%d];//[N_t*dim*dim]; // $J^{-1}(x_q)$, Jacobian inverse at $x_q$\n", 176 &count, numeric_str, numeric_str, N_b*N_c*N_q, numeric_str, dim, N_b*N_c*N_q, numeric_str, N_t, 177 numeric_str, N_t*dim*dim, numeric_str, N_t*N_b*N_c)); 178 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 179 " /* FEM data */\n" 180 " __local %s u_i[%d]; //[N_t*N_bt]; // Coefficients $u_i$ of the field $u|_{\\mathcal{T}} = \\sum_i u_i \\phi_i$\n", 181 &count, numeric_str, N_t*N_b*N_c)); 182 if (useAux) { 183 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 184 " __local %s a_i[%d]; //[N_t]; // Coefficients $a_i$ of the auxiliary field $a|_{\\mathcal{T}} = \\sum_i a_i \\phi^R_i$\n", 185 &count, numeric_str, N_t)); 186 } 187 if (useF0) { 188 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 189 " /* Intermediate calculations */\n" 190 " __local %s f_0[%d]; //[N_t*N_sqc]; // $f_0(u(x_q), \\nabla u(x_q)) |J(x_q)| w_q$\n", 191 &count, numeric_str, N_t*N_q)); 192 } 193 if (useF1) { 194 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 195 " __local %s%d f_1[%d]; //[N_t*N_sqc]; // $f_1(u(x_q), \\nabla u(x_q)) |J(x_q)| w_q$\n", 196 &count, numeric_str, dim, N_t*N_q)); 197 } 198 /* TODO: If using elasticity, put in mu/lambda coefficients */ 199 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 200 " /* Output data */\n" 201 " %s e_i; // Coefficient $e_i$ of the residual\n\n", 202 &count, numeric_str)); 203 /* One-time loads */ 204 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 205 " /* These should be generated inline */\n" 206 " /* Load quadrature weights */\n" 207 " w = weights[qidx];\n" 208 " /* Load basis tabulation \\phi_i for this cell */\n" 209 " if (tidx < N_bt*N_q) {\n" 210 " phi_i[tidx] = Basis[tidx];\n" 211 " phiDer_i[tidx] = BasisDerivatives[tidx];\n" 212 " }\n\n", 213 &count)); 214 /* Batch loads */ 215 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 216 " for (int batch = 0; batch < N_cb; ++batch) {\n" 217 " /* Load geometry */\n" 218 " detJ[tidx] = jacobianDeterminants[Goffset+batch*N_bc+tidx];\n" 219 " for (int n = 0; n < dim*dim; ++n) {\n" 220 " const int offset = n*N_t;\n" 221 " invJ[offset+tidx] = jacobianInverses[(Goffset+batch*N_bc)*dim*dim+offset+tidx];\n" 222 " }\n" 223 " /* Load coefficients u_i for this cell */\n" 224 " for (int n = 0; n < N_bt; ++n) {\n" 225 " const int offset = n*N_t;\n" 226 " u_i[offset+tidx] = coefficients[(Goffset*N_bt)+batch*N_t*N_b+offset+tidx];\n" 227 " }\n", 228 &count)); 229 if (useAux) { 230 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 231 " /* Load coefficients a_i for this cell */\n" 232 " /* TODO: This should not be N_t here, it should be N_bc*N_comp_aux */\n" 233 " a_i[tidx] = coefficientsAux[Goffset+batch*N_t+tidx];\n", 234 &count)); 235 } 236 /* Quadrature phase */ 237 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 238 " barrier(CLK_LOCAL_MEM_FENCE);\n" 239 "\n" 240 " /* Map coefficients to values at quadrature points */\n" 241 " for (int c = 0; c < N_sqc; ++c) {\n" 242 " const int cell = c*N_bl*N_b + blqidx;\n" 243 " const int fidx = (cell*N_q + qidx)*N_comp + cidx;\n", 244 &count)); 245 if (useField) { 246 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 247 " %s u[%d]; //[N_comp]; // $u(x_q)$, Value of the field at $x_q$\n", 248 &count, numeric_str, N_c)); 249 } 250 if (useFieldDer) { 251 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 252 " %s%d gradU[%d]; //[N_comp]; // $\\nabla u(x_q)$, Value of the field gradient at $x_q$\n", 253 &count, numeric_str, dim, N_c)); 254 } 255 if (useFieldAux) { 256 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 257 " %s a[%d]; //[1]; // $a(x_q)$, Value of the auxiliary fields at $x_q$\n", 258 &count, numeric_str, 1)); 259 } 260 if (useFieldDerAux) { 261 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 262 " %s%d gradA[%d]; //[1]; // $\\nabla a(x_q)$, Value of the auxiliary field gradient at $x_q$\n", 263 &count, numeric_str, dim, 1)); 264 } 265 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 266 "\n" 267 " for (int comp = 0; comp < N_comp; ++comp) {\n", 268 &count)); 269 if (useField) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " u[comp] = 0.0;\n", &count)); 270 if (useFieldDer) { 271 switch (dim) { 272 case 1: 273 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradU[comp].x = 0.0;\n", &count));break; 274 case 2: 275 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradU[comp].x = 0.0; gradU[comp].y = 0.0;\n", &count));break; 276 case 3: 277 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradU[comp].x = 0.0; gradU[comp].y = 0.0; gradU[comp].z = 0.0;\n", &count));break; 278 } 279 } 280 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 281 " }\n", 282 &count)); 283 if (useFieldAux) { 284 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " a[0] = 0.0;\n", &count)); 285 } 286 if (useFieldDerAux) { 287 switch (dim) { 288 case 1: 289 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradA[0].x = 0.0;\n", &count));break; 290 case 2: 291 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradA[0].x = 0.0; gradA[0].y = 0.0;\n", &count));break; 292 case 3: 293 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradA[0].x = 0.0; gradA[0].y = 0.0; gradA[0].z = 0.0;\n", &count));break; 294 } 295 } 296 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 297 " /* Get field and derivatives at this quadrature point */\n" 298 " for (int i = 0; i < N_b; ++i) {\n" 299 " for (int comp = 0; comp < N_comp; ++comp) {\n" 300 " const int b = i*N_comp+comp;\n" 301 " const int pidx = qidx*N_bt + b;\n" 302 " const int uidx = cell*N_bt + b;\n" 303 " %s%d realSpaceDer;\n\n", 304 &count, numeric_str, dim)); 305 if (useField) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail," u[comp] += u_i[uidx]*phi_i[pidx];\n", &count)); 306 if (useFieldDer) { 307 switch (dim) { 308 case 2: 309 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 310 " realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y;\n" 311 " gradU[comp].x += u_i[uidx]*realSpaceDer.x;\n" 312 " realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y;\n" 313 " gradU[comp].y += u_i[uidx]*realSpaceDer.y;\n", 314 &count));break; 315 case 3: 316 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 317 " realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+0]*phiDer_i[pidx].z;\n" 318 " gradU[comp].x += u_i[uidx]*realSpaceDer.x;\n" 319 " realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+1]*phiDer_i[pidx].z;\n" 320 " gradU[comp].y += u_i[uidx]*realSpaceDer.y;\n" 321 " realSpaceDer.z = invJ[cell*dim*dim+0*dim+2]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+2]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+2]*phiDer_i[pidx].z;\n" 322 " gradU[comp].z += u_i[uidx]*realSpaceDer.z;\n", 323 &count));break; 324 } 325 } 326 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 327 " }\n" 328 " }\n", 329 &count)); 330 if (useFieldAux) { 331 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail," a[0] += a_i[cell];\n", &count)); 332 } 333 /* Calculate residual at quadrature points: Should be generated by an weak form egine */ 334 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 335 " /* Process values at quadrature points */\n", 336 &count)); 337 switch (op) { 338 case LAPLACIAN: 339 if (useF0) {PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_0[fidx] = 4.0;\n", &count));} 340 if (useF1) { 341 if (useAux) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_1[fidx] = a[0]*gradU[cidx];\n", &count)); 342 else PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_1[fidx] = gradU[cidx];\n", &count)); 343 } 344 break; 345 case ELASTICITY: 346 if (useF0) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_0[fidx] = 4.0;\n", &count)); 347 if (useF1) { 348 switch (dim) { 349 case 2: 350 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 351 " switch (cidx) {\n" 352 " case 0:\n" 353 " f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[0].x + gradU[0].x);\n" 354 " f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[0].y + gradU[1].x);\n" 355 " break;\n" 356 " case 1:\n" 357 " f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[1].x + gradU[0].y);\n" 358 " f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[1].y + gradU[1].y);\n" 359 " }\n", 360 &count));break; 361 case 3: 362 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 363 " switch (cidx) {\n" 364 " case 0:\n" 365 " f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[0].x + gradU[0].x);\n" 366 " f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[0].y + gradU[1].x);\n" 367 " f_1[fidx].z = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[0].z + gradU[2].x);\n" 368 " break;\n" 369 " case 1:\n" 370 " f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[1].x + gradU[0].y);\n" 371 " f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[1].y + gradU[1].y);\n" 372 " f_1[fidx].z = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[1].y + gradU[2].y);\n" 373 " break;\n" 374 " case 2:\n" 375 " f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[2].x + gradU[0].z);\n" 376 " f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[2].y + gradU[1].z);\n" 377 " f_1[fidx].z = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[2].y + gradU[2].z);\n" 378 " }\n", 379 &count));break; 380 }} 381 break; 382 default: 383 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "PDE operator %d is not supported", op); 384 } 385 if (useF0) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail," f_0[fidx] *= detJ[cell]*w;\n", &count)); 386 if (useF1) { 387 switch (dim) { 388 case 1: 389 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail," f_1[fidx].x *= detJ[cell]*w;\n", &count));break; 390 case 2: 391 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail," f_1[fidx].x *= detJ[cell]*w; f_1[fidx].y *= detJ[cell]*w;\n", &count));break; 392 case 3: 393 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail," f_1[fidx].x *= detJ[cell]*w; f_1[fidx].y *= detJ[cell]*w; f_1[fidx].z *= detJ[cell]*w;\n", &count));break; 394 } 395 } 396 /* Thread transpose */ 397 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 398 " }\n\n" 399 " /* ==== TRANSPOSE THREADS ==== */\n" 400 " barrier(CLK_LOCAL_MEM_FENCE);\n\n", 401 &count)); 402 /* Basis phase */ 403 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 404 " /* Map values at quadrature points to coefficients */\n" 405 " for (int c = 0; c < N_sbc; ++c) {\n" 406 " const int cell = c*N_bl*N_q + blbidx; /* Cell number in batch */\n" 407 "\n" 408 " e_i = 0.0;\n" 409 " for (int q = 0; q < N_q; ++q) {\n" 410 " const int pidx = q*N_bt + bidx;\n" 411 " const int fidx = (cell*N_q + q)*N_comp + cidx;\n" 412 " %s%d realSpaceDer;\n\n", 413 &count, numeric_str, dim)); 414 415 if (useF0) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail," e_i += phi_i[pidx]*f_0[fidx];\n", &count)); 416 if (useF1) { 417 switch (dim) { 418 case 2: 419 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 420 " realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y;\n" 421 " e_i += realSpaceDer.x*f_1[fidx].x;\n" 422 " realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y;\n" 423 " e_i += realSpaceDer.y*f_1[fidx].y;\n", 424 &count));break; 425 case 3: 426 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 427 " realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+0]*phiDer_i[pidx].z;\n" 428 " e_i += realSpaceDer.x*f_1[fidx].x;\n" 429 " realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+1]*phiDer_i[pidx].z;\n" 430 " e_i += realSpaceDer.y*f_1[fidx].y;\n" 431 " realSpaceDer.z = invJ[cell*dim*dim+0*dim+2]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+2]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+2]*phiDer_i[pidx].z;\n" 432 " e_i += realSpaceDer.z*f_1[fidx].z;\n", 433 &count));break; 434 } 435 } 436 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 437 " }\n" 438 " /* Write element vector for N_{cbc} cells at a time */\n" 439 " elemVec[(Goffset + batch*N_bc + c*N_bl*N_q)*N_bt + tidx] = e_i;\n" 440 " }\n" 441 " /* ==== Could do one write per batch ==== */\n" 442 " }\n" 443 " return;\n" 444 "}\n", 445 &count)); 446 PetscFunctionReturn(0); 447 } 448 449 static PetscErrorCode PetscFEOpenCLGetIntegrationKernel(PetscFE fem, PetscBool useAux, cl_program *ocl_prog, cl_kernel *ocl_kernel) 450 { 451 PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data; 452 PetscInt dim, N_bl; 453 PetscBool flg; 454 char *buffer; 455 size_t len; 456 char errMsg[8192]; 457 cl_int err; 458 PetscErrorCode ierr; 459 460 PetscFunctionBegin; 461 PetscCall(PetscFEGetSpatialDimension(fem, &dim)); 462 PetscCall(PetscMalloc1(8192, &buffer)); 463 PetscCall(PetscFEGetTileSizes(fem, NULL, &N_bl, NULL, NULL)); 464 PetscCall(PetscFEOpenCLGenerateIntegrationCode(fem, &buffer, 8192, useAux, N_bl)); 465 PetscCall(PetscOptionsHasName(((PetscObject)fem)->options,((PetscObject)fem)->prefix, "-petscfe_opencl_kernel_print", &flg)); 466 if (flg) PetscCall(PetscPrintf(PetscObjectComm((PetscObject) fem), "OpenCL FE Integration Kernel:\n%s\n", buffer)); 467 PetscCall(PetscStrlen(buffer,&len)); 468 *ocl_prog = clCreateProgramWithSource(ocl->ctx_id, 1, (const char **) &buffer, &len, &err);PetscCall(err); 469 err = clBuildProgram(*ocl_prog, 0, NULL, NULL, NULL, NULL); 470 if (err != CL_SUCCESS) { 471 err = clGetProgramBuildInfo(*ocl_prog, ocl->dev_id, CL_PROGRAM_BUILD_LOG, 8192*sizeof(char), &errMsg, NULL); 472 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Build failed! Log:\n %s", errMsg); 473 } 474 PetscCall(PetscFree(buffer)); 475 *ocl_kernel = clCreateKernel(*ocl_prog, "integrateElementQuadrature", &ierr); 476 PetscFunctionReturn(0); 477 } 478 479 static PetscErrorCode PetscFEOpenCLCalculateGrid(PetscFE fem, PetscInt N, PetscInt blockSize, size_t *x, size_t *y, size_t *z) 480 { 481 const PetscInt Nblocks = N/blockSize; 482 483 PetscFunctionBegin; 484 PetscCheck(!(N % blockSize),PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid block size %d for %d elements", blockSize, N); 485 *z = 1; 486 *y = 1; 487 for (*x = (size_t) (PetscSqrtReal(Nblocks) + 0.5); *x > 0; --*x) { 488 *y = Nblocks / *x; 489 if (*x * *y == (size_t)Nblocks) break; 490 } 491 PetscCheck(*x * *y == (size_t)Nblocks,PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Could not find partition for %D with block size %D", N, blockSize); 492 PetscFunctionReturn(0); 493 } 494 495 static PetscErrorCode PetscFEOpenCLLogResidual(PetscFE fem, PetscLogDouble time, PetscLogDouble flops) 496 { 497 PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data; 498 PetscStageLog stageLog; 499 PetscEventPerfLog eventLog = NULL; 500 int stage; 501 502 PetscFunctionBegin; 503 PetscCall(PetscLogGetStageLog(&stageLog)); 504 PetscCall(PetscStageLogGetCurrent(stageLog, &stage)); 505 PetscCall(PetscStageLogGetEventPerfLog(stageLog, stage, &eventLog)); 506 /* Log performance info */ 507 eventLog->eventInfo[ocl->residualEvent].count++; 508 eventLog->eventInfo[ocl->residualEvent].time += time; 509 eventLog->eventInfo[ocl->residualEvent].flops += flops; 510 PetscFunctionReturn(0); 511 } 512 513 static PetscErrorCode PetscFEIntegrateResidual_OpenCL(PetscDS prob, PetscFormKey key, PetscInt Ne, PetscFEGeom *cgeom, 514 const PetscScalar coefficients[], const PetscScalar coefficients_t[], PetscDS probAux, const PetscScalar coefficientsAux[], PetscReal t, PetscScalar elemVec[]) 515 { 516 /* Nbc = batchSize */ 517 PetscFE fem; 518 PetscFE_OpenCL *ocl; 519 PetscPointFunc f0_func; 520 PetscPointFunc f1_func; 521 PetscQuadrature q; 522 PetscInt dim, qNc; 523 PetscInt N_b; /* The number of basis functions */ 524 PetscInt N_comp; /* The number of basis function components */ 525 PetscInt N_bt; /* The total number of scalar basis functions */ 526 PetscInt N_q; /* The number of quadrature points */ 527 PetscInt N_bst; /* The block size, LCM(N_bt, N_q), Notice that a block is not process simultaneously */ 528 PetscInt N_t; /* The number of threads, N_bst * N_bl */ 529 PetscInt N_bl; /* The number of blocks */ 530 PetscInt N_bc; /* The batch size, N_bl*N_q*N_b */ 531 PetscInt N_cb; /* The number of batches */ 532 const PetscInt field = key.field; 533 PetscInt numFlops, f0Flops = 0, f1Flops = 0; 534 PetscBool useAux = probAux ? PETSC_TRUE : PETSC_FALSE; 535 PetscBool useField = PETSC_FALSE; 536 PetscBool useFieldDer = PETSC_TRUE; 537 PetscBool useF0 = PETSC_TRUE; 538 PetscBool useF1 = PETSC_TRUE; 539 /* OpenCL variables */ 540 cl_program ocl_prog; 541 cl_kernel ocl_kernel; 542 cl_event ocl_ev; /* The event for tracking kernel execution */ 543 cl_ulong ns_start; /* Nanoseconds counter on GPU at kernel start */ 544 cl_ulong ns_end; /* Nanoseconds counter on GPU at kernel stop */ 545 cl_mem o_jacobianInverses, o_jacobianDeterminants; 546 cl_mem o_coefficients, o_coefficientsAux, o_elemVec; 547 float *f_coeff = NULL, *f_coeffAux = NULL, *f_invJ = NULL, *f_detJ = NULL; 548 double *d_coeff = NULL, *d_coeffAux = NULL, *d_invJ = NULL, *d_detJ = NULL; 549 PetscReal *r_invJ = NULL, *r_detJ = NULL; 550 void *oclCoeff, *oclCoeffAux, *oclInvJ, *oclDetJ; 551 size_t local_work_size[3], global_work_size[3]; 552 size_t realSize, x, y, z; 553 const PetscReal *points, *weights; 554 PetscErrorCode ierr; 555 556 PetscFunctionBegin; 557 PetscCall(PetscDSGetDiscretization(prob, field, (PetscObject *) &fem)); 558 ocl = (PetscFE_OpenCL *) fem->data; 559 if (!Ne) {PetscCall(PetscFEOpenCLLogResidual(fem, 0.0, 0.0)); PetscFunctionReturn(0);} 560 PetscCall(PetscFEGetSpatialDimension(fem, &dim)); 561 PetscCall(PetscFEGetQuadrature(fem, &q)); 562 PetscCall(PetscQuadratureGetData(q, NULL, &qNc, &N_q, &points, &weights)); 563 PetscCheck(qNc == 1,PETSC_COMM_SELF, PETSC_ERR_SUP, "Only supports scalar quadrature, not %D components", qNc); 564 PetscCall(PetscFEGetDimension(fem, &N_b)); 565 PetscCall(PetscFEGetNumComponents(fem, &N_comp)); 566 PetscCall(PetscDSGetResidual(prob, field, &f0_func, &f1_func)); 567 PetscCall(PetscFEGetTileSizes(fem, NULL, &N_bl, &N_bc, &N_cb)); 568 N_bt = N_b*N_comp; 569 N_bst = N_bt*N_q; 570 N_t = N_bst*N_bl; 571 PetscCheck(N_bc*N_comp == N_t,PETSC_COMM_SELF, PETSC_ERR_PLIB, "Number of threads %d should be %d * %d", N_t, N_bc, N_comp); 572 /* Calculate layout */ 573 if (Ne % (N_cb*N_bc)) { /* Remainder cells */ 574 PetscCall(PetscFEIntegrateResidual_Basic(prob, key, Ne, cgeom, coefficients, coefficients_t, probAux, coefficientsAux, t, elemVec)); 575 PetscFunctionReturn(0); 576 } 577 PetscCall(PetscFEOpenCLCalculateGrid(fem, Ne, N_cb*N_bc, &x, &y, &z)); 578 local_work_size[0] = N_bc*N_comp; 579 local_work_size[1] = 1; 580 local_work_size[2] = 1; 581 global_work_size[0] = x * local_work_size[0]; 582 global_work_size[1] = y * local_work_size[1]; 583 global_work_size[2] = z * local_work_size[2]; 584 PetscCall(PetscInfo(fem, "GPU layout grid(%d,%d,%d) block(%d,%d,%d) with %d batches\n", x, y, z, local_work_size[0], local_work_size[1], local_work_size[2], N_cb)); 585 PetscCall(PetscInfo(fem, " N_t: %d, N_cb: %d\n", N_t, N_cb)); 586 /* Generate code */ 587 if (probAux) { 588 PetscSpace P; 589 PetscInt NfAux, order, f; 590 591 PetscCall(PetscDSGetNumFields(probAux, &NfAux)); 592 for (f = 0; f < NfAux; ++f) { 593 PetscFE feAux; 594 595 PetscCall(PetscDSGetDiscretization(probAux, f, (PetscObject *) &feAux)); 596 PetscCall(PetscFEGetBasisSpace(feAux, &P)); 597 PetscCall(PetscSpaceGetDegree(P, &order, NULL)); 598 PetscCheck(order <= 0,PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Can only handle P0 coefficient fields"); 599 } 600 } 601 PetscCall(PetscFEOpenCLGetIntegrationKernel(fem, useAux, &ocl_prog, &ocl_kernel)); 602 /* Create buffers on the device and send data over */ 603 PetscCall(PetscDataTypeGetSize(ocl->realType, &realSize)); 604 PetscCheck(cgeom->numPoints <= 1,PETSC_COMM_SELF, PETSC_ERR_SUP, "Only support affine geometry for OpenCL integration right now"); 605 if (sizeof(PetscReal) != realSize) { 606 switch (ocl->realType) { 607 case PETSC_FLOAT: 608 { 609 PetscInt c, b, d; 610 611 PetscCall(PetscMalloc4(Ne*N_bt,&f_coeff,Ne,&f_coeffAux,Ne*dim*dim,&f_invJ,Ne,&f_detJ)); 612 for (c = 0; c < Ne; ++c) { 613 f_detJ[c] = (float) cgeom->detJ[c]; 614 for (d = 0; d < dim*dim; ++d) { 615 f_invJ[c*dim*dim+d] = (float) cgeom->invJ[c * dim * dim + d]; 616 } 617 for (b = 0; b < N_bt; ++b) { 618 f_coeff[c*N_bt+b] = (float) coefficients[c*N_bt+b]; 619 } 620 } 621 if (coefficientsAux) { /* Assume P0 */ 622 for (c = 0; c < Ne; ++c) { 623 f_coeffAux[c] = (float) coefficientsAux[c]; 624 } 625 } 626 oclCoeff = (void *) f_coeff; 627 if (coefficientsAux) { 628 oclCoeffAux = (void *) f_coeffAux; 629 } else { 630 oclCoeffAux = NULL; 631 } 632 oclInvJ = (void *) f_invJ; 633 oclDetJ = (void *) f_detJ; 634 } 635 break; 636 case PETSC_DOUBLE: 637 { 638 PetscInt c, b, d; 639 640 PetscCall(PetscMalloc4(Ne*N_bt,&d_coeff,Ne,&d_coeffAux,Ne*dim*dim,&d_invJ,Ne,&d_detJ)); 641 for (c = 0; c < Ne; ++c) { 642 d_detJ[c] = (double) cgeom->detJ[c]; 643 for (d = 0; d < dim*dim; ++d) { 644 d_invJ[c*dim*dim+d] = (double) cgeom->invJ[c * dim * dim + d]; 645 } 646 for (b = 0; b < N_bt; ++b) { 647 d_coeff[c*N_bt+b] = (double) coefficients[c*N_bt+b]; 648 } 649 } 650 if (coefficientsAux) { /* Assume P0 */ 651 for (c = 0; c < Ne; ++c) { 652 d_coeffAux[c] = (double) coefficientsAux[c]; 653 } 654 } 655 oclCoeff = (void *) d_coeff; 656 if (coefficientsAux) { 657 oclCoeffAux = (void *) d_coeffAux; 658 } else { 659 oclCoeffAux = NULL; 660 } 661 oclInvJ = (void *) d_invJ; 662 oclDetJ = (void *) d_detJ; 663 } 664 break; 665 default: 666 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unsupported PETSc type %d", ocl->realType); 667 } 668 } else { 669 PetscInt c, d; 670 671 PetscCall(PetscMalloc2(Ne*dim*dim,&r_invJ,Ne,&r_detJ)); 672 for (c = 0; c < Ne; ++c) { 673 r_detJ[c] = cgeom->detJ[c]; 674 for (d = 0; d < dim*dim; ++d) { 675 r_invJ[c*dim*dim+d] = cgeom->invJ[c * dim * dim + d]; 676 } 677 } 678 oclCoeff = (void *) coefficients; 679 oclCoeffAux = (void *) coefficientsAux; 680 oclInvJ = (void *) r_invJ; 681 oclDetJ = (void *) r_detJ; 682 } 683 o_coefficients = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne*N_bt * realSize, oclCoeff, &ierr); 684 if (coefficientsAux) { 685 o_coefficientsAux = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne * realSize, oclCoeffAux, &ierr); 686 } else { 687 o_coefficientsAux = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY, Ne * realSize, oclCoeffAux, &ierr); 688 } 689 o_jacobianInverses = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne*dim*dim * realSize, oclInvJ, &ierr); 690 o_jacobianDeterminants = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne * realSize, oclDetJ, &ierr); 691 o_elemVec = clCreateBuffer(ocl->ctx_id, CL_MEM_WRITE_ONLY, Ne*N_bt * realSize, NULL, &ierr); 692 /* Kernel launch */ 693 PetscCall(clSetKernelArg(ocl_kernel, 0, sizeof(cl_int), (void*) &N_cb)); 694 PetscCall(clSetKernelArg(ocl_kernel, 1, sizeof(cl_mem), (void*) &o_coefficients)); 695 PetscCall(clSetKernelArg(ocl_kernel, 2, sizeof(cl_mem), (void*) &o_coefficientsAux)); 696 PetscCall(clSetKernelArg(ocl_kernel, 3, sizeof(cl_mem), (void*) &o_jacobianInverses)); 697 PetscCall(clSetKernelArg(ocl_kernel, 4, sizeof(cl_mem), (void*) &o_jacobianDeterminants)); 698 PetscCall(clSetKernelArg(ocl_kernel, 5, sizeof(cl_mem), (void*) &o_elemVec)); 699 PetscCall(clEnqueueNDRangeKernel(ocl->queue_id, ocl_kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &ocl_ev)); 700 /* Read data back from device */ 701 if (sizeof(PetscReal) != realSize) { 702 switch (ocl->realType) { 703 case PETSC_FLOAT: 704 { 705 float *elem; 706 PetscInt c, b; 707 708 PetscCall(PetscFree4(f_coeff,f_coeffAux,f_invJ,f_detJ)); 709 PetscCall(PetscMalloc1(Ne*N_bt, &elem)); 710 PetscCall(clEnqueueReadBuffer(ocl->queue_id, o_elemVec, CL_TRUE, 0, Ne*N_bt * realSize, elem, 0, NULL, NULL)); 711 for (c = 0; c < Ne; ++c) { 712 for (b = 0; b < N_bt; ++b) { 713 elemVec[c*N_bt+b] = (PetscScalar) elem[c*N_bt+b]; 714 } 715 } 716 PetscCall(PetscFree(elem)); 717 } 718 break; 719 case PETSC_DOUBLE: 720 { 721 double *elem; 722 PetscInt c, b; 723 724 PetscCall(PetscFree4(d_coeff,d_coeffAux,d_invJ,d_detJ)); 725 PetscCall(PetscMalloc1(Ne*N_bt, &elem)); 726 PetscCall(clEnqueueReadBuffer(ocl->queue_id, o_elemVec, CL_TRUE, 0, Ne*N_bt * realSize, elem, 0, NULL, NULL)); 727 for (c = 0; c < Ne; ++c) { 728 for (b = 0; b < N_bt; ++b) { 729 elemVec[c*N_bt+b] = (PetscScalar) elem[c*N_bt+b]; 730 } 731 } 732 PetscCall(PetscFree(elem)); 733 } 734 break; 735 default: 736 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unsupported PETSc type %d", ocl->realType); 737 } 738 } else { 739 PetscCall(PetscFree2(r_invJ,r_detJ)); 740 PetscCall(clEnqueueReadBuffer(ocl->queue_id, o_elemVec, CL_TRUE, 0, Ne*N_bt * realSize, elemVec, 0, NULL, NULL)); 741 } 742 /* Log performance */ 743 PetscCall(clGetEventProfilingInfo(ocl_ev, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &ns_start, NULL)); 744 PetscCall(clGetEventProfilingInfo(ocl_ev, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &ns_end, NULL)); 745 f0Flops = 0; 746 switch (ocl->op) { 747 case LAPLACIAN: 748 f1Flops = useAux ? dim : 0;break; 749 case ELASTICITY: 750 f1Flops = 2*dim*dim;break; 751 } 752 numFlops = Ne*( 753 N_q*( 754 N_b*N_comp*((useField ? 2 : 0) + (useFieldDer ? 2*dim*(dim + 1) : 0)) 755 /*+ 756 N_ba*N_compa*((useFieldAux ? 2 : 0) + (useFieldDerAux ? 2*dim*(dim + 1) : 0))*/ 757 + 758 N_comp*((useF0 ? f0Flops + 2 : 0) + (useF1 ? f1Flops + 2*dim : 0))) 759 + 760 N_b*((useF0 ? 2 : 0) + (useF1 ? 2*dim*(dim + 1) : 0))); 761 PetscCall(PetscFEOpenCLLogResidual(fem, (ns_end - ns_start)*1.0e-9, numFlops)); 762 /* Cleanup */ 763 PetscCall(clReleaseMemObject(o_coefficients)); 764 PetscCall(clReleaseMemObject(o_coefficientsAux)); 765 PetscCall(clReleaseMemObject(o_jacobianInverses)); 766 PetscCall(clReleaseMemObject(o_jacobianDeterminants)); 767 PetscCall(clReleaseMemObject(o_elemVec)); 768 PetscCall(clReleaseKernel(ocl_kernel)); 769 PetscCall(clReleaseProgram(ocl_prog)); 770 PetscFunctionReturn(0); 771 } 772 773 PETSC_INTERN PetscErrorCode PetscFESetUp_Basic(PetscFE); 774 PETSC_INTERN PetscErrorCode PetscFECreateTabulation_Basic(PetscFE, PetscInt, const PetscReal [], PetscInt, PetscTabulation); 775 776 static PetscErrorCode PetscFEInitialize_OpenCL(PetscFE fem) 777 { 778 PetscFunctionBegin; 779 fem->ops->setfromoptions = NULL; 780 fem->ops->setup = PetscFESetUp_Basic; 781 fem->ops->view = NULL; 782 fem->ops->destroy = PetscFEDestroy_OpenCL; 783 fem->ops->getdimension = PetscFEGetDimension_Basic; 784 fem->ops->createtabulation = PetscFECreateTabulation_Basic; 785 fem->ops->integrateresidual = PetscFEIntegrateResidual_OpenCL; 786 fem->ops->integratebdresidual = NULL/* PetscFEIntegrateBdResidual_OpenCL */; 787 fem->ops->integratejacobianaction = NULL/* PetscFEIntegrateJacobianAction_OpenCL */; 788 fem->ops->integratejacobian = PetscFEIntegrateJacobian_Basic; 789 PetscFunctionReturn(0); 790 } 791 792 /*MC 793 PETSCFEOPENCL = "opencl" - A PetscFE object that integrates using a vectorized OpenCL implementation 794 795 Level: intermediate 796 797 .seealso: PetscFEType, PetscFECreate(), PetscFESetType() 798 M*/ 799 800 PETSC_EXTERN PetscErrorCode PetscFECreate_OpenCL(PetscFE fem) 801 { 802 PetscFE_OpenCL *ocl; 803 cl_uint num_platforms; 804 cl_platform_id platform_ids[42]; 805 cl_uint num_devices; 806 cl_device_id device_ids[42]; 807 cl_int err; 808 809 PetscFunctionBegin; 810 PetscValidHeaderSpecific(fem, PETSCFE_CLASSID, 1); 811 PetscCall(PetscNewLog(fem,&ocl)); 812 fem->data = ocl; 813 814 /* Init Platform */ 815 PetscCall(clGetPlatformIDs(42, platform_ids, &num_platforms)); 816 PetscCheck(num_platforms,PetscObjectComm((PetscObject) fem), PETSC_ERR_SUP, "No OpenCL platform found."); 817 ocl->pf_id = platform_ids[0]; 818 /* Init Device */ 819 PetscCall(clGetDeviceIDs(ocl->pf_id, CL_DEVICE_TYPE_ALL, 42, device_ids, &num_devices)); 820 PetscCheck(num_devices,PetscObjectComm((PetscObject) fem), PETSC_ERR_SUP, "No OpenCL device found."); 821 ocl->dev_id = device_ids[0]; 822 /* Create context with one command queue */ 823 ocl->ctx_id = clCreateContext(0, 1, &(ocl->dev_id), NULL, NULL, &err);PetscCall(err); 824 ocl->queue_id = clCreateCommandQueue(ocl->ctx_id, ocl->dev_id, CL_QUEUE_PROFILING_ENABLE, &err);PetscCall(err); 825 /* Types */ 826 ocl->realType = PETSC_FLOAT; 827 /* Register events */ 828 PetscCall(PetscLogEventRegister("OpenCL FEResidual", PETSCFE_CLASSID, &ocl->residualEvent)); 829 /* Equation handling */ 830 ocl->op = LAPLACIAN; 831 832 PetscCall(PetscFEInitialize_OpenCL(fem)); 833 PetscFunctionReturn(0); 834 } 835 836 /*@ 837 PetscFEOpenCLSetRealType - Set the scalar type for running on the accelerator 838 839 Input Parameters: 840 + fem - The PetscFE 841 - realType - The scalar type 842 843 Level: developer 844 845 .seealso: PetscFEOpenCLGetRealType() 846 @*/ 847 PetscErrorCode PetscFEOpenCLSetRealType(PetscFE fem, PetscDataType realType) 848 { 849 PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data; 850 851 PetscFunctionBegin; 852 PetscValidHeaderSpecific(fem, PETSCFE_CLASSID, 1); 853 ocl->realType = realType; 854 PetscFunctionReturn(0); 855 } 856 857 /*@ 858 PetscFEOpenCLGetRealType - Get the scalar type for running on the accelerator 859 860 Input Parameter: 861 . fem - The PetscFE 862 863 Output Parameter: 864 . realType - The scalar type 865 866 Level: developer 867 868 .seealso: PetscFEOpenCLSetRealType() 869 @*/ 870 PetscErrorCode PetscFEOpenCLGetRealType(PetscFE fem, PetscDataType *realType) 871 { 872 PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data; 873 874 PetscFunctionBegin; 875 PetscValidHeaderSpecific(fem, PETSCFE_CLASSID, 1); 876 PetscValidPointer(realType, 2); 877 *realType = ocl->realType; 878 PetscFunctionReturn(0); 879 } 880 881 #endif /* PETSC_HAVE_OPENCL */ 882