1 #include <petsc/private/petscfeimpl.h> /*I "petscfe.h" I*/
2 #include <petsc/private/loghandlerimpl.h>
3 #include <../src/sys/logging/handler/impls/default/logdefault.h>
4
5 #if defined(PETSC_HAVE_OPENCL)
6
PetscFEDestroy_OpenCL(PetscFE fem)7 static PetscErrorCode PetscFEDestroy_OpenCL(PetscFE fem)
8 {
9 PetscFE_OpenCL *ocl = (PetscFE_OpenCL *)fem->data;
10
11 PetscFunctionBegin;
12 PetscCall(clReleaseCommandQueue(ocl->queue_id));
13 ocl->queue_id = 0;
14 PetscCall(clReleaseContext(ocl->ctx_id));
15 ocl->ctx_id = 0;
16 PetscCall(PetscFree(ocl));
17 PetscFunctionReturn(PETSC_SUCCESS);
18 }
19
20 #define PetscCallSTR(err) \
21 do { \
22 PetscCall(err); \
23 string_tail += count; \
24 PetscCheck(string_tail != end_of_buffer, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Buffer overflow"); \
25 } while (0)
26 enum {
27 LAPLACIAN = 0,
28 ELASTICITY = 1
29 };
30
31 /* NOTE: This is now broken for vector problems. Must redo loops to respect vector basis elements */
32 /* dim Number of spatial dimensions: 2 */
33 /* N_b Number of basis functions: generated */
34 /* N_{bt} Number of total basis functions: N_b * N_{comp} */
35 /* N_q Number of quadrature points: generated */
36 /* N_{bs} Number of block cells LCM(N_b, N_q) */
37 /* N_{bst} Number of block cell components LCM(N_{bt}, N_q) */
38 /* N_{bl} Number of concurrent blocks generated */
39 /* N_t Number of threads: N_{bl} * N_{bs} */
40 /* N_{cbc} Number of concurrent basis cells: N_{bl} * N_q */
41 /* N_{cqc} Number of concurrent quadrature cells: N_{bl} * N_b */
42 /* N_{sbc} Number of serial basis cells: N_{bs} / N_q */
43 /* N_{sqc} Number of serial quadrature cells: N_{bs} / N_b */
44 /* N_{cb} Number of serial cell batches: input */
45 /* N_c Number of total cells: N_{cb}*N_{t}/N_{comp} */
PetscFEOpenCLGenerateIntegrationCode(PetscFE fem,char ** string_buffer,PetscInt buffer_length,PetscBool useAux,PetscInt N_bl)46 static PetscErrorCode PetscFEOpenCLGenerateIntegrationCode(PetscFE fem, char **string_buffer, PetscInt buffer_length, PetscBool useAux, PetscInt N_bl)
47 {
48 PetscFE_OpenCL *ocl = (PetscFE_OpenCL *)fem->data;
49 PetscQuadrature q;
50 char *string_tail = *string_buffer;
51 char *end_of_buffer = *string_buffer + buffer_length;
52 char float_str[] = "float", double_str[] = "double";
53 char *numeric_str = &float_str[0];
54 PetscInt op = ocl->op;
55 PetscBool useField = PETSC_FALSE;
56 PetscBool useFieldDer = PETSC_TRUE;
57 PetscBool useFieldAux = useAux;
58 PetscBool useFieldDerAux = PETSC_FALSE;
59 PetscBool useF0 = PETSC_TRUE;
60 PetscBool useF1 = PETSC_TRUE;
61 const PetscReal *points, *weights;
62 PetscTabulation T;
63 PetscInt dim, qNc, N_b, N_c, N_q, N_t, p, d, b, c;
64 size_t count;
65
66 PetscFunctionBegin;
67 PetscCall(PetscFEGetSpatialDimension(fem, &dim));
68 PetscCall(PetscFEGetDimension(fem, &N_b));
69 PetscCall(PetscFEGetNumComponents(fem, &N_c));
70 PetscCall(PetscFEGetQuadrature(fem, &q));
71 PetscCall(PetscQuadratureGetData(q, NULL, &qNc, &N_q, &points, &weights));
72 PetscCheck(qNc == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only supports scalar quadrature, not %" PetscInt_FMT " components", qNc);
73 N_t = N_b * N_c * N_q * N_bl;
74 /* Enable device extension for double precision */
75 if (ocl->realType == PETSC_DOUBLE) {
76 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
77 "#if defined(cl_khr_fp64)\n"
78 "# pragma OPENCL EXTENSION cl_khr_fp64: enable\n"
79 "#elif defined(cl_amd_fp64)\n"
80 "# pragma OPENCL EXTENSION cl_amd_fp64: enable\n"
81 "#endif\n",
82 &count));
83 numeric_str = &double_str[0];
84 }
85 /* Kernel API */
86 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
87 "\n"
88 "__kernel void integrateElementQuadrature(int N_cb, __global %s *coefficients, __global %s *coefficientsAux, __global %s *jacobianInverses, __global %s *jacobianDeterminants, __global %s *elemVec)\n"
89 "{\n",
90 &count, numeric_str, numeric_str, numeric_str, numeric_str, numeric_str));
91 /* Quadrature */
92 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
93 " /* Quadrature points\n"
94 " - (x1,y1,x2,y2,...) */\n"
95 " const %s points[%d] = {\n",
96 &count, numeric_str, N_q * dim));
97 for (p = 0; p < N_q; ++p) {
98 for (d = 0; d < dim; ++d) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g,\n", &count, points[p * dim + d]));
99 }
100 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count));
101 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
102 " /* Quadrature weights\n"
103 " - (v1,v2,...) */\n"
104 " const %s weights[%d] = {\n",
105 &count, numeric_str, N_q));
106 for (p = 0; p < N_q; ++p) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g,\n", &count, weights[p]));
107 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count));
108 /* Basis Functions */
109 PetscCall(PetscFEGetCellTabulation(fem, 1, &T));
110 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
111 " /* Nodal basis function evaluations\n"
112 " - basis component is fastest varying, the basis function, then point */\n"
113 " const %s Basis[%d] = {\n",
114 &count, numeric_str, N_q * N_b * N_c));
115 for (p = 0; p < N_q; ++p) {
116 for (b = 0; b < N_b; ++b) {
117 for (c = 0; c < N_c; ++c) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g,\n", &count, T->T[0][(p * N_b + b) * N_c + c]));
118 }
119 }
120 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count));
121 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
122 "\n"
123 " /* Nodal basis function derivative evaluations,\n"
124 " - derivative direction is fastest varying, then basis component, then basis function, then point */\n"
125 " const %s%d BasisDerivatives[%d] = {\n",
126 &count, numeric_str, dim, N_q * N_b * N_c));
127 for (p = 0; p < N_q; ++p) {
128 for (b = 0; b < N_b; ++b) {
129 for (c = 0; c < N_c; ++c) {
130 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "(%s%d)(", &count, numeric_str, dim));
131 for (d = 0; d < dim; ++d) {
132 if (d > 0) {
133 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, ", %g", &count, T->T[1][((p * N_b + b) * dim + d) * N_c + c]));
134 } else {
135 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g", &count, T->T[1][((p * N_b + b) * dim + d) * N_c + c]));
136 }
137 }
138 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "),\n", &count));
139 }
140 }
141 }
142 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count));
143 /* Sizes */
144 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
145 " const int dim = %d; // The spatial dimension\n"
146 " const int N_bl = %d; // The number of concurrent blocks\n"
147 " const int N_b = %d; // The number of basis functions\n"
148 " const int N_comp = %d; // The number of basis function components\n"
149 " const int N_bt = N_b*N_comp; // The total number of scalar basis functions\n"
150 " const int N_q = %d; // The number of quadrature points\n"
151 " const int N_bst = N_bt*N_q; // The block size, LCM(N_b*N_comp, N_q), Notice that a block is not processed simultaneously\n"
152 " const int N_t = N_bst*N_bl; // The number of threads, N_bst * N_bl\n"
153 " const int N_bc = N_t/N_comp; // The number of cells per batch (N_b*N_q*N_bl)\n"
154 " const int N_sbc = N_bst / (N_q * N_comp);\n"
155 " const int N_sqc = N_bst / N_bt;\n"
156 " /*const int N_c = N_cb * N_bc;*/\n"
157 "\n"
158 " /* Calculated indices */\n"
159 " /*const int tidx = get_local_id(0) + get_local_size(0)*get_local_id(1);*/\n"
160 " const int tidx = get_local_id(0);\n"
161 " const int blidx = tidx / N_bst; // Block number for this thread\n"
162 " const int bidx = tidx %% N_bt; // Basis function mapped to this thread\n"
163 " const int cidx = tidx %% N_comp; // Basis component mapped to this thread\n"
164 " const int qidx = tidx %% N_q; // Quadrature point mapped to this thread\n"
165 " const int blbidx = tidx %% N_q + blidx*N_q; // Cell mapped to this thread in the basis phase\n"
166 " const int blqidx = tidx %% N_b + blidx*N_b; // Cell mapped to this thread in the quadrature phase\n"
167 " const int gidx = get_group_id(1)*get_num_groups(0) + get_group_id(0);\n"
168 " const int Goffset = gidx*N_cb*N_bc;\n",
169 &count, dim, N_bl, N_b, N_c, N_q));
170 /* Local memory */
171 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
172 "\n"
173 " /* Quadrature data */\n"
174 " %s w; // $w_q$, Quadrature weight at $x_q$\n"
175 " __local %s phi_i[%d]; //[N_bt*N_q]; // $\\phi_i(x_q)$, Value of the basis function $i$ at $x_q$\n"
176 " __local %s%d phiDer_i[%d]; //[N_bt*N_q]; // $\\frac{\\partial\\phi_i(x_q)}{\\partial x_d}$, Value of the derivative of basis function $i$ in direction $x_d$ at $x_q$\n"
177 " /* Geometric data */\n"
178 " __local %s detJ[%d]; //[N_t]; // $|J(x_q)|$, Jacobian determinant at $x_q$\n"
179 " __local %s invJ[%d];//[N_t*dim*dim]; // $J^{-1}(x_q)$, Jacobian inverse at $x_q$\n",
180 &count, numeric_str, numeric_str, N_b * N_c * N_q, numeric_str, dim, N_b * N_c * N_q, numeric_str, N_t, numeric_str, N_t * dim * dim));
181 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
182 " /* FEM data */\n"
183 " __local %s u_i[%d]; //[N_t*N_bt]; // Coefficients $u_i$ of the field $u|_{\\mathcal{T}} = \\sum_i u_i \\phi_i$\n",
184 &count, numeric_str, N_t * N_b * N_c));
185 if (useAux) {
186 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " __local %s a_i[%d]; //[N_t]; // Coefficients $a_i$ of the auxiliary field $a|_{\\mathcal{T}} = \\sum_i a_i \\phi^R_i$\n", &count, numeric_str, N_t));
187 }
188 if (useF0) {
189 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
190 " /* Intermediate calculations */\n"
191 " __local %s f_0[%d]; //[N_t*N_sqc]; // $f_0(u(x_q), \\nabla u(x_q)) |J(x_q)| w_q$\n",
192 &count, numeric_str, N_t * N_q));
193 }
194 if (useF1) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " __local %s%d f_1[%d]; //[N_t*N_sqc]; // $f_1(u(x_q), \\nabla u(x_q)) |J(x_q)| w_q$\n", &count, numeric_str, dim, N_t * N_q));
195 /* TODO: If using elasticity, put in mu/lambda coefficients */
196 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
197 " /* Output data */\n"
198 " %s e_i; // Coefficient $e_i$ of the residual\n\n",
199 &count, numeric_str));
200 /* One-time loads */
201 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
202 " /* These should be generated inline */\n"
203 " /* Load quadrature weights */\n"
204 " w = weights[qidx];\n"
205 " /* Load basis tabulation \\phi_i for this cell */\n"
206 " if (tidx < N_bt*N_q) {\n"
207 " phi_i[tidx] = Basis[tidx];\n"
208 " phiDer_i[tidx] = BasisDerivatives[tidx];\n"
209 " }\n\n",
210 &count));
211 /* Batch loads */
212 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
213 " for (int batch = 0; batch < N_cb; ++batch) {\n"
214 " /* Load geometry */\n"
215 " detJ[tidx] = jacobianDeterminants[Goffset+batch*N_bc+tidx];\n"
216 " for (int n = 0; n < dim*dim; ++n) {\n"
217 " const int offset = n*N_t;\n"
218 " invJ[offset+tidx] = jacobianInverses[(Goffset+batch*N_bc)*dim*dim+offset+tidx];\n"
219 " }\n"
220 " /* Load coefficients u_i for this cell */\n"
221 " for (int n = 0; n < N_bt; ++n) {\n"
222 " const int offset = n*N_t;\n"
223 " u_i[offset+tidx] = coefficients[(Goffset*N_bt)+batch*N_t*N_b+offset+tidx];\n"
224 " }\n",
225 &count));
226 if (useAux) {
227 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
228 " /* Load coefficients a_i for this cell */\n"
229 " /* TODO: This should not be N_t here, it should be N_bc*N_comp_aux */\n"
230 " a_i[tidx] = coefficientsAux[Goffset+batch*N_t+tidx];\n",
231 &count));
232 }
233 /* Quadrature phase */
234 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
235 " barrier(CLK_LOCAL_MEM_FENCE);\n"
236 "\n"
237 " /* Map coefficients to values at quadrature points */\n"
238 " for (int c = 0; c < N_sqc; ++c) {\n"
239 " const int cell = c*N_bl*N_b + blqidx;\n"
240 " const int fidx = (cell*N_q + qidx)*N_comp + cidx;\n",
241 &count));
242 if (useField) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " %s u[%d]; //[N_comp]; // $u(x_q)$, Value of the field at $x_q$\n", &count, numeric_str, N_c));
243 if (useFieldDer) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " %s%d gradU[%d]; //[N_comp]; // $\\nabla u(x_q)$, Value of the field gradient at $x_q$\n", &count, numeric_str, dim, N_c));
244 if (useFieldAux) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " %s a[%d]; //[1]; // $a(x_q)$, Value of the auxiliary fields at $x_q$\n", &count, numeric_str, 1));
245 if (useFieldDerAux) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " %s%d gradA[%d]; //[1]; // $\\nabla a(x_q)$, Value of the auxiliary field gradient at $x_q$\n", &count, numeric_str, dim, 1));
246 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
247 "\n"
248 " for (int comp = 0; comp < N_comp; ++comp) {\n",
249 &count));
250 if (useField) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " u[comp] = 0.0;\n", &count));
251 if (useFieldDer) {
252 switch (dim) {
253 case 1:
254 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradU[comp].x = 0.0;\n", &count));
255 break;
256 case 2:
257 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradU[comp].x = 0.0; gradU[comp].y = 0.0;\n", &count));
258 break;
259 case 3:
260 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradU[comp].x = 0.0; gradU[comp].y = 0.0; gradU[comp].z = 0.0;\n", &count));
261 break;
262 }
263 }
264 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " }\n", &count));
265 if (useFieldAux) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " a[0] = 0.0;\n", &count));
266 if (useFieldDerAux) {
267 switch (dim) {
268 case 1:
269 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradA[0].x = 0.0;\n", &count));
270 break;
271 case 2:
272 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradA[0].x = 0.0; gradA[0].y = 0.0;\n", &count));
273 break;
274 case 3:
275 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradA[0].x = 0.0; gradA[0].y = 0.0; gradA[0].z = 0.0;\n", &count));
276 break;
277 }
278 }
279 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
280 " /* Get field and derivatives at this quadrature point */\n"
281 " for (int i = 0; i < N_b; ++i) {\n"
282 " for (int comp = 0; comp < N_comp; ++comp) {\n"
283 " const int b = i*N_comp+comp;\n"
284 " const int pidx = qidx*N_bt + b;\n"
285 " const int uidx = cell*N_bt + b;\n"
286 " %s%d realSpaceDer;\n\n",
287 &count, numeric_str, dim));
288 if (useField) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " u[comp] += u_i[uidx]*phi_i[pidx];\n", &count));
289 if (useFieldDer) {
290 switch (dim) {
291 case 2:
292 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
293 " realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y;\n"
294 " gradU[comp].x += u_i[uidx]*realSpaceDer.x;\n"
295 " realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y;\n"
296 " gradU[comp].y += u_i[uidx]*realSpaceDer.y;\n",
297 &count));
298 break;
299 case 3:
300 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
301 " realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+0]*phiDer_i[pidx].z;\n"
302 " gradU[comp].x += u_i[uidx]*realSpaceDer.x;\n"
303 " realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+1]*phiDer_i[pidx].z;\n"
304 " gradU[comp].y += u_i[uidx]*realSpaceDer.y;\n"
305 " realSpaceDer.z = invJ[cell*dim*dim+0*dim+2]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+2]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+2]*phiDer_i[pidx].z;\n"
306 " gradU[comp].z += u_i[uidx]*realSpaceDer.z;\n",
307 &count));
308 break;
309 }
310 }
311 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
312 " }\n"
313 " }\n",
314 &count));
315 if (useFieldAux) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " a[0] += a_i[cell];\n", &count));
316 /* Calculate residual at quadrature points: Should be generated by an weak form engine */
317 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " /* Process values at quadrature points */\n", &count));
318 switch (op) {
319 case LAPLACIAN:
320 if (useF0) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_0[fidx] = 4.0;\n", &count));
321 if (useF1) {
322 if (useAux) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_1[fidx] = a[0]*gradU[cidx];\n", &count));
323 else PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_1[fidx] = gradU[cidx];\n", &count));
324 }
325 break;
326 case ELASTICITY:
327 if (useF0) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_0[fidx] = 4.0;\n", &count));
328 if (useF1) {
329 switch (dim) {
330 case 2:
331 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
332 " switch (cidx) {\n"
333 " case 0:\n"
334 " f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[0].x + gradU[0].x);\n"
335 " f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[0].y + gradU[1].x);\n"
336 " break;\n"
337 " case 1:\n"
338 " f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[1].x + gradU[0].y);\n"
339 " f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[1].y + gradU[1].y);\n"
340 " }\n",
341 &count));
342 break;
343 case 3:
344 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
345 " switch (cidx) {\n"
346 " case 0:\n"
347 " f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[0].x + gradU[0].x);\n"
348 " f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[0].y + gradU[1].x);\n"
349 " f_1[fidx].z = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[0].z + gradU[2].x);\n"
350 " break;\n"
351 " case 1:\n"
352 " f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[1].x + gradU[0].y);\n"
353 " f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[1].y + gradU[1].y);\n"
354 " f_1[fidx].z = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[1].y + gradU[2].y);\n"
355 " break;\n"
356 " case 2:\n"
357 " f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[2].x + gradU[0].z);\n"
358 " f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[2].y + gradU[1].z);\n"
359 " f_1[fidx].z = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[2].y + gradU[2].z);\n"
360 " }\n",
361 &count));
362 break;
363 }
364 }
365 break;
366 default:
367 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "PDE operator %d is not supported", op);
368 }
369 if (useF0) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_0[fidx] *= detJ[cell]*w;\n", &count));
370 if (useF1) {
371 switch (dim) {
372 case 1:
373 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_1[fidx].x *= detJ[cell]*w;\n", &count));
374 break;
375 case 2:
376 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_1[fidx].x *= detJ[cell]*w; f_1[fidx].y *= detJ[cell]*w;\n", &count));
377 break;
378 case 3:
379 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_1[fidx].x *= detJ[cell]*w; f_1[fidx].y *= detJ[cell]*w; f_1[fidx].z *= detJ[cell]*w;\n", &count));
380 break;
381 }
382 }
383 /* Thread transpose */
384 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
385 " }\n\n"
386 " /* ==== TRANSPOSE THREADS ==== */\n"
387 " barrier(CLK_LOCAL_MEM_FENCE);\n\n",
388 &count));
389 /* Basis phase */
390 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
391 " /* Map values at quadrature points to coefficients */\n"
392 " for (int c = 0; c < N_sbc; ++c) {\n"
393 " const int cell = c*N_bl*N_q + blbidx; /* Cell number in batch */\n"
394 "\n"
395 " e_i = 0.0;\n"
396 " for (int q = 0; q < N_q; ++q) {\n"
397 " const int pidx = q*N_bt + bidx;\n"
398 " const int fidx = (cell*N_q + q)*N_comp + cidx;\n"
399 " %s%d realSpaceDer;\n\n",
400 &count, numeric_str, dim));
401
402 if (useF0) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " e_i += phi_i[pidx]*f_0[fidx];\n", &count));
403 if (useF1) {
404 switch (dim) {
405 case 2:
406 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
407 " realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y;\n"
408 " e_i += realSpaceDer.x*f_1[fidx].x;\n"
409 " realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y;\n"
410 " e_i += realSpaceDer.y*f_1[fidx].y;\n",
411 &count));
412 break;
413 case 3:
414 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
415 " realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+0]*phiDer_i[pidx].z;\n"
416 " e_i += realSpaceDer.x*f_1[fidx].x;\n"
417 " realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+1]*phiDer_i[pidx].z;\n"
418 " e_i += realSpaceDer.y*f_1[fidx].y;\n"
419 " realSpaceDer.z = invJ[cell*dim*dim+0*dim+2]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+2]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+2]*phiDer_i[pidx].z;\n"
420 " e_i += realSpaceDer.z*f_1[fidx].z;\n",
421 &count));
422 break;
423 }
424 }
425 PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
426 " }\n"
427 " /* Write element vector for N_{cbc} cells at a time */\n"
428 " elemVec[(Goffset + batch*N_bc + c*N_bl*N_q)*N_bt + tidx] = e_i;\n"
429 " }\n"
430 " /* ==== Could do one write per batch ==== */\n"
431 " }\n"
432 " return;\n"
433 "}\n",
434 &count));
435 PetscFunctionReturn(PETSC_SUCCESS);
436 }
437
PetscFEOpenCLGetIntegrationKernel(PetscFE fem,PetscBool useAux,cl_program * ocl_prog,cl_kernel * ocl_kernel)438 static PetscErrorCode PetscFEOpenCLGetIntegrationKernel(PetscFE fem, PetscBool useAux, cl_program *ocl_prog, cl_kernel *ocl_kernel)
439 {
440 PetscFE_OpenCL *ocl = (PetscFE_OpenCL *)fem->data;
441 PetscInt dim, N_bl;
442 PetscBool flg;
443 char *buffer;
444 size_t len;
445 char errMsg[8192];
446 cl_int err;
447
448 PetscFunctionBegin;
449 PetscCall(PetscFEGetSpatialDimension(fem, &dim));
450 PetscCall(PetscMalloc1(8192, &buffer));
451 PetscCall(PetscFEGetTileSizes(fem, NULL, &N_bl, NULL, NULL));
452 PetscCall(PetscFEOpenCLGenerateIntegrationCode(fem, &buffer, 8192, useAux, N_bl));
453 PetscCall(PetscOptionsHasName(((PetscObject)fem)->options, ((PetscObject)fem)->prefix, "-petscfe_opencl_kernel_print", &flg));
454 if (flg) PetscCall(PetscPrintf(PetscObjectComm((PetscObject)fem), "OpenCL FE Integration Kernel:\n%s\n", buffer));
455 PetscCall(PetscStrlen(buffer, &len));
456 *ocl_prog = clCreateProgramWithSource(ocl->ctx_id, 1, (const char **)&buffer, &len, &err);
457 PetscCall(err);
458 err = clBuildProgram(*ocl_prog, 0, NULL, NULL, NULL, NULL);
459 if (err != CL_SUCCESS) {
460 err = clGetProgramBuildInfo(*ocl_prog, ocl->dev_id, CL_PROGRAM_BUILD_LOG, 8192 * sizeof(char), &errMsg, NULL);
461 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Build failed! Log:\n %s", errMsg);
462 }
463 PetscCall(PetscFree(buffer));
464 *ocl_kernel = clCreateKernel(*ocl_prog, "integrateElementQuadrature", &err);
465 PetscFunctionReturn(PETSC_SUCCESS);
466 }
467
PetscFEOpenCLCalculateGrid(PetscFE fem,PetscInt N,PetscInt blockSize,size_t * x,size_t * y,size_t * z)468 static PetscErrorCode PetscFEOpenCLCalculateGrid(PetscFE fem, PetscInt N, PetscInt blockSize, size_t *x, size_t *y, size_t *z)
469 {
470 const PetscInt Nblocks = N / blockSize;
471
472 PetscFunctionBegin;
473 PetscCheck(!(N % blockSize), PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid block size %d for %d elements", blockSize, N);
474 *z = 1;
475 *y = 1;
476 for (*x = (size_t)(PetscSqrtReal(Nblocks) + 0.5); *x > 0; --*x) {
477 *y = Nblocks / *x;
478 if (*x * *y == (size_t)Nblocks) break;
479 }
480 PetscCheck(*x * *y == (size_t)Nblocks, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Could not find partition for %" PetscInt_FMT " with block size %" PetscInt_FMT, N, blockSize);
481 PetscFunctionReturn(PETSC_SUCCESS);
482 }
483
PetscFEOpenCLLogResidual(PetscFE fem,PetscLogDouble time,PetscLogDouble flops)484 static PetscErrorCode PetscFEOpenCLLogResidual(PetscFE fem, PetscLogDouble time, PetscLogDouble flops)
485 {
486 PetscLogHandler h;
487
488 PetscFunctionBegin;
489 PetscCall(PetscLogGetDefaultHandler(&h));
490 if (h) {
491 PetscEventPerfInfo *eventInfo;
492 PetscFE_OpenCL *ocl = (PetscFE_OpenCL *)fem->data;
493
494 PetscCall(PetscLogHandlerGetEventPerfInfo(h, PETSC_DEFAULT, ocl->residualEvent, &eventInfo));
495 eventInfo->count++;
496 eventInfo->time += time;
497 eventInfo->flops += flops;
498 }
499 PetscFunctionReturn(PETSC_SUCCESS);
500 }
501
PetscFEIntegrateResidual_OpenCL(PetscDS prob,PetscFormKey key,PetscInt Ne,PetscFEGeom * cgeom,const PetscScalar coefficients[],const PetscScalar coefficients_t[],PetscDS probAux,const PetscScalar coefficientsAux[],PetscReal t,PetscScalar elemVec[])502 static PetscErrorCode PetscFEIntegrateResidual_OpenCL(PetscDS prob, PetscFormKey key, PetscInt Ne, PetscFEGeom *cgeom, const PetscScalar coefficients[], const PetscScalar coefficients_t[], PetscDS probAux, const PetscScalar coefficientsAux[], PetscReal t, PetscScalar elemVec[])
503 {
504 /* Nbc = batchSize */
505 PetscFE fem;
506 PetscFE_OpenCL *ocl;
507 PetscPointFn *f0_func;
508 PetscPointFn *f1_func;
509 PetscQuadrature q;
510 PetscInt dim, qNc;
511 PetscInt N_b; /* The number of basis functions */
512 PetscInt N_comp; /* The number of basis function components */
513 PetscInt N_bt; /* The total number of scalar basis functions */
514 PetscInt N_q; /* The number of quadrature points */
515 PetscInt N_bst; /* The block size, LCM(N_bt, N_q), Notice that a block is not process simultaneously */
516 PetscInt N_t; /* The number of threads, N_bst * N_bl */
517 PetscInt N_bl; /* The number of blocks */
518 PetscInt N_bc; /* The batch size, N_bl*N_q*N_b */
519 PetscInt N_cb; /* The number of batches */
520 const PetscInt field = key.field;
521 PetscInt numFlops, f0Flops = 0, f1Flops = 0;
522 PetscBool useAux = probAux ? PETSC_TRUE : PETSC_FALSE;
523 PetscBool useField = PETSC_FALSE;
524 PetscBool useFieldDer = PETSC_TRUE;
525 PetscBool useF0 = PETSC_TRUE;
526 PetscBool useF1 = PETSC_TRUE;
527 /* OpenCL variables */
528 cl_program ocl_prog;
529 cl_kernel ocl_kernel;
530 cl_event ocl_ev; /* The event for tracking kernel execution */
531 cl_ulong ns_start; /* Nanoseconds counter on GPU at kernel start */
532 cl_ulong ns_end; /* Nanoseconds counter on GPU at kernel stop */
533 cl_mem o_jacobianInverses, o_jacobianDeterminants;
534 cl_mem o_coefficients, o_coefficientsAux, o_elemVec;
535 float *f_coeff = NULL, *f_coeffAux = NULL, *f_invJ = NULL, *f_detJ = NULL;
536 double *d_coeff = NULL, *d_coeffAux = NULL, *d_invJ = NULL, *d_detJ = NULL;
537 PetscReal *r_invJ = NULL, *r_detJ = NULL;
538 void *oclCoeff, *oclCoeffAux, *oclInvJ, *oclDetJ;
539 size_t local_work_size[3], global_work_size[3];
540 size_t realSize, x, y, z;
541 const PetscReal *points, *weights;
542 int err;
543
544 PetscFunctionBegin;
545 PetscCall(PetscDSGetDiscretization(prob, field, (PetscObject *)&fem));
546 ocl = (PetscFE_OpenCL *)fem->data;
547 if (!Ne) {
548 PetscCall(PetscFEOpenCLLogResidual(fem, 0.0, 0.0));
549 PetscFunctionReturn(PETSC_SUCCESS);
550 }
551 PetscCall(PetscFEGetSpatialDimension(fem, &dim));
552 PetscCall(PetscFEGetQuadrature(fem, &q));
553 PetscCall(PetscQuadratureGetData(q, NULL, &qNc, &N_q, &points, &weights));
554 PetscCheck(qNc == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only supports scalar quadrature, not %" PetscInt_FMT " components", qNc);
555 PetscCall(PetscFEGetDimension(fem, &N_b));
556 PetscCall(PetscFEGetNumComponents(fem, &N_comp));
557 PetscCall(PetscDSGetResidual(prob, field, &f0_func, &f1_func));
558 PetscCall(PetscFEGetTileSizes(fem, NULL, &N_bl, &N_bc, &N_cb));
559 N_bt = N_b * N_comp;
560 N_bst = N_bt * N_q;
561 N_t = N_bst * N_bl;
562 PetscCheck(N_bc * N_comp == N_t, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Number of threads %d should be %d * %d", N_t, N_bc, N_comp);
563 /* Calculate layout */
564 if (Ne % (N_cb * N_bc)) { /* Remainder cells */
565 PetscCall(PetscFEIntegrateResidual_Basic(prob, key, Ne, cgeom, coefficients, coefficients_t, probAux, coefficientsAux, t, elemVec));
566 PetscFunctionReturn(PETSC_SUCCESS);
567 }
568 PetscCall(PetscFEOpenCLCalculateGrid(fem, Ne, N_cb * N_bc, &x, &y, &z));
569 local_work_size[0] = N_bc * N_comp;
570 local_work_size[1] = 1;
571 local_work_size[2] = 1;
572 global_work_size[0] = x * local_work_size[0];
573 global_work_size[1] = y * local_work_size[1];
574 global_work_size[2] = z * local_work_size[2];
575 PetscCall(PetscInfo(fem, "GPU layout grid(%zu,%zu,%zu) block(%zu,%zu,%zu) with %d batches\n", x, y, z, local_work_size[0], local_work_size[1], local_work_size[2], N_cb));
576 PetscCall(PetscInfo(fem, " N_t: %d, N_cb: %d\n", N_t, N_cb));
577 /* Generate code */
578 if (probAux) {
579 PetscSpace P;
580 PetscInt NfAux, order, f;
581
582 PetscCall(PetscDSGetNumFields(probAux, &NfAux));
583 for (f = 0; f < NfAux; ++f) {
584 PetscFE feAux;
585
586 PetscCall(PetscDSGetDiscretization(probAux, f, (PetscObject *)&feAux));
587 PetscCall(PetscFEGetBasisSpace(feAux, &P));
588 PetscCall(PetscSpaceGetDegree(P, &order, NULL));
589 PetscCheck(order <= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Can only handle P0 coefficient fields");
590 }
591 }
592 PetscCall(PetscFEOpenCLGetIntegrationKernel(fem, useAux, &ocl_prog, &ocl_kernel));
593 /* Create buffers on the device and send data over */
594 PetscCall(PetscDataTypeGetSize(ocl->realType, &realSize));
595 PetscCheck(cgeom->numPoints <= 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only support affine geometry for OpenCL integration right now");
596 if (sizeof(PetscReal) != realSize) {
597 switch (ocl->realType) {
598 case PETSC_FLOAT: {
599 PetscInt c, b, d;
600
601 PetscCall(PetscMalloc4(Ne * N_bt, &f_coeff, Ne, &f_coeffAux, Ne * dim * dim, &f_invJ, Ne, &f_detJ));
602 for (c = 0; c < Ne; ++c) {
603 f_detJ[c] = (float)cgeom->detJ[c];
604 for (d = 0; d < dim * dim; ++d) f_invJ[c * dim * dim + d] = (float)cgeom->invJ[c * dim * dim + d];
605 for (b = 0; b < N_bt; ++b) f_coeff[c * N_bt + b] = (float)coefficients[c * N_bt + b];
606 }
607 if (coefficientsAux) { /* Assume P0 */
608 for (c = 0; c < Ne; ++c) f_coeffAux[c] = (float)coefficientsAux[c];
609 }
610 oclCoeff = (void *)f_coeff;
611 if (coefficientsAux) {
612 oclCoeffAux = (void *)f_coeffAux;
613 } else {
614 oclCoeffAux = NULL;
615 }
616 oclInvJ = (void *)f_invJ;
617 oclDetJ = (void *)f_detJ;
618 } break;
619 case PETSC_DOUBLE: {
620 PetscInt c, b, d;
621
622 PetscCall(PetscMalloc4(Ne * N_bt, &d_coeff, Ne, &d_coeffAux, Ne * dim * dim, &d_invJ, Ne, &d_detJ));
623 for (c = 0; c < Ne; ++c) {
624 d_detJ[c] = (double)cgeom->detJ[c];
625 for (d = 0; d < dim * dim; ++d) d_invJ[c * dim * dim + d] = (double)cgeom->invJ[c * dim * dim + d];
626 for (b = 0; b < N_bt; ++b) d_coeff[c * N_bt + b] = (double)coefficients[c * N_bt + b];
627 }
628 if (coefficientsAux) { /* Assume P0 */
629 for (c = 0; c < Ne; ++c) d_coeffAux[c] = (double)coefficientsAux[c];
630 }
631 oclCoeff = (void *)d_coeff;
632 if (coefficientsAux) {
633 oclCoeffAux = (void *)d_coeffAux;
634 } else {
635 oclCoeffAux = NULL;
636 }
637 oclInvJ = (void *)d_invJ;
638 oclDetJ = (void *)d_detJ;
639 } break;
640 default:
641 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unsupported PETSc type %d", ocl->realType);
642 }
643 } else {
644 PetscInt c, d;
645
646 PetscCall(PetscMalloc2(Ne * dim * dim, &r_invJ, Ne, &r_detJ));
647 for (c = 0; c < Ne; ++c) {
648 r_detJ[c] = cgeom->detJ[c];
649 for (d = 0; d < dim * dim; ++d) r_invJ[c * dim * dim + d] = cgeom->invJ[c * dim * dim + d];
650 }
651 oclCoeff = (void *)coefficients;
652 oclCoeffAux = (void *)coefficientsAux;
653 oclInvJ = (void *)r_invJ;
654 oclDetJ = (void *)r_detJ;
655 }
656 o_coefficients = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne * N_bt * realSize, oclCoeff, &err);
657 if (coefficientsAux) {
658 o_coefficientsAux = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne * realSize, oclCoeffAux, &err);
659 } else {
660 o_coefficientsAux = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY, Ne * realSize, oclCoeffAux, &err);
661 }
662 o_jacobianInverses = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne * dim * dim * realSize, oclInvJ, &err);
663 o_jacobianDeterminants = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne * realSize, oclDetJ, &err);
664 o_elemVec = clCreateBuffer(ocl->ctx_id, CL_MEM_WRITE_ONLY, Ne * N_bt * realSize, NULL, &err);
665 /* Kernel launch */
666 PetscCall(clSetKernelArg(ocl_kernel, 0, sizeof(cl_int), (void *)&N_cb));
667 PetscCall(clSetKernelArg(ocl_kernel, 1, sizeof(cl_mem), (void *)&o_coefficients));
668 PetscCall(clSetKernelArg(ocl_kernel, 2, sizeof(cl_mem), (void *)&o_coefficientsAux));
669 PetscCall(clSetKernelArg(ocl_kernel, 3, sizeof(cl_mem), (void *)&o_jacobianInverses));
670 PetscCall(clSetKernelArg(ocl_kernel, 4, sizeof(cl_mem), (void *)&o_jacobianDeterminants));
671 PetscCall(clSetKernelArg(ocl_kernel, 5, sizeof(cl_mem), (void *)&o_elemVec));
672 PetscCall(clEnqueueNDRangeKernel(ocl->queue_id, ocl_kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &ocl_ev));
673 /* Read data back from device */
674 if (sizeof(PetscReal) != realSize) {
675 switch (ocl->realType) {
676 case PETSC_FLOAT: {
677 float *elem;
678 PetscInt c, b;
679
680 PetscCall(PetscFree4(f_coeff, f_coeffAux, f_invJ, f_detJ));
681 PetscCall(PetscMalloc1(Ne * N_bt, &elem));
682 PetscCall(clEnqueueReadBuffer(ocl->queue_id, o_elemVec, CL_TRUE, 0, Ne * N_bt * realSize, elem, 0, NULL, NULL));
683 for (c = 0; c < Ne; ++c) {
684 for (b = 0; b < N_bt; ++b) elemVec[c * N_bt + b] = (PetscScalar)elem[c * N_bt + b];
685 }
686 PetscCall(PetscFree(elem));
687 } break;
688 case PETSC_DOUBLE: {
689 double *elem;
690 PetscInt c, b;
691
692 PetscCall(PetscFree4(d_coeff, d_coeffAux, d_invJ, d_detJ));
693 PetscCall(PetscMalloc1(Ne * N_bt, &elem));
694 PetscCall(clEnqueueReadBuffer(ocl->queue_id, o_elemVec, CL_TRUE, 0, Ne * N_bt * realSize, elem, 0, NULL, NULL));
695 for (c = 0; c < Ne; ++c) {
696 for (b = 0; b < N_bt; ++b) elemVec[c * N_bt + b] = (PetscScalar)elem[c * N_bt + b];
697 }
698 PetscCall(PetscFree(elem));
699 } break;
700 default:
701 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unsupported PETSc type %d", ocl->realType);
702 }
703 } else {
704 PetscCall(PetscFree2(r_invJ, r_detJ));
705 PetscCall(clEnqueueReadBuffer(ocl->queue_id, o_elemVec, CL_TRUE, 0, Ne * N_bt * realSize, elemVec, 0, NULL, NULL));
706 }
707 /* Log performance */
708 PetscCall(clGetEventProfilingInfo(ocl_ev, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &ns_start, NULL));
709 PetscCall(clGetEventProfilingInfo(ocl_ev, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &ns_end, NULL));
710 f0Flops = 0;
711 switch (ocl->op) {
712 case LAPLACIAN:
713 f1Flops = useAux ? dim : 0;
714 break;
715 case ELASTICITY:
716 f1Flops = 2 * dim * dim;
717 break;
718 }
719 numFlops = Ne * (N_q * (N_b * N_comp * ((useField ? 2 : 0) + (useFieldDer ? 2 * dim * (dim + 1) : 0))
720 /*+
721 N_ba*N_compa*((useFieldAux ? 2 : 0) + (useFieldDerAux ? 2*dim*(dim + 1) : 0))*/
722 + N_comp * ((useF0 ? f0Flops + 2 : 0) + (useF1 ? f1Flops + 2 * dim : 0))) +
723 N_b * ((useF0 ? 2 : 0) + (useF1 ? 2 * dim * (dim + 1) : 0)));
724 PetscCall(PetscFEOpenCLLogResidual(fem, (ns_end - ns_start) * 1.0e-9, numFlops));
725 /* Cleanup */
726 PetscCall(clReleaseMemObject(o_coefficients));
727 PetscCall(clReleaseMemObject(o_coefficientsAux));
728 PetscCall(clReleaseMemObject(o_jacobianInverses));
729 PetscCall(clReleaseMemObject(o_jacobianDeterminants));
730 PetscCall(clReleaseMemObject(o_elemVec));
731 PetscCall(clReleaseKernel(ocl_kernel));
732 PetscCall(clReleaseProgram(ocl_prog));
733 PetscFunctionReturn(PETSC_SUCCESS);
734 }
735
736 PETSC_INTERN PetscErrorCode PetscFESetUp_Basic(PetscFE);
737 PETSC_INTERN PetscErrorCode PetscFEComputeTabulation_Basic(PetscFE, PetscInt, const PetscReal[], PetscInt, PetscTabulation);
738
PetscFEInitialize_OpenCL(PetscFE fem)739 static PetscErrorCode PetscFEInitialize_OpenCL(PetscFE fem)
740 {
741 PetscFunctionBegin;
742 fem->ops->setfromoptions = NULL;
743 fem->ops->setup = PetscFESetUp_Basic;
744 fem->ops->view = NULL;
745 fem->ops->destroy = PetscFEDestroy_OpenCL;
746 fem->ops->getdimension = PetscFEGetDimension_Basic;
747 fem->ops->computetabulation = PetscFEComputeTabulation_Basic;
748 fem->ops->integrateresidual = PetscFEIntegrateResidual_OpenCL;
749 fem->ops->integratebdresidual = NULL /* PetscFEIntegrateBdResidual_OpenCL */;
750 fem->ops->integratejacobianaction = NULL /* PetscFEIntegrateJacobianAction_OpenCL */;
751 fem->ops->integratejacobian = PetscFEIntegrateJacobian_Basic;
752 PetscFunctionReturn(PETSC_SUCCESS);
753 }
754
755 /*MC
756 PETSCFEOPENCL = "opencl" - A `PetscFEType` that integrates using a vectorized OpenCL implementation
757
758 Level: intermediate
759
760 .seealso: `PetscFEType`, `PetscFECreate()`, `PetscFESetType()`
761 M*/
762
PetscFECreate_OpenCL(PetscFE fem)763 PETSC_EXTERN PetscErrorCode PetscFECreate_OpenCL(PetscFE fem)
764 {
765 PetscFE_OpenCL *ocl;
766 cl_uint num_platforms;
767 cl_platform_id platform_ids[42];
768 cl_uint num_devices;
769 cl_device_id device_ids[42];
770 cl_int err;
771
772 PetscFunctionBegin;
773 PetscValidHeaderSpecific(fem, PETSCFE_CLASSID, 1);
774 PetscCall(PetscNew(&ocl));
775 fem->data = ocl;
776
777 /* Init Platform */
778 PetscCall(clGetPlatformIDs(42, platform_ids, &num_platforms));
779 PetscCheck(num_platforms, PetscObjectComm((PetscObject)fem), PETSC_ERR_SUP, "No OpenCL platform found.");
780 ocl->pf_id = platform_ids[0];
781 /* Init Device */
782 PetscCall(clGetDeviceIDs(ocl->pf_id, CL_DEVICE_TYPE_ALL, 42, device_ids, &num_devices));
783 PetscCheck(num_devices, PetscObjectComm((PetscObject)fem), PETSC_ERR_SUP, "No OpenCL device found.");
784 ocl->dev_id = device_ids[0];
785 /* Create context with one command queue */
786 ocl->ctx_id = clCreateContext(0, 1, &ocl->dev_id, NULL, NULL, &err);
787 PetscCall(err);
788 ocl->queue_id = clCreateCommandQueue(ocl->ctx_id, ocl->dev_id, CL_QUEUE_PROFILING_ENABLE, &err);
789 PetscCall(err);
790 /* Types */
791 ocl->realType = PETSC_FLOAT;
792 /* Register events */
793 PetscCall(PetscLogEventRegister("OpenCL FEResidual", PETSCFE_CLASSID, &ocl->residualEvent));
794 /* Equation handling */
795 ocl->op = LAPLACIAN;
796
797 PetscCall(PetscFEInitialize_OpenCL(fem));
798 PetscFunctionReturn(PETSC_SUCCESS);
799 }
800
801 /*@
802 PetscFEOpenCLSetRealType - Set the scalar type for running on the OpenCL accelerator
803
804 Input Parameters:
805 + fem - The `PetscFE`
806 - realType - The scalar type
807
808 Level: developer
809
810 .seealso: `PetscFE`, `PetscFEOpenCLGetRealType()`
811 @*/
PetscFEOpenCLSetRealType(PetscFE fem,PetscDataType realType)812 PetscErrorCode PetscFEOpenCLSetRealType(PetscFE fem, PetscDataType realType)
813 {
814 PetscFE_OpenCL *ocl = (PetscFE_OpenCL *)fem->data;
815
816 PetscFunctionBegin;
817 PetscValidHeaderSpecific(fem, PETSCFE_CLASSID, 1);
818 ocl->realType = realType;
819 PetscFunctionReturn(PETSC_SUCCESS);
820 }
821
822 /*@
823 PetscFEOpenCLGetRealType - Get the scalar type for running on the OpenCL accelerator
824
825 Input Parameter:
826 . fem - The `PetscFE`
827
828 Output Parameter:
829 . realType - The scalar type
830
831 Level: developer
832
833 .seealso: `PetscFE`, `PetscFEOpenCLSetRealType()`
834 @*/
PetscFEOpenCLGetRealType(PetscFE fem,PetscDataType * realType)835 PetscErrorCode PetscFEOpenCLGetRealType(PetscFE fem, PetscDataType *realType)
836 {
837 PetscFE_OpenCL *ocl = (PetscFE_OpenCL *)fem->data;
838
839 PetscFunctionBegin;
840 PetscValidHeaderSpecific(fem, PETSCFE_CLASSID, 1);
841 PetscAssertPointer(realType, 2);
842 *realType = ocl->realType;
843 PetscFunctionReturn(PETSC_SUCCESS);
844 }
845
846 #endif /* PETSC_HAVE_OPENCL */
847