xref: /petsc/src/dm/dt/fe/impls/opencl/feopencl.c (revision 503c0ea9b45bcfbcebbb1ea5341243bbc69f0bea)
1 #include <petsc/private/petscfeimpl.h> /*I "petscfe.h" I*/
2 
3 #if defined(PETSC_HAVE_OPENCL)
4 
5 static PetscErrorCode PetscFEDestroy_OpenCL(PetscFE fem)
6 {
7   PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data;
8 
9   PetscFunctionBegin;
10   PetscCall(clReleaseCommandQueue(ocl->queue_id));
11   ocl->queue_id = 0;
12   PetscCall(clReleaseContext(ocl->ctx_id));
13   ocl->ctx_id = 0;
14   PetscCall(PetscFree(ocl));
15   PetscFunctionReturn(0);
16 }
17 
18 #define PetscCallSTR(err) do {PetscCall(err); string_tail += count; PetscCheck(string_tail != end_of_buffer,PETSC_COMM_SELF, PETSC_ERR_PLIB,"Buffer overflow");} while (0)
19 enum {LAPLACIAN = 0, ELASTICITY = 1};
20 
21 /* NOTE: This is now broken for vector problems. Must redo loops to respect vector basis elements */
22 /* dim     Number of spatial dimensions:          2                   */
23 /* N_b     Number of basis functions:             generated           */
24 /* N_{bt}  Number of total basis functions:       N_b * N_{comp}      */
25 /* N_q     Number of quadrature points:           generated           */
26 /* N_{bs}  Number of block cells                  LCM(N_b, N_q)       */
27 /* N_{bst} Number of block cell components        LCM(N_{bt}, N_q)    */
28 /* N_{bl}  Number of concurrent blocks            generated           */
29 /* N_t     Number of threads:                     N_{bl} * N_{bs}     */
30 /* N_{cbc} Number of concurrent basis      cells: N_{bl} * N_q        */
31 /* N_{cqc} Number of concurrent quadrature cells: N_{bl} * N_b        */
32 /* N_{sbc} Number of serial     basis      cells: N_{bs} / N_q        */
33 /* N_{sqc} Number of serial     quadrature cells: N_{bs} / N_b        */
34 /* N_{cb}  Number of serial cell batches:         input               */
35 /* N_c     Number of total cells:                 N_{cb}*N_{t}/N_{comp} */
36 static PetscErrorCode PetscFEOpenCLGenerateIntegrationCode(PetscFE fem, char **string_buffer, PetscInt buffer_length, PetscBool useAux, PetscInt N_bl)
37 {
38   PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data;
39   PetscQuadrature q;
40   char           *string_tail   = *string_buffer;
41   char           *end_of_buffer = *string_buffer + buffer_length;
42   char            float_str[]   = "float", double_str[]  = "double";
43   char           *numeric_str   = &(float_str[0]);
44   PetscInt        op            = ocl->op;
45   PetscBool       useField      = PETSC_FALSE;
46   PetscBool       useFieldDer   = PETSC_TRUE;
47   PetscBool       useFieldAux   = useAux;
48   PetscBool       useFieldDerAux= PETSC_FALSE;
49   PetscBool       useF0         = PETSC_TRUE;
50   PetscBool       useF1         = PETSC_TRUE;
51   const PetscReal *points, *weights;
52   PetscTabulation T;
53   PetscInt        dim, qNc, N_b, N_c, N_q, N_t, p, d, b, c;
54   size_t          count;
55 
56   PetscFunctionBegin;
57   PetscCall(PetscFEGetSpatialDimension(fem, &dim));
58   PetscCall(PetscFEGetDimension(fem, &N_b));
59   PetscCall(PetscFEGetNumComponents(fem, &N_c));
60   PetscCall(PetscFEGetQuadrature(fem, &q));
61   PetscCall(PetscQuadratureGetData(q, NULL, &qNc, &N_q, &points, &weights));
62   PetscCheck(qNc == 1,PETSC_COMM_SELF, PETSC_ERR_SUP, "Only supports scalar quadrature, not %" PetscInt_FMT " components", qNc);
63   N_t  = N_b * N_c * N_q * N_bl;
64   /* Enable device extension for double precision */
65   if (ocl->realType == PETSC_DOUBLE) {
66     PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
67 "#if defined(cl_khr_fp64)\n"
68 "#  pragma OPENCL EXTENSION cl_khr_fp64: enable\n"
69 "#elif defined(cl_amd_fp64)\n"
70 "#  pragma OPENCL EXTENSION cl_amd_fp64: enable\n"
71 "#endif\n",
72                                  &count));
73     numeric_str  = &(double_str[0]);
74   }
75   /* Kernel API */
76   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
77 "\n"
78 "__kernel void integrateElementQuadrature(int N_cb, __global %s *coefficients, __global %s *coefficientsAux, __global %s *jacobianInverses, __global %s *jacobianDeterminants, __global %s *elemVec)\n"
79 "{\n",
80                                &count, numeric_str, numeric_str, numeric_str, numeric_str, numeric_str));
81   /* Quadrature */
82   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
83 "  /* Quadrature points\n"
84 "   - (x1,y1,x2,y2,...) */\n"
85 "  const %s points[%d] = {\n",
86                                &count, numeric_str, N_q*dim));
87   for (p = 0; p < N_q; ++p) {
88     for (d = 0; d < dim; ++d) {
89       PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g,\n", &count, points[p*dim+d]));
90     }
91   }
92   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count));
93   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
94 "  /* Quadrature weights\n"
95 "   - (v1,v2,...) */\n"
96 "  const %s weights[%d] = {\n",
97                                &count, numeric_str, N_q));
98   for (p = 0; p < N_q; ++p) {
99     PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g,\n", &count, weights[p]));
100   }
101   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count));
102   /* Basis Functions */
103   PetscCall(PetscFEGetCellTabulation(fem, 1, &T));
104   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
105 "  /* Nodal basis function evaluations\n"
106 "    - basis component is fastest varying, the basis function, then point */\n"
107 "  const %s Basis[%d] = {\n",
108                                &count, numeric_str, N_q*N_b*N_c));
109   for (p = 0; p < N_q; ++p) {
110     for (b = 0; b < N_b; ++b) {
111       for (c = 0; c < N_c; ++c) {
112         PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g,\n", &count, T->T[0][(p*N_b + b)*N_c + c]));
113       }
114     }
115   }
116   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count));
117   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
118 "\n"
119 "  /* Nodal basis function derivative evaluations,\n"
120 "      - derivative direction is fastest varying, then basis component, then basis function, then point */\n"
121 "  const %s%d BasisDerivatives[%d] = {\n",
122                             &count, numeric_str, dim, N_q*N_b*N_c));
123   for (p = 0; p < N_q; ++p) {
124     for (b = 0; b < N_b; ++b) {
125       for (c = 0; c < N_c; ++c) {
126         PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "(%s%d)(", &count, numeric_str, dim));
127         for (d = 0; d < dim; ++d) {
128           if (d > 0) {
129             PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, ", %g", &count, T->T[1][((p*N_b + b)*dim + d)*N_c + c]));
130           } else {
131             PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g", &count, T->T[1][((p*N_b + b)*dim + d)*N_c + c]));
132           }
133         }
134         PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "),\n", &count));
135       }
136     }
137   }
138   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count));
139   /* Sizes */
140   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
141 "  const int dim    = %d;                           // The spatial dimension\n"
142 "  const int N_bl   = %d;                           // The number of concurrent blocks\n"
143 "  const int N_b    = %d;                           // The number of basis functions\n"
144 "  const int N_comp = %d;                           // The number of basis function components\n"
145 "  const int N_bt   = N_b*N_comp;                    // The total number of scalar basis functions\n"
146 "  const int N_q    = %d;                           // The number of quadrature points\n"
147 "  const int N_bst  = N_bt*N_q;                      // The block size, LCM(N_b*N_comp, N_q), Notice that a block is not processed simultaneously\n"
148 "  const int N_t    = N_bst*N_bl;                    // The number of threads, N_bst * N_bl\n"
149 "  const int N_bc   = N_t/N_comp;                    // The number of cells per batch (N_b*N_q*N_bl)\n"
150 "  const int N_sbc  = N_bst / (N_q * N_comp);\n"
151 "  const int N_sqc  = N_bst / N_bt;\n"
152 "  /*const int N_c    = N_cb * N_bc;*/\n"
153 "\n"
154 "  /* Calculated indices */\n"
155 "  /*const int tidx    = get_local_id(0) + get_local_size(0)*get_local_id(1);*/\n"
156 "  const int tidx    = get_local_id(0);\n"
157 "  const int blidx   = tidx / N_bst;                  // Block number for this thread\n"
158 "  const int bidx    = tidx %% N_bt;                   // Basis function mapped to this thread\n"
159 "  const int cidx    = tidx %% N_comp;                 // Basis component mapped to this thread\n"
160 "  const int qidx    = tidx %% N_q;                    // Quadrature point mapped to this thread\n"
161 "  const int blbidx  = tidx %% N_q + blidx*N_q;        // Cell mapped to this thread in the basis phase\n"
162 "  const int blqidx  = tidx %% N_b + blidx*N_b;        // Cell mapped to this thread in the quadrature phase\n"
163 "  const int gidx    = get_group_id(1)*get_num_groups(0) + get_group_id(0);\n"
164 "  const int Goffset = gidx*N_cb*N_bc;\n",
165                                &count, dim, N_bl, N_b, N_c, N_q));
166   /* Local memory */
167   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
168 "\n"
169 "  /* Quadrature data */\n"
170 "  %s                w;                   // $w_q$, Quadrature weight at $x_q$\n"
171 "  __local %s         phi_i[%d];    //[N_bt*N_q];  // $\\phi_i(x_q)$, Value of the basis function $i$ at $x_q$\n"
172 "  __local %s%d       phiDer_i[%d]; //[N_bt*N_q];  // $\\frac{\\partial\\phi_i(x_q)}{\\partial x_d}$, Value of the derivative of basis function $i$ in direction $x_d$ at $x_q$\n"
173 "  /* Geometric data */\n"
174 "  __local %s        detJ[%d]; //[N_t];           // $|J(x_q)|$, Jacobian determinant at $x_q$\n"
175 "  __local %s        invJ[%d];//[N_t*dim*dim];   // $J^{-1}(x_q)$, Jacobian inverse at $x_q$\n",
176                                &count, numeric_str, numeric_str, N_b*N_c*N_q, numeric_str, dim, N_b*N_c*N_q, numeric_str, N_t,
177                                numeric_str, N_t*dim*dim, numeric_str, N_t*N_b*N_c));
178   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
179 "  /* FEM data */\n"
180 "  __local %s        u_i[%d]; //[N_t*N_bt];       // Coefficients $u_i$ of the field $u|_{\\mathcal{T}} = \\sum_i u_i \\phi_i$\n",
181                                &count, numeric_str, N_t*N_b*N_c));
182   if (useAux) {
183     PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
184 "  __local %s        a_i[%d]; //[N_t];            // Coefficients $a_i$ of the auxiliary field $a|_{\\mathcal{T}} = \\sum_i a_i \\phi^R_i$\n",
185                                  &count, numeric_str, N_t));
186   }
187   if (useF0) {
188     PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
189 "  /* Intermediate calculations */\n"
190 "  __local %s         f_0[%d]; //[N_t*N_sqc];      // $f_0(u(x_q), \\nabla u(x_q)) |J(x_q)| w_q$\n",
191                                  &count, numeric_str, N_t*N_q));
192   }
193   if (useF1) {
194     PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
195 "  __local %s%d       f_1[%d]; //[N_t*N_sqc];      // $f_1(u(x_q), \\nabla u(x_q)) |J(x_q)| w_q$\n",
196                                  &count, numeric_str, dim, N_t*N_q));
197   }
198   /* TODO: If using elasticity, put in mu/lambda coefficients */
199   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
200 "  /* Output data */\n"
201 "  %s                e_i;                 // Coefficient $e_i$ of the residual\n\n",
202                                &count, numeric_str));
203   /* One-time loads */
204   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
205 "  /* These should be generated inline */\n"
206 "  /* Load quadrature weights */\n"
207 "  w = weights[qidx];\n"
208 "  /* Load basis tabulation \\phi_i for this cell */\n"
209 "  if (tidx < N_bt*N_q) {\n"
210 "    phi_i[tidx]    = Basis[tidx];\n"
211 "    phiDer_i[tidx] = BasisDerivatives[tidx];\n"
212 "  }\n\n",
213                                &count));
214   /* Batch loads */
215   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
216 "  for (int batch = 0; batch < N_cb; ++batch) {\n"
217 "    /* Load geometry */\n"
218 "    detJ[tidx] = jacobianDeterminants[Goffset+batch*N_bc+tidx];\n"
219 "    for (int n = 0; n < dim*dim; ++n) {\n"
220 "      const int offset = n*N_t;\n"
221 "      invJ[offset+tidx] = jacobianInverses[(Goffset+batch*N_bc)*dim*dim+offset+tidx];\n"
222 "    }\n"
223 "    /* Load coefficients u_i for this cell */\n"
224 "    for (int n = 0; n < N_bt; ++n) {\n"
225 "      const int offset = n*N_t;\n"
226 "      u_i[offset+tidx] = coefficients[(Goffset*N_bt)+batch*N_t*N_b+offset+tidx];\n"
227 "    }\n",
228                                &count));
229   if (useAux) {
230     PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
231 "    /* Load coefficients a_i for this cell */\n"
232 "    /* TODO: This should not be N_t here, it should be N_bc*N_comp_aux */\n"
233 "    a_i[tidx] = coefficientsAux[Goffset+batch*N_t+tidx];\n",
234                                  &count));
235   }
236   /* Quadrature phase */
237   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
238 "    barrier(CLK_LOCAL_MEM_FENCE);\n"
239 "\n"
240 "    /* Map coefficients to values at quadrature points */\n"
241 "    for (int c = 0; c < N_sqc; ++c) {\n"
242 "      const int cell          = c*N_bl*N_b + blqidx;\n"
243 "      const int fidx          = (cell*N_q + qidx)*N_comp + cidx;\n",
244                                &count));
245   if (useField) {
246     PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
247 "      %s  u[%d]; //[N_comp];     // $u(x_q)$, Value of the field at $x_q$\n",
248                                  &count, numeric_str, N_c));
249   }
250   if (useFieldDer) {
251     PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
252 "      %s%d   gradU[%d]; //[N_comp]; // $\\nabla u(x_q)$, Value of the field gradient at $x_q$\n",
253                                  &count, numeric_str, dim, N_c));
254   }
255   if (useFieldAux) {
256     PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
257 "      %s  a[%d]; //[1];     // $a(x_q)$, Value of the auxiliary fields at $x_q$\n",
258                                  &count, numeric_str, 1));
259   }
260   if (useFieldDerAux) {
261     PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
262 "      %s%d   gradA[%d]; //[1]; // $\\nabla a(x_q)$, Value of the auxiliary field gradient at $x_q$\n",
263                                  &count, numeric_str, dim, 1));
264   }
265   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
266 "\n"
267 "      for (int comp = 0; comp < N_comp; ++comp) {\n",
268                                &count));
269   if (useField) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "        u[comp] = 0.0;\n", &count));
270   if (useFieldDer) {
271     switch (dim) {
272     case 1:
273       PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "        gradU[comp].x = 0.0;\n", &count));break;
274     case 2:
275       PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "        gradU[comp].x = 0.0; gradU[comp].y = 0.0;\n", &count));break;
276     case 3:
277       PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "        gradU[comp].x = 0.0; gradU[comp].y = 0.0; gradU[comp].z = 0.0;\n", &count));break;
278     }
279   }
280   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
281 "      }\n",
282                                &count));
283   if (useFieldAux) {
284     PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      a[0] = 0.0;\n", &count));
285   }
286   if (useFieldDerAux) {
287     switch (dim) {
288     case 1:
289       PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      gradA[0].x = 0.0;\n", &count));break;
290     case 2:
291       PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      gradA[0].x = 0.0; gradA[0].y = 0.0;\n", &count));break;
292     case 3:
293       PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      gradA[0].x = 0.0; gradA[0].y = 0.0; gradA[0].z = 0.0;\n", &count));break;
294     }
295   }
296   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
297 "      /* Get field and derivatives at this quadrature point */\n"
298 "      for (int i = 0; i < N_b; ++i) {\n"
299 "        for (int comp = 0; comp < N_comp; ++comp) {\n"
300 "          const int b    = i*N_comp+comp;\n"
301 "          const int pidx = qidx*N_bt + b;\n"
302 "          const int uidx = cell*N_bt + b;\n"
303 "          %s%d   realSpaceDer;\n\n",
304                                &count, numeric_str, dim));
305   if (useField) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,"          u[comp] += u_i[uidx]*phi_i[pidx];\n", &count));
306   if (useFieldDer) {
307     switch (dim) {
308     case 2:
309       PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
310 "          realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y;\n"
311 "          gradU[comp].x += u_i[uidx]*realSpaceDer.x;\n"
312 "          realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y;\n"
313 "          gradU[comp].y += u_i[uidx]*realSpaceDer.y;\n",
314                                    &count));break;
315     case 3:
316       PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
317 "          realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+0]*phiDer_i[pidx].z;\n"
318 "          gradU[comp].x += u_i[uidx]*realSpaceDer.x;\n"
319 "          realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+1]*phiDer_i[pidx].z;\n"
320 "          gradU[comp].y += u_i[uidx]*realSpaceDer.y;\n"
321 "          realSpaceDer.z = invJ[cell*dim*dim+0*dim+2]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+2]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+2]*phiDer_i[pidx].z;\n"
322 "          gradU[comp].z += u_i[uidx]*realSpaceDer.z;\n",
323                                    &count));break;
324     }
325   }
326   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
327 "        }\n"
328 "      }\n",
329                                &count));
330   if (useFieldAux) {
331     PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,"          a[0] += a_i[cell];\n", &count));
332   }
333   /* Calculate residual at quadrature points: Should be generated by an weak form egine */
334   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
335 "      /* Process values at quadrature points */\n",
336                                &count));
337   switch (op) {
338   case LAPLACIAN:
339     if (useF0) {PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      f_0[fidx] = 4.0;\n", &count));}
340     if (useF1) {
341       if (useAux) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      f_1[fidx] = a[0]*gradU[cidx];\n", &count));
342       else        PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      f_1[fidx] = gradU[cidx];\n", &count));
343     }
344     break;
345   case ELASTICITY:
346     if (useF0) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      f_0[fidx] = 4.0;\n", &count));
347     if (useF1) {
348     switch (dim) {
349     case 2:
350       PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
351 "      switch (cidx) {\n"
352 "      case 0:\n"
353 "        f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[0].x + gradU[0].x);\n"
354 "        f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[0].y + gradU[1].x);\n"
355 "        break;\n"
356 "      case 1:\n"
357 "        f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[1].x + gradU[0].y);\n"
358 "        f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[1].y + gradU[1].y);\n"
359 "      }\n",
360                                    &count));break;
361     case 3:
362       PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
363 "      switch (cidx) {\n"
364 "      case 0:\n"
365 "        f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[0].x + gradU[0].x);\n"
366 "        f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[0].y + gradU[1].x);\n"
367 "        f_1[fidx].z = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[0].z + gradU[2].x);\n"
368 "        break;\n"
369 "      case 1:\n"
370 "        f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[1].x + gradU[0].y);\n"
371 "        f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[1].y + gradU[1].y);\n"
372 "        f_1[fidx].z = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[1].y + gradU[2].y);\n"
373 "        break;\n"
374 "      case 2:\n"
375 "        f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[2].x + gradU[0].z);\n"
376 "        f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[2].y + gradU[1].z);\n"
377 "        f_1[fidx].z = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[2].y + gradU[2].z);\n"
378 "      }\n",
379                                    &count));break;
380     }}
381     break;
382   default:
383     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "PDE operator %d is not supported", op);
384   }
385   if (useF0) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,"      f_0[fidx] *= detJ[cell]*w;\n", &count));
386   if (useF1) {
387     switch (dim) {
388     case 1:
389       PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,"      f_1[fidx].x *= detJ[cell]*w;\n", &count));break;
390     case 2:
391       PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,"      f_1[fidx].x *= detJ[cell]*w; f_1[fidx].y *= detJ[cell]*w;\n", &count));break;
392     case 3:
393       PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,"      f_1[fidx].x *= detJ[cell]*w; f_1[fidx].y *= detJ[cell]*w; f_1[fidx].z *= detJ[cell]*w;\n", &count));break;
394     }
395   }
396   /* Thread transpose */
397   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
398 "    }\n\n"
399 "    /* ==== TRANSPOSE THREADS ==== */\n"
400 "    barrier(CLK_LOCAL_MEM_FENCE);\n\n",
401                                &count));
402   /* Basis phase */
403   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
404 "    /* Map values at quadrature points to coefficients */\n"
405 "    for (int c = 0; c < N_sbc; ++c) {\n"
406 "      const int cell = c*N_bl*N_q + blbidx; /* Cell number in batch */\n"
407 "\n"
408 "      e_i = 0.0;\n"
409 "      for (int q = 0; q < N_q; ++q) {\n"
410 "        const int pidx = q*N_bt + bidx;\n"
411 "        const int fidx = (cell*N_q + q)*N_comp + cidx;\n"
412 "        %s%d   realSpaceDer;\n\n",
413                                &count, numeric_str, dim));
414 
415   if (useF0) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,"        e_i += phi_i[pidx]*f_0[fidx];\n", &count));
416   if (useF1) {
417     switch (dim) {
418     case 2:
419       PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
420 "        realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y;\n"
421 "        e_i           += realSpaceDer.x*f_1[fidx].x;\n"
422 "        realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y;\n"
423 "        e_i           += realSpaceDer.y*f_1[fidx].y;\n",
424                                    &count));break;
425     case 3:
426       PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
427 "        realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+0]*phiDer_i[pidx].z;\n"
428 "        e_i           += realSpaceDer.x*f_1[fidx].x;\n"
429 "        realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+1]*phiDer_i[pidx].z;\n"
430 "        e_i           += realSpaceDer.y*f_1[fidx].y;\n"
431 "        realSpaceDer.z = invJ[cell*dim*dim+0*dim+2]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+2]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+2]*phiDer_i[pidx].z;\n"
432 "        e_i           += realSpaceDer.z*f_1[fidx].z;\n",
433                                    &count));break;
434     }
435   }
436   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
437 "      }\n"
438 "      /* Write element vector for N_{cbc} cells at a time */\n"
439 "      elemVec[(Goffset + batch*N_bc + c*N_bl*N_q)*N_bt + tidx] = e_i;\n"
440 "    }\n"
441 "    /* ==== Could do one write per batch ==== */\n"
442 "  }\n"
443 "  return;\n"
444 "}\n",
445                                &count));
446   PetscFunctionReturn(0);
447 }
448 
449 static PetscErrorCode PetscFEOpenCLGetIntegrationKernel(PetscFE fem, PetscBool useAux, cl_program *ocl_prog, cl_kernel *ocl_kernel)
450 {
451   PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data;
452   PetscInt        dim, N_bl;
453   PetscBool       flg;
454   char           *buffer;
455   size_t          len;
456   char            errMsg[8192];
457   cl_int          err;
458   PetscErrorCode  ierr;
459 
460   PetscFunctionBegin;
461   PetscCall(PetscFEGetSpatialDimension(fem, &dim));
462   PetscCall(PetscMalloc1(8192, &buffer));
463   PetscCall(PetscFEGetTileSizes(fem, NULL, &N_bl, NULL, NULL));
464   PetscCall(PetscFEOpenCLGenerateIntegrationCode(fem, &buffer, 8192, useAux, N_bl));
465   PetscCall(PetscOptionsHasName(((PetscObject)fem)->options,((PetscObject)fem)->prefix, "-petscfe_opencl_kernel_print", &flg));
466   if (flg) PetscCall(PetscPrintf(PetscObjectComm((PetscObject) fem), "OpenCL FE Integration Kernel:\n%s\n", buffer));
467   PetscCall(PetscStrlen(buffer,&len));
468   *ocl_prog = clCreateProgramWithSource(ocl->ctx_id, 1, (const char **) &buffer, &len, &err);PetscCall(err);
469   err = clBuildProgram(*ocl_prog, 0, NULL, NULL, NULL, NULL);
470   if (err != CL_SUCCESS) {
471     err = clGetProgramBuildInfo(*ocl_prog, ocl->dev_id, CL_PROGRAM_BUILD_LOG, 8192*sizeof(char), &errMsg, NULL);
472     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Build failed! Log:\n %s", errMsg);
473   }
474   PetscCall(PetscFree(buffer));
475   *ocl_kernel = clCreateKernel(*ocl_prog, "integrateElementQuadrature", &ierr);
476   PetscFunctionReturn(0);
477 }
478 
479 static PetscErrorCode PetscFEOpenCLCalculateGrid(PetscFE fem, PetscInt N, PetscInt blockSize, size_t *x, size_t *y, size_t *z)
480 {
481   const PetscInt Nblocks = N/blockSize;
482 
483   PetscFunctionBegin;
484   PetscCheck(!(N % blockSize),PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid block size %d for %d elements", blockSize, N);
485   *z = 1;
486   *y = 1;
487   for (*x = (size_t) (PetscSqrtReal(Nblocks) + 0.5); *x > 0; --*x) {
488     *y = Nblocks / *x;
489     if (*x * *y == (size_t)Nblocks) break;
490   }
491   PetscCheck(*x * *y == (size_t)Nblocks,PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Could not find partition for %D with block size %D", N, blockSize);
492   PetscFunctionReturn(0);
493 }
494 
495 static PetscErrorCode PetscFEOpenCLLogResidual(PetscFE fem, PetscLogDouble time, PetscLogDouble flops)
496 {
497   PetscFE_OpenCL   *ocl = (PetscFE_OpenCL *) fem->data;
498   PetscStageLog     stageLog;
499   PetscEventPerfLog eventLog = NULL;
500   int               stage;
501 
502   PetscFunctionBegin;
503   PetscCall(PetscLogGetStageLog(&stageLog));
504   PetscCall(PetscStageLogGetCurrent(stageLog, &stage));
505   PetscCall(PetscStageLogGetEventPerfLog(stageLog, stage, &eventLog));
506     /* Log performance info */
507   eventLog->eventInfo[ocl->residualEvent].count++;
508   eventLog->eventInfo[ocl->residualEvent].time  += time;
509   eventLog->eventInfo[ocl->residualEvent].flops += flops;
510   PetscFunctionReturn(0);
511 }
512 
513 static PetscErrorCode PetscFEIntegrateResidual_OpenCL(PetscDS prob, PetscFormKey key, PetscInt Ne, PetscFEGeom *cgeom,
514                                                       const PetscScalar coefficients[], const PetscScalar coefficients_t[], PetscDS probAux, const PetscScalar coefficientsAux[], PetscReal t, PetscScalar elemVec[])
515 {
516   /* Nbc = batchSize */
517   PetscFE           fem;
518   PetscFE_OpenCL   *ocl;
519   PetscPointFunc    f0_func;
520   PetscPointFunc    f1_func;
521   PetscQuadrature   q;
522   PetscInt          dim, qNc;
523   PetscInt          N_b;    /* The number of basis functions */
524   PetscInt          N_comp; /* The number of basis function components */
525   PetscInt          N_bt;   /* The total number of scalar basis functions */
526   PetscInt          N_q;    /* The number of quadrature points */
527   PetscInt          N_bst;  /* The block size, LCM(N_bt, N_q), Notice that a block is not process simultaneously */
528   PetscInt          N_t;    /* The number of threads, N_bst * N_bl */
529   PetscInt          N_bl;   /* The number of blocks */
530   PetscInt          N_bc;   /* The batch size, N_bl*N_q*N_b */
531   PetscInt          N_cb;   /* The number of batches */
532   const PetscInt    field = key.field;
533   PetscInt          numFlops, f0Flops = 0, f1Flops = 0;
534   PetscBool         useAux      = probAux ? PETSC_TRUE : PETSC_FALSE;
535   PetscBool         useField    = PETSC_FALSE;
536   PetscBool         useFieldDer = PETSC_TRUE;
537   PetscBool         useF0       = PETSC_TRUE;
538   PetscBool         useF1       = PETSC_TRUE;
539   /* OpenCL variables */
540   cl_program        ocl_prog;
541   cl_kernel         ocl_kernel;
542   cl_event          ocl_ev;         /* The event for tracking kernel execution */
543   cl_ulong          ns_start;       /* Nanoseconds counter on GPU at kernel start */
544   cl_ulong          ns_end;         /* Nanoseconds counter on GPU at kernel stop */
545   cl_mem            o_jacobianInverses, o_jacobianDeterminants;
546   cl_mem            o_coefficients, o_coefficientsAux, o_elemVec;
547   float            *f_coeff = NULL, *f_coeffAux = NULL, *f_invJ = NULL, *f_detJ = NULL;
548   double           *d_coeff = NULL, *d_coeffAux = NULL, *d_invJ = NULL, *d_detJ = NULL;
549   PetscReal        *r_invJ = NULL, *r_detJ = NULL;
550   void             *oclCoeff, *oclCoeffAux, *oclInvJ, *oclDetJ;
551   size_t            local_work_size[3], global_work_size[3];
552   size_t            realSize, x, y, z;
553   const PetscReal   *points, *weights;
554   PetscErrorCode    ierr;
555 
556   PetscFunctionBegin;
557   PetscCall(PetscDSGetDiscretization(prob, field, (PetscObject *) &fem));
558   ocl  = (PetscFE_OpenCL *) fem->data;
559   if (!Ne) {PetscCall(PetscFEOpenCLLogResidual(fem, 0.0, 0.0)); PetscFunctionReturn(0);}
560   PetscCall(PetscFEGetSpatialDimension(fem, &dim));
561   PetscCall(PetscFEGetQuadrature(fem, &q));
562   PetscCall(PetscQuadratureGetData(q, NULL, &qNc, &N_q, &points, &weights));
563   PetscCheck(qNc == 1,PETSC_COMM_SELF, PETSC_ERR_SUP, "Only supports scalar quadrature, not %D components", qNc);
564   PetscCall(PetscFEGetDimension(fem, &N_b));
565   PetscCall(PetscFEGetNumComponents(fem, &N_comp));
566   PetscCall(PetscDSGetResidual(prob, field, &f0_func, &f1_func));
567   PetscCall(PetscFEGetTileSizes(fem, NULL, &N_bl, &N_bc, &N_cb));
568   N_bt  = N_b*N_comp;
569   N_bst = N_bt*N_q;
570   N_t   = N_bst*N_bl;
571   PetscCheck(N_bc*N_comp == N_t,PETSC_COMM_SELF, PETSC_ERR_PLIB, "Number of threads %d should be %d * %d", N_t, N_bc, N_comp);
572   /* Calculate layout */
573   if (Ne % (N_cb*N_bc)) { /* Remainder cells */
574     PetscCall(PetscFEIntegrateResidual_Basic(prob, key, Ne, cgeom, coefficients, coefficients_t, probAux, coefficientsAux, t, elemVec));
575     PetscFunctionReturn(0);
576   }
577   PetscCall(PetscFEOpenCLCalculateGrid(fem, Ne, N_cb*N_bc, &x, &y, &z));
578   local_work_size[0]  = N_bc*N_comp;
579   local_work_size[1]  = 1;
580   local_work_size[2]  = 1;
581   global_work_size[0] = x * local_work_size[0];
582   global_work_size[1] = y * local_work_size[1];
583   global_work_size[2] = z * local_work_size[2];
584   PetscCall(PetscInfo(fem, "GPU layout grid(%d,%d,%d) block(%d,%d,%d) with %d batches\n", x, y, z, local_work_size[0], local_work_size[1], local_work_size[2], N_cb));
585   PetscCall(PetscInfo(fem, " N_t: %d, N_cb: %d\n", N_t, N_cb));
586   /* Generate code */
587   if (probAux) {
588     PetscSpace P;
589     PetscInt   NfAux, order, f;
590 
591     PetscCall(PetscDSGetNumFields(probAux, &NfAux));
592     for (f = 0; f < NfAux; ++f) {
593       PetscFE feAux;
594 
595       PetscCall(PetscDSGetDiscretization(probAux, f, (PetscObject *) &feAux));
596       PetscCall(PetscFEGetBasisSpace(feAux, &P));
597       PetscCall(PetscSpaceGetDegree(P, &order, NULL));
598       PetscCheck(order <= 0,PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Can only handle P0 coefficient fields");
599     }
600   }
601   PetscCall(PetscFEOpenCLGetIntegrationKernel(fem, useAux, &ocl_prog, &ocl_kernel));
602   /* Create buffers on the device and send data over */
603   PetscCall(PetscDataTypeGetSize(ocl->realType, &realSize));
604   PetscCheck(cgeom->numPoints <= 1,PETSC_COMM_SELF, PETSC_ERR_SUP, "Only support affine geometry for OpenCL integration right now");
605   if (sizeof(PetscReal) != realSize) {
606     switch (ocl->realType) {
607     case PETSC_FLOAT:
608     {
609       PetscInt c, b, d;
610 
611       PetscCall(PetscMalloc4(Ne*N_bt,&f_coeff,Ne,&f_coeffAux,Ne*dim*dim,&f_invJ,Ne,&f_detJ));
612       for (c = 0; c < Ne; ++c) {
613         f_detJ[c] = (float) cgeom->detJ[c];
614         for (d = 0; d < dim*dim; ++d) {
615           f_invJ[c*dim*dim+d] = (float) cgeom->invJ[c * dim * dim + d];
616         }
617         for (b = 0; b < N_bt; ++b) {
618           f_coeff[c*N_bt+b] = (float) coefficients[c*N_bt+b];
619         }
620       }
621       if (coefficientsAux) { /* Assume P0 */
622         for (c = 0; c < Ne; ++c) {
623           f_coeffAux[c] = (float) coefficientsAux[c];
624         }
625       }
626       oclCoeff      = (void *) f_coeff;
627       if (coefficientsAux) {
628         oclCoeffAux = (void *) f_coeffAux;
629       } else {
630         oclCoeffAux = NULL;
631       }
632       oclInvJ       = (void *) f_invJ;
633       oclDetJ       = (void *) f_detJ;
634     }
635     break;
636     case PETSC_DOUBLE:
637     {
638       PetscInt c, b, d;
639 
640       PetscCall(PetscMalloc4(Ne*N_bt,&d_coeff,Ne,&d_coeffAux,Ne*dim*dim,&d_invJ,Ne,&d_detJ));
641       for (c = 0; c < Ne; ++c) {
642         d_detJ[c] = (double) cgeom->detJ[c];
643         for (d = 0; d < dim*dim; ++d) {
644           d_invJ[c*dim*dim+d] = (double) cgeom->invJ[c * dim * dim + d];
645         }
646         for (b = 0; b < N_bt; ++b) {
647           d_coeff[c*N_bt+b] = (double) coefficients[c*N_bt+b];
648         }
649       }
650       if (coefficientsAux) { /* Assume P0 */
651         for (c = 0; c < Ne; ++c) {
652           d_coeffAux[c] = (double) coefficientsAux[c];
653         }
654       }
655       oclCoeff      = (void *) d_coeff;
656       if (coefficientsAux) {
657         oclCoeffAux = (void *) d_coeffAux;
658       } else {
659         oclCoeffAux = NULL;
660       }
661       oclInvJ       = (void *) d_invJ;
662       oclDetJ       = (void *) d_detJ;
663     }
664     break;
665     default:
666       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unsupported PETSc type %d", ocl->realType);
667     }
668   } else {
669     PetscInt c, d;
670 
671     PetscCall(PetscMalloc2(Ne*dim*dim,&r_invJ,Ne,&r_detJ));
672     for (c = 0; c < Ne; ++c) {
673       r_detJ[c] = cgeom->detJ[c];
674       for (d = 0; d < dim*dim; ++d) {
675         r_invJ[c*dim*dim+d] = cgeom->invJ[c * dim * dim + d];
676       }
677     }
678     oclCoeff    = (void *) coefficients;
679     oclCoeffAux = (void *) coefficientsAux;
680     oclInvJ     = (void *) r_invJ;
681     oclDetJ     = (void *) r_detJ;
682   }
683   o_coefficients         = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne*N_bt    * realSize, oclCoeff,    &ierr);
684   if (coefficientsAux) {
685     o_coefficientsAux    = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne         * realSize, oclCoeffAux, &ierr);
686   } else {
687     o_coefficientsAux    = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY,                        Ne         * realSize, oclCoeffAux, &ierr);
688   }
689   o_jacobianInverses     = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne*dim*dim * realSize, oclInvJ,     &ierr);
690   o_jacobianDeterminants = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne         * realSize, oclDetJ,     &ierr);
691   o_elemVec              = clCreateBuffer(ocl->ctx_id, CL_MEM_WRITE_ONLY,                       Ne*N_bt    * realSize, NULL,        &ierr);
692   /* Kernel launch */
693   PetscCall(clSetKernelArg(ocl_kernel, 0, sizeof(cl_int), (void*) &N_cb));
694   PetscCall(clSetKernelArg(ocl_kernel, 1, sizeof(cl_mem), (void*) &o_coefficients));
695   PetscCall(clSetKernelArg(ocl_kernel, 2, sizeof(cl_mem), (void*) &o_coefficientsAux));
696   PetscCall(clSetKernelArg(ocl_kernel, 3, sizeof(cl_mem), (void*) &o_jacobianInverses));
697   PetscCall(clSetKernelArg(ocl_kernel, 4, sizeof(cl_mem), (void*) &o_jacobianDeterminants));
698   PetscCall(clSetKernelArg(ocl_kernel, 5, sizeof(cl_mem), (void*) &o_elemVec));
699   PetscCall(clEnqueueNDRangeKernel(ocl->queue_id, ocl_kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &ocl_ev));
700   /* Read data back from device */
701   if (sizeof(PetscReal) != realSize) {
702     switch (ocl->realType) {
703     case PETSC_FLOAT:
704     {
705       float   *elem;
706       PetscInt c, b;
707 
708       PetscCall(PetscFree4(f_coeff,f_coeffAux,f_invJ,f_detJ));
709       PetscCall(PetscMalloc1(Ne*N_bt, &elem));
710       PetscCall(clEnqueueReadBuffer(ocl->queue_id, o_elemVec, CL_TRUE, 0, Ne*N_bt * realSize, elem, 0, NULL, NULL));
711       for (c = 0; c < Ne; ++c) {
712         for (b = 0; b < N_bt; ++b) {
713           elemVec[c*N_bt+b] = (PetscScalar) elem[c*N_bt+b];
714         }
715       }
716       PetscCall(PetscFree(elem));
717     }
718     break;
719     case PETSC_DOUBLE:
720     {
721       double  *elem;
722       PetscInt c, b;
723 
724       PetscCall(PetscFree4(d_coeff,d_coeffAux,d_invJ,d_detJ));
725       PetscCall(PetscMalloc1(Ne*N_bt, &elem));
726       PetscCall(clEnqueueReadBuffer(ocl->queue_id, o_elemVec, CL_TRUE, 0, Ne*N_bt * realSize, elem, 0, NULL, NULL));
727       for (c = 0; c < Ne; ++c) {
728         for (b = 0; b < N_bt; ++b) {
729           elemVec[c*N_bt+b] = (PetscScalar) elem[c*N_bt+b];
730         }
731       }
732       PetscCall(PetscFree(elem));
733     }
734     break;
735     default:
736       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unsupported PETSc type %d", ocl->realType);
737     }
738   } else {
739     PetscCall(PetscFree2(r_invJ,r_detJ));
740     PetscCall(clEnqueueReadBuffer(ocl->queue_id, o_elemVec, CL_TRUE, 0, Ne*N_bt * realSize, elemVec, 0, NULL, NULL));
741   }
742   /* Log performance */
743   PetscCall(clGetEventProfilingInfo(ocl_ev, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &ns_start, NULL));
744   PetscCall(clGetEventProfilingInfo(ocl_ev, CL_PROFILING_COMMAND_END,   sizeof(cl_ulong), &ns_end,   NULL));
745   f0Flops = 0;
746   switch (ocl->op) {
747   case LAPLACIAN:
748     f1Flops = useAux ? dim : 0;break;
749   case ELASTICITY:
750     f1Flops = 2*dim*dim;break;
751   }
752   numFlops = Ne*(
753     N_q*(
754       N_b*N_comp*((useField ? 2 : 0) + (useFieldDer ? 2*dim*(dim + 1) : 0))
755       /*+
756        N_ba*N_compa*((useFieldAux ? 2 : 0) + (useFieldDerAux ? 2*dim*(dim + 1) : 0))*/
757       +
758       N_comp*((useF0 ? f0Flops + 2 : 0) + (useF1 ? f1Flops + 2*dim : 0)))
759     +
760     N_b*((useF0 ? 2 : 0) + (useF1 ? 2*dim*(dim + 1) : 0)));
761   PetscCall(PetscFEOpenCLLogResidual(fem, (ns_end - ns_start)*1.0e-9, numFlops));
762   /* Cleanup */
763   PetscCall(clReleaseMemObject(o_coefficients));
764   PetscCall(clReleaseMemObject(o_coefficientsAux));
765   PetscCall(clReleaseMemObject(o_jacobianInverses));
766   PetscCall(clReleaseMemObject(o_jacobianDeterminants));
767   PetscCall(clReleaseMemObject(o_elemVec));
768   PetscCall(clReleaseKernel(ocl_kernel));
769   PetscCall(clReleaseProgram(ocl_prog));
770   PetscFunctionReturn(0);
771 }
772 
773 PETSC_INTERN PetscErrorCode PetscFESetUp_Basic(PetscFE);
774 PETSC_INTERN PetscErrorCode PetscFECreateTabulation_Basic(PetscFE, PetscInt, const PetscReal [], PetscInt, PetscTabulation);
775 
776 static PetscErrorCode PetscFEInitialize_OpenCL(PetscFE fem)
777 {
778   PetscFunctionBegin;
779   fem->ops->setfromoptions          = NULL;
780   fem->ops->setup                   = PetscFESetUp_Basic;
781   fem->ops->view                    = NULL;
782   fem->ops->destroy                 = PetscFEDestroy_OpenCL;
783   fem->ops->getdimension            = PetscFEGetDimension_Basic;
784   fem->ops->createtabulation        = PetscFECreateTabulation_Basic;
785   fem->ops->integrateresidual       = PetscFEIntegrateResidual_OpenCL;
786   fem->ops->integratebdresidual     = NULL/* PetscFEIntegrateBdResidual_OpenCL */;
787   fem->ops->integratejacobianaction = NULL/* PetscFEIntegrateJacobianAction_OpenCL */;
788   fem->ops->integratejacobian       = PetscFEIntegrateJacobian_Basic;
789   PetscFunctionReturn(0);
790 }
791 
792 /*MC
793   PETSCFEOPENCL = "opencl" - A PetscFE object that integrates using a vectorized OpenCL implementation
794 
795   Level: intermediate
796 
797 .seealso: PetscFEType, PetscFECreate(), PetscFESetType()
798 M*/
799 
800 PETSC_EXTERN PetscErrorCode PetscFECreate_OpenCL(PetscFE fem)
801 {
802   PetscFE_OpenCL *ocl;
803   cl_uint         num_platforms;
804   cl_platform_id  platform_ids[42];
805   cl_uint         num_devices;
806   cl_device_id    device_ids[42];
807   cl_int          err;
808 
809   PetscFunctionBegin;
810   PetscValidHeaderSpecific(fem, PETSCFE_CLASSID, 1);
811   PetscCall(PetscNewLog(fem,&ocl));
812   fem->data = ocl;
813 
814   /* Init Platform */
815   PetscCall(clGetPlatformIDs(42, platform_ids, &num_platforms));
816   PetscCheck(num_platforms,PetscObjectComm((PetscObject) fem), PETSC_ERR_SUP, "No OpenCL platform found.");
817   ocl->pf_id = platform_ids[0];
818   /* Init Device */
819   PetscCall(clGetDeviceIDs(ocl->pf_id, CL_DEVICE_TYPE_ALL, 42, device_ids, &num_devices));
820   PetscCheck(num_devices,PetscObjectComm((PetscObject) fem), PETSC_ERR_SUP, "No OpenCL device found.");
821   ocl->dev_id = device_ids[0];
822   /* Create context with one command queue */
823   ocl->ctx_id   = clCreateContext(0, 1, &(ocl->dev_id), NULL, NULL, &err);PetscCall(err);
824   ocl->queue_id = clCreateCommandQueue(ocl->ctx_id, ocl->dev_id, CL_QUEUE_PROFILING_ENABLE, &err);PetscCall(err);
825   /* Types */
826   ocl->realType = PETSC_FLOAT;
827   /* Register events */
828   PetscCall(PetscLogEventRegister("OpenCL FEResidual", PETSCFE_CLASSID, &ocl->residualEvent));
829   /* Equation handling */
830   ocl->op = LAPLACIAN;
831 
832   PetscCall(PetscFEInitialize_OpenCL(fem));
833   PetscFunctionReturn(0);
834 }
835 
836 /*@
837   PetscFEOpenCLSetRealType - Set the scalar type for running on the accelerator
838 
839   Input Parameters:
840 + fem      - The PetscFE
841 - realType - The scalar type
842 
843   Level: developer
844 
845 .seealso: PetscFEOpenCLGetRealType()
846 @*/
847 PetscErrorCode PetscFEOpenCLSetRealType(PetscFE fem, PetscDataType realType)
848 {
849   PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data;
850 
851   PetscFunctionBegin;
852   PetscValidHeaderSpecific(fem, PETSCFE_CLASSID, 1);
853   ocl->realType = realType;
854   PetscFunctionReturn(0);
855 }
856 
857 /*@
858   PetscFEOpenCLGetRealType - Get the scalar type for running on the accelerator
859 
860   Input Parameter:
861 . fem      - The PetscFE
862 
863   Output Parameter:
864 . realType - The scalar type
865 
866   Level: developer
867 
868 .seealso: PetscFEOpenCLSetRealType()
869 @*/
870 PetscErrorCode PetscFEOpenCLGetRealType(PetscFE fem, PetscDataType *realType)
871 {
872   PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data;
873 
874   PetscFunctionBegin;
875   PetscValidHeaderSpecific(fem, PETSCFE_CLASSID, 1);
876   PetscValidPointer(realType, 2);
877   *realType = ocl->realType;
878   PetscFunctionReturn(0);
879 }
880 
881 #endif /* PETSC_HAVE_OPENCL */
882