120cf1dd8SToby Isaac #include <petsc/private/petscfeimpl.h> /*I "petscfe.h" I*/
2b665b14eSToby Isaac #include <petsc/private/loghandlerimpl.h>
3b665b14eSToby Isaac #include <../src/sys/logging/handler/impls/default/logdefault.h>
420cf1dd8SToby Isaac
57be5e748SToby Isaac #if defined(PETSC_HAVE_OPENCL)
620cf1dd8SToby Isaac
PetscFEDestroy_OpenCL(PetscFE fem)7d71ae5a4SJacob Faibussowitsch static PetscErrorCode PetscFEDestroy_OpenCL(PetscFE fem)
8d71ae5a4SJacob Faibussowitsch {
920cf1dd8SToby Isaac PetscFE_OpenCL *ocl = (PetscFE_OpenCL *)fem->data;
1020cf1dd8SToby Isaac
1120cf1dd8SToby Isaac PetscFunctionBegin;
129566063dSJacob Faibussowitsch PetscCall(clReleaseCommandQueue(ocl->queue_id));
1320cf1dd8SToby Isaac ocl->queue_id = 0;
149566063dSJacob Faibussowitsch PetscCall(clReleaseContext(ocl->ctx_id));
1520cf1dd8SToby Isaac ocl->ctx_id = 0;
169566063dSJacob Faibussowitsch PetscCall(PetscFree(ocl));
173ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
1820cf1dd8SToby Isaac }
1920cf1dd8SToby Isaac
209371c9d4SSatish Balay #define PetscCallSTR(err) \
219371c9d4SSatish Balay do { \
229371c9d4SSatish Balay PetscCall(err); \
239371c9d4SSatish Balay string_tail += count; \
249371c9d4SSatish Balay PetscCheck(string_tail != end_of_buffer, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Buffer overflow"); \
259371c9d4SSatish Balay } while (0)
269371c9d4SSatish Balay enum {
279371c9d4SSatish Balay LAPLACIAN = 0,
289371c9d4SSatish Balay ELASTICITY = 1
299371c9d4SSatish Balay };
3020cf1dd8SToby Isaac
3120cf1dd8SToby Isaac /* NOTE: This is now broken for vector problems. Must redo loops to respect vector basis elements */
3220cf1dd8SToby Isaac /* dim Number of spatial dimensions: 2 */
3320cf1dd8SToby Isaac /* N_b Number of basis functions: generated */
3420cf1dd8SToby Isaac /* N_{bt} Number of total basis functions: N_b * N_{comp} */
3520cf1dd8SToby Isaac /* N_q Number of quadrature points: generated */
3620cf1dd8SToby Isaac /* N_{bs} Number of block cells LCM(N_b, N_q) */
3720cf1dd8SToby Isaac /* N_{bst} Number of block cell components LCM(N_{bt}, N_q) */
3820cf1dd8SToby Isaac /* N_{bl} Number of concurrent blocks generated */
3920cf1dd8SToby Isaac /* N_t Number of threads: N_{bl} * N_{bs} */
4020cf1dd8SToby Isaac /* N_{cbc} Number of concurrent basis cells: N_{bl} * N_q */
4120cf1dd8SToby Isaac /* N_{cqc} Number of concurrent quadrature cells: N_{bl} * N_b */
4220cf1dd8SToby Isaac /* N_{sbc} Number of serial basis cells: N_{bs} / N_q */
4320cf1dd8SToby Isaac /* N_{sqc} Number of serial quadrature cells: N_{bs} / N_b */
4420cf1dd8SToby Isaac /* N_{cb} Number of serial cell batches: input */
4520cf1dd8SToby Isaac /* N_c Number of total cells: N_{cb}*N_{t}/N_{comp} */
PetscFEOpenCLGenerateIntegrationCode(PetscFE fem,char ** string_buffer,PetscInt buffer_length,PetscBool useAux,PetscInt N_bl)46d71ae5a4SJacob Faibussowitsch static PetscErrorCode PetscFEOpenCLGenerateIntegrationCode(PetscFE fem, char **string_buffer, PetscInt buffer_length, PetscBool useAux, PetscInt N_bl)
47d71ae5a4SJacob Faibussowitsch {
4820cf1dd8SToby Isaac PetscFE_OpenCL *ocl = (PetscFE_OpenCL *)fem->data;
4920cf1dd8SToby Isaac PetscQuadrature q;
5020cf1dd8SToby Isaac char *string_tail = *string_buffer;
5120cf1dd8SToby Isaac char *end_of_buffer = *string_buffer + buffer_length;
5220cf1dd8SToby Isaac char float_str[] = "float", double_str[] = "double";
53f4f49eeaSPierre Jolivet char *numeric_str = &float_str[0];
5420cf1dd8SToby Isaac PetscInt op = ocl->op;
5520cf1dd8SToby Isaac PetscBool useField = PETSC_FALSE;
5620cf1dd8SToby Isaac PetscBool useFieldDer = PETSC_TRUE;
5720cf1dd8SToby Isaac PetscBool useFieldAux = useAux;
5820cf1dd8SToby Isaac PetscBool useFieldDerAux = PETSC_FALSE;
5920cf1dd8SToby Isaac PetscBool useF0 = PETSC_TRUE;
6020cf1dd8SToby Isaac PetscBool useF1 = PETSC_TRUE;
6120cf1dd8SToby Isaac const PetscReal *points, *weights;
62ef0bb6c7SMatthew G. Knepley PetscTabulation T;
6320cf1dd8SToby Isaac PetscInt dim, qNc, N_b, N_c, N_q, N_t, p, d, b, c;
6420cf1dd8SToby Isaac size_t count;
6520cf1dd8SToby Isaac
6620cf1dd8SToby Isaac PetscFunctionBegin;
679566063dSJacob Faibussowitsch PetscCall(PetscFEGetSpatialDimension(fem, &dim));
689566063dSJacob Faibussowitsch PetscCall(PetscFEGetDimension(fem, &N_b));
699566063dSJacob Faibussowitsch PetscCall(PetscFEGetNumComponents(fem, &N_c));
709566063dSJacob Faibussowitsch PetscCall(PetscFEGetQuadrature(fem, &q));
719566063dSJacob Faibussowitsch PetscCall(PetscQuadratureGetData(q, NULL, &qNc, &N_q, &points, &weights));
725f80ce2aSJacob Faibussowitsch PetscCheck(qNc == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only supports scalar quadrature, not %" PetscInt_FMT " components", qNc);
7320cf1dd8SToby Isaac N_t = N_b * N_c * N_q * N_bl;
7420cf1dd8SToby Isaac /* Enable device extension for double precision */
7520cf1dd8SToby Isaac if (ocl->realType == PETSC_DOUBLE) {
769566063dSJacob Faibussowitsch PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
7720cf1dd8SToby Isaac "#if defined(cl_khr_fp64)\n"
7820cf1dd8SToby Isaac "# pragma OPENCL EXTENSION cl_khr_fp64: enable\n"
7920cf1dd8SToby Isaac "#elif defined(cl_amd_fp64)\n"
8020cf1dd8SToby Isaac "# pragma OPENCL EXTENSION cl_amd_fp64: enable\n"
8120cf1dd8SToby Isaac "#endif\n",
825f80ce2aSJacob Faibussowitsch &count));
83f4f49eeaSPierre Jolivet numeric_str = &double_str[0];
8420cf1dd8SToby Isaac }
8520cf1dd8SToby Isaac /* Kernel API */
869566063dSJacob Faibussowitsch PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
8720cf1dd8SToby Isaac "\n"
8820cf1dd8SToby Isaac "__kernel void integrateElementQuadrature(int N_cb, __global %s *coefficients, __global %s *coefficientsAux, __global %s *jacobianInverses, __global %s *jacobianDeterminants, __global %s *elemVec)\n"
8920cf1dd8SToby Isaac "{\n",
905f80ce2aSJacob Faibussowitsch &count, numeric_str, numeric_str, numeric_str, numeric_str, numeric_str));
9120cf1dd8SToby Isaac /* Quadrature */
929566063dSJacob Faibussowitsch PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
9320cf1dd8SToby Isaac " /* Quadrature points\n"
9420cf1dd8SToby Isaac " - (x1,y1,x2,y2,...) */\n"
9520cf1dd8SToby Isaac " const %s points[%d] = {\n",
965f80ce2aSJacob Faibussowitsch &count, numeric_str, N_q * dim));
9720cf1dd8SToby Isaac for (p = 0; p < N_q; ++p) {
9848a46eb9SPierre Jolivet for (d = 0; d < dim; ++d) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g,\n", &count, points[p * dim + d]));
9920cf1dd8SToby Isaac }
1009566063dSJacob Faibussowitsch PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count));
1019566063dSJacob Faibussowitsch PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
10220cf1dd8SToby Isaac " /* Quadrature weights\n"
10320cf1dd8SToby Isaac " - (v1,v2,...) */\n"
10420cf1dd8SToby Isaac " const %s weights[%d] = {\n",
1055f80ce2aSJacob Faibussowitsch &count, numeric_str, N_q));
10648a46eb9SPierre Jolivet for (p = 0; p < N_q; ++p) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g,\n", &count, weights[p]));
1079566063dSJacob Faibussowitsch PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count));
10820cf1dd8SToby Isaac /* Basis Functions */
1099566063dSJacob Faibussowitsch PetscCall(PetscFEGetCellTabulation(fem, 1, &T));
1109566063dSJacob Faibussowitsch PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
11120cf1dd8SToby Isaac " /* Nodal basis function evaluations\n"
11220cf1dd8SToby Isaac " - basis component is fastest varying, the basis function, then point */\n"
11320cf1dd8SToby Isaac " const %s Basis[%d] = {\n",
1145f80ce2aSJacob Faibussowitsch &count, numeric_str, N_q * N_b * N_c));
11520cf1dd8SToby Isaac for (p = 0; p < N_q; ++p) {
11620cf1dd8SToby Isaac for (b = 0; b < N_b; ++b) {
11748a46eb9SPierre Jolivet for (c = 0; c < N_c; ++c) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g,\n", &count, T->T[0][(p * N_b + b) * N_c + c]));
11820cf1dd8SToby Isaac }
11920cf1dd8SToby Isaac }
1209566063dSJacob Faibussowitsch PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count));
1219566063dSJacob Faibussowitsch PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
12220cf1dd8SToby Isaac "\n"
12320cf1dd8SToby Isaac " /* Nodal basis function derivative evaluations,\n"
12420cf1dd8SToby Isaac " - derivative direction is fastest varying, then basis component, then basis function, then point */\n"
12520cf1dd8SToby Isaac " const %s%d BasisDerivatives[%d] = {\n",
1265f80ce2aSJacob Faibussowitsch &count, numeric_str, dim, N_q * N_b * N_c));
12720cf1dd8SToby Isaac for (p = 0; p < N_q; ++p) {
12820cf1dd8SToby Isaac for (b = 0; b < N_b; ++b) {
12920cf1dd8SToby Isaac for (c = 0; c < N_c; ++c) {
1309566063dSJacob Faibussowitsch PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "(%s%d)(", &count, numeric_str, dim));
13120cf1dd8SToby Isaac for (d = 0; d < dim; ++d) {
13220cf1dd8SToby Isaac if (d > 0) {
1339566063dSJacob Faibussowitsch PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, ", %g", &count, T->T[1][((p * N_b + b) * dim + d) * N_c + c]));
13420cf1dd8SToby Isaac } else {
1359566063dSJacob Faibussowitsch PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g", &count, T->T[1][((p * N_b + b) * dim + d) * N_c + c]));
13620cf1dd8SToby Isaac }
13720cf1dd8SToby Isaac }
1389566063dSJacob Faibussowitsch PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "),\n", &count));
13920cf1dd8SToby Isaac }
14020cf1dd8SToby Isaac }
14120cf1dd8SToby Isaac }
1429566063dSJacob Faibussowitsch PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count));
14320cf1dd8SToby Isaac /* Sizes */
1449566063dSJacob Faibussowitsch PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
14520cf1dd8SToby Isaac " const int dim = %d; // The spatial dimension\n"
14620cf1dd8SToby Isaac " const int N_bl = %d; // The number of concurrent blocks\n"
14720cf1dd8SToby Isaac " const int N_b = %d; // The number of basis functions\n"
14820cf1dd8SToby Isaac " const int N_comp = %d; // The number of basis function components\n"
14920cf1dd8SToby Isaac " const int N_bt = N_b*N_comp; // The total number of scalar basis functions\n"
15020cf1dd8SToby Isaac " const int N_q = %d; // The number of quadrature points\n"
15120cf1dd8SToby Isaac " const int N_bst = N_bt*N_q; // The block size, LCM(N_b*N_comp, N_q), Notice that a block is not processed simultaneously\n"
15220cf1dd8SToby Isaac " const int N_t = N_bst*N_bl; // The number of threads, N_bst * N_bl\n"
15320cf1dd8SToby Isaac " const int N_bc = N_t/N_comp; // The number of cells per batch (N_b*N_q*N_bl)\n"
15420cf1dd8SToby Isaac " const int N_sbc = N_bst / (N_q * N_comp);\n"
15520cf1dd8SToby Isaac " const int N_sqc = N_bst / N_bt;\n"
15620cf1dd8SToby Isaac " /*const int N_c = N_cb * N_bc;*/\n"
15720cf1dd8SToby Isaac "\n"
15820cf1dd8SToby Isaac " /* Calculated indices */\n"
15920cf1dd8SToby Isaac " /*const int tidx = get_local_id(0) + get_local_size(0)*get_local_id(1);*/\n"
16020cf1dd8SToby Isaac " const int tidx = get_local_id(0);\n"
16120cf1dd8SToby Isaac " const int blidx = tidx / N_bst; // Block number for this thread\n"
16220cf1dd8SToby Isaac " const int bidx = tidx %% N_bt; // Basis function mapped to this thread\n"
16320cf1dd8SToby Isaac " const int cidx = tidx %% N_comp; // Basis component mapped to this thread\n"
16420cf1dd8SToby Isaac " const int qidx = tidx %% N_q; // Quadrature point mapped to this thread\n"
16520cf1dd8SToby Isaac " const int blbidx = tidx %% N_q + blidx*N_q; // Cell mapped to this thread in the basis phase\n"
16620cf1dd8SToby Isaac " const int blqidx = tidx %% N_b + blidx*N_b; // Cell mapped to this thread in the quadrature phase\n"
16720cf1dd8SToby Isaac " const int gidx = get_group_id(1)*get_num_groups(0) + get_group_id(0);\n"
16820cf1dd8SToby Isaac " const int Goffset = gidx*N_cb*N_bc;\n",
1695f80ce2aSJacob Faibussowitsch &count, dim, N_bl, N_b, N_c, N_q));
17020cf1dd8SToby Isaac /* Local memory */
1719566063dSJacob Faibussowitsch PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
17220cf1dd8SToby Isaac "\n"
17320cf1dd8SToby Isaac " /* Quadrature data */\n"
17420cf1dd8SToby Isaac " %s w; // $w_q$, Quadrature weight at $x_q$\n"
17520cf1dd8SToby Isaac " __local %s phi_i[%d]; //[N_bt*N_q]; // $\\phi_i(x_q)$, Value of the basis function $i$ at $x_q$\n"
17620cf1dd8SToby Isaac " __local %s%d phiDer_i[%d]; //[N_bt*N_q]; // $\\frac{\\partial\\phi_i(x_q)}{\\partial x_d}$, Value of the derivative of basis function $i$ in direction $x_d$ at $x_q$\n"
17720cf1dd8SToby Isaac " /* Geometric data */\n"
17820cf1dd8SToby Isaac " __local %s detJ[%d]; //[N_t]; // $|J(x_q)|$, Jacobian determinant at $x_q$\n"
17920cf1dd8SToby Isaac " __local %s invJ[%d];//[N_t*dim*dim]; // $J^{-1}(x_q)$, Jacobian inverse at $x_q$\n",
1809371c9d4SSatish Balay &count, numeric_str, numeric_str, N_b * N_c * N_q, numeric_str, dim, N_b * N_c * N_q, numeric_str, N_t, numeric_str, N_t * dim * dim));
1819566063dSJacob Faibussowitsch PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
18220cf1dd8SToby Isaac " /* FEM data */\n"
18320cf1dd8SToby Isaac " __local %s u_i[%d]; //[N_t*N_bt]; // Coefficients $u_i$ of the field $u|_{\\mathcal{T}} = \\sum_i u_i \\phi_i$\n",
1845f80ce2aSJacob Faibussowitsch &count, numeric_str, N_t * N_b * N_c));
18520cf1dd8SToby Isaac if (useAux) {
1869371c9d4SSatish Balay PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " __local %s a_i[%d]; //[N_t]; // Coefficients $a_i$ of the auxiliary field $a|_{\\mathcal{T}} = \\sum_i a_i \\phi^R_i$\n", &count, numeric_str, N_t));
18720cf1dd8SToby Isaac }
18820cf1dd8SToby Isaac if (useF0) {
1899566063dSJacob Faibussowitsch PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
19020cf1dd8SToby Isaac " /* Intermediate calculations */\n"
19120cf1dd8SToby Isaac " __local %s f_0[%d]; //[N_t*N_sqc]; // $f_0(u(x_q), \\nabla u(x_q)) |J(x_q)| w_q$\n",
1925f80ce2aSJacob Faibussowitsch &count, numeric_str, N_t * N_q));
19320cf1dd8SToby Isaac }
19448a46eb9SPierre Jolivet if (useF1) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " __local %s%d f_1[%d]; //[N_t*N_sqc]; // $f_1(u(x_q), \\nabla u(x_q)) |J(x_q)| w_q$\n", &count, numeric_str, dim, N_t * N_q));
19520cf1dd8SToby Isaac /* TODO: If using elasticity, put in mu/lambda coefficients */
1969566063dSJacob Faibussowitsch PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
19720cf1dd8SToby Isaac " /* Output data */\n"
19820cf1dd8SToby Isaac " %s e_i; // Coefficient $e_i$ of the residual\n\n",
1995f80ce2aSJacob Faibussowitsch &count, numeric_str));
20020cf1dd8SToby Isaac /* One-time loads */
2019566063dSJacob Faibussowitsch PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
20220cf1dd8SToby Isaac " /* These should be generated inline */\n"
20320cf1dd8SToby Isaac " /* Load quadrature weights */\n"
20420cf1dd8SToby Isaac " w = weights[qidx];\n"
20520cf1dd8SToby Isaac " /* Load basis tabulation \\phi_i for this cell */\n"
20620cf1dd8SToby Isaac " if (tidx < N_bt*N_q) {\n"
20720cf1dd8SToby Isaac " phi_i[tidx] = Basis[tidx];\n"
20820cf1dd8SToby Isaac " phiDer_i[tidx] = BasisDerivatives[tidx];\n"
20920cf1dd8SToby Isaac " }\n\n",
2105f80ce2aSJacob Faibussowitsch &count));
21120cf1dd8SToby Isaac /* Batch loads */
2129566063dSJacob Faibussowitsch PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
21320cf1dd8SToby Isaac " for (int batch = 0; batch < N_cb; ++batch) {\n"
21420cf1dd8SToby Isaac " /* Load geometry */\n"
21520cf1dd8SToby Isaac " detJ[tidx] = jacobianDeterminants[Goffset+batch*N_bc+tidx];\n"
21620cf1dd8SToby Isaac " for (int n = 0; n < dim*dim; ++n) {\n"
21720cf1dd8SToby Isaac " const int offset = n*N_t;\n"
21820cf1dd8SToby Isaac " invJ[offset+tidx] = jacobianInverses[(Goffset+batch*N_bc)*dim*dim+offset+tidx];\n"
21920cf1dd8SToby Isaac " }\n"
22020cf1dd8SToby Isaac " /* Load coefficients u_i for this cell */\n"
22120cf1dd8SToby Isaac " for (int n = 0; n < N_bt; ++n) {\n"
22220cf1dd8SToby Isaac " const int offset = n*N_t;\n"
22320cf1dd8SToby Isaac " u_i[offset+tidx] = coefficients[(Goffset*N_bt)+batch*N_t*N_b+offset+tidx];\n"
22420cf1dd8SToby Isaac " }\n",
2255f80ce2aSJacob Faibussowitsch &count));
22620cf1dd8SToby Isaac if (useAux) {
2279566063dSJacob Faibussowitsch PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
22820cf1dd8SToby Isaac " /* Load coefficients a_i for this cell */\n"
22920cf1dd8SToby Isaac " /* TODO: This should not be N_t here, it should be N_bc*N_comp_aux */\n"
23020cf1dd8SToby Isaac " a_i[tidx] = coefficientsAux[Goffset+batch*N_t+tidx];\n",
2315f80ce2aSJacob Faibussowitsch &count));
23220cf1dd8SToby Isaac }
23320cf1dd8SToby Isaac /* Quadrature phase */
2349566063dSJacob Faibussowitsch PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
23520cf1dd8SToby Isaac " barrier(CLK_LOCAL_MEM_FENCE);\n"
23620cf1dd8SToby Isaac "\n"
23720cf1dd8SToby Isaac " /* Map coefficients to values at quadrature points */\n"
23820cf1dd8SToby Isaac " for (int c = 0; c < N_sqc; ++c) {\n"
23920cf1dd8SToby Isaac " const int cell = c*N_bl*N_b + blqidx;\n"
24020cf1dd8SToby Isaac " const int fidx = (cell*N_q + qidx)*N_comp + cidx;\n",
2415f80ce2aSJacob Faibussowitsch &count));
24248a46eb9SPierre Jolivet if (useField) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " %s u[%d]; //[N_comp]; // $u(x_q)$, Value of the field at $x_q$\n", &count, numeric_str, N_c));
24348a46eb9SPierre Jolivet if (useFieldDer) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " %s%d gradU[%d]; //[N_comp]; // $\\nabla u(x_q)$, Value of the field gradient at $x_q$\n", &count, numeric_str, dim, N_c));
24448a46eb9SPierre Jolivet if (useFieldAux) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " %s a[%d]; //[1]; // $a(x_q)$, Value of the auxiliary fields at $x_q$\n", &count, numeric_str, 1));
24548a46eb9SPierre Jolivet if (useFieldDerAux) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " %s%d gradA[%d]; //[1]; // $\\nabla a(x_q)$, Value of the auxiliary field gradient at $x_q$\n", &count, numeric_str, dim, 1));
2469566063dSJacob Faibussowitsch PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
24720cf1dd8SToby Isaac "\n"
24820cf1dd8SToby Isaac " for (int comp = 0; comp < N_comp; ++comp) {\n",
2495f80ce2aSJacob Faibussowitsch &count));
2509566063dSJacob Faibussowitsch if (useField) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " u[comp] = 0.0;\n", &count));
25120cf1dd8SToby Isaac if (useFieldDer) {
25220cf1dd8SToby Isaac switch (dim) {
253d71ae5a4SJacob Faibussowitsch case 1:
254d71ae5a4SJacob Faibussowitsch PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradU[comp].x = 0.0;\n", &count));
255d71ae5a4SJacob Faibussowitsch break;
256d71ae5a4SJacob Faibussowitsch case 2:
257d71ae5a4SJacob Faibussowitsch PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradU[comp].x = 0.0; gradU[comp].y = 0.0;\n", &count));
258d71ae5a4SJacob Faibussowitsch break;
259d71ae5a4SJacob Faibussowitsch case 3:
260d71ae5a4SJacob Faibussowitsch PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradU[comp].x = 0.0; gradU[comp].y = 0.0; gradU[comp].z = 0.0;\n", &count));
261d71ae5a4SJacob Faibussowitsch break;
26220cf1dd8SToby Isaac }
26320cf1dd8SToby Isaac }
2649371c9d4SSatish Balay PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " }\n", &count));
26548a46eb9SPierre Jolivet if (useFieldAux) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " a[0] = 0.0;\n", &count));
26620cf1dd8SToby Isaac if (useFieldDerAux) {
26720cf1dd8SToby Isaac switch (dim) {
268d71ae5a4SJacob Faibussowitsch case 1:
269d71ae5a4SJacob Faibussowitsch PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradA[0].x = 0.0;\n", &count));
270d71ae5a4SJacob Faibussowitsch break;
271d71ae5a4SJacob Faibussowitsch case 2:
272d71ae5a4SJacob Faibussowitsch PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradA[0].x = 0.0; gradA[0].y = 0.0;\n", &count));
273d71ae5a4SJacob Faibussowitsch break;
274d71ae5a4SJacob Faibussowitsch case 3:
275d71ae5a4SJacob Faibussowitsch PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradA[0].x = 0.0; gradA[0].y = 0.0; gradA[0].z = 0.0;\n", &count));
276d71ae5a4SJacob Faibussowitsch break;
27720cf1dd8SToby Isaac }
27820cf1dd8SToby Isaac }
2799566063dSJacob Faibussowitsch PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
28020cf1dd8SToby Isaac " /* Get field and derivatives at this quadrature point */\n"
28120cf1dd8SToby Isaac " for (int i = 0; i < N_b; ++i) {\n"
28220cf1dd8SToby Isaac " for (int comp = 0; comp < N_comp; ++comp) {\n"
28320cf1dd8SToby Isaac " const int b = i*N_comp+comp;\n"
28420cf1dd8SToby Isaac " const int pidx = qidx*N_bt + b;\n"
28520cf1dd8SToby Isaac " const int uidx = cell*N_bt + b;\n"
28620cf1dd8SToby Isaac " %s%d realSpaceDer;\n\n",
2875f80ce2aSJacob Faibussowitsch &count, numeric_str, dim));
2889566063dSJacob Faibussowitsch if (useField) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " u[comp] += u_i[uidx]*phi_i[pidx];\n", &count));
28920cf1dd8SToby Isaac if (useFieldDer) {
29020cf1dd8SToby Isaac switch (dim) {
29120cf1dd8SToby Isaac case 2:
2929566063dSJacob Faibussowitsch PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
29320cf1dd8SToby Isaac " realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y;\n"
29420cf1dd8SToby Isaac " gradU[comp].x += u_i[uidx]*realSpaceDer.x;\n"
29520cf1dd8SToby Isaac " realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y;\n"
29620cf1dd8SToby Isaac " gradU[comp].y += u_i[uidx]*realSpaceDer.y;\n",
2979371c9d4SSatish Balay &count));
2989371c9d4SSatish Balay break;
29920cf1dd8SToby Isaac case 3:
3009566063dSJacob Faibussowitsch PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
30120cf1dd8SToby Isaac " realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+0]*phiDer_i[pidx].z;\n"
30220cf1dd8SToby Isaac " gradU[comp].x += u_i[uidx]*realSpaceDer.x;\n"
30320cf1dd8SToby Isaac " realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+1]*phiDer_i[pidx].z;\n"
30420cf1dd8SToby Isaac " gradU[comp].y += u_i[uidx]*realSpaceDer.y;\n"
30520cf1dd8SToby Isaac " realSpaceDer.z = invJ[cell*dim*dim+0*dim+2]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+2]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+2]*phiDer_i[pidx].z;\n"
30620cf1dd8SToby Isaac " gradU[comp].z += u_i[uidx]*realSpaceDer.z;\n",
3079371c9d4SSatish Balay &count));
3089371c9d4SSatish Balay break;
30920cf1dd8SToby Isaac }
31020cf1dd8SToby Isaac }
3119566063dSJacob Faibussowitsch PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
31220cf1dd8SToby Isaac " }\n"
31320cf1dd8SToby Isaac " }\n",
3145f80ce2aSJacob Faibussowitsch &count));
31548a46eb9SPierre Jolivet if (useFieldAux) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " a[0] += a_i[cell];\n", &count));
316362f43d5SPierre Jolivet /* Calculate residual at quadrature points: Should be generated by an weak form engine */
3179371c9d4SSatish Balay PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " /* Process values at quadrature points */\n", &count));
31820cf1dd8SToby Isaac switch (op) {
31920cf1dd8SToby Isaac case LAPLACIAN:
32048a46eb9SPierre Jolivet if (useF0) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_0[fidx] = 4.0;\n", &count));
32120cf1dd8SToby Isaac if (useF1) {
3229566063dSJacob Faibussowitsch if (useAux) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_1[fidx] = a[0]*gradU[cidx];\n", &count));
3239566063dSJacob Faibussowitsch else PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_1[fidx] = gradU[cidx];\n", &count));
32420cf1dd8SToby Isaac }
32520cf1dd8SToby Isaac break;
32620cf1dd8SToby Isaac case ELASTICITY:
3279566063dSJacob Faibussowitsch if (useF0) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_0[fidx] = 4.0;\n", &count));
32820cf1dd8SToby Isaac if (useF1) {
32920cf1dd8SToby Isaac switch (dim) {
33020cf1dd8SToby Isaac case 2:
3319566063dSJacob Faibussowitsch PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
33220cf1dd8SToby Isaac " switch (cidx) {\n"
33320cf1dd8SToby Isaac " case 0:\n"
33420cf1dd8SToby Isaac " f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[0].x + gradU[0].x);\n"
33520cf1dd8SToby Isaac " f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[0].y + gradU[1].x);\n"
33620cf1dd8SToby Isaac " break;\n"
33720cf1dd8SToby Isaac " case 1:\n"
33820cf1dd8SToby Isaac " f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[1].x + gradU[0].y);\n"
33920cf1dd8SToby Isaac " f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[1].y + gradU[1].y);\n"
34020cf1dd8SToby Isaac " }\n",
3419371c9d4SSatish Balay &count));
3429371c9d4SSatish Balay break;
34320cf1dd8SToby Isaac case 3:
3449566063dSJacob Faibussowitsch PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
34520cf1dd8SToby Isaac " switch (cidx) {\n"
34620cf1dd8SToby Isaac " case 0:\n"
34720cf1dd8SToby Isaac " f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[0].x + gradU[0].x);\n"
34820cf1dd8SToby Isaac " f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[0].y + gradU[1].x);\n"
34920cf1dd8SToby Isaac " f_1[fidx].z = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[0].z + gradU[2].x);\n"
35020cf1dd8SToby Isaac " break;\n"
35120cf1dd8SToby Isaac " case 1:\n"
35220cf1dd8SToby Isaac " f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[1].x + gradU[0].y);\n"
35320cf1dd8SToby Isaac " f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[1].y + gradU[1].y);\n"
35420cf1dd8SToby Isaac " f_1[fidx].z = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[1].y + gradU[2].y);\n"
35520cf1dd8SToby Isaac " break;\n"
35620cf1dd8SToby Isaac " case 2:\n"
35720cf1dd8SToby Isaac " f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[2].x + gradU[0].z);\n"
35820cf1dd8SToby Isaac " f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[2].y + gradU[1].z);\n"
35920cf1dd8SToby Isaac " f_1[fidx].z = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[2].y + gradU[2].z);\n"
36020cf1dd8SToby Isaac " }\n",
3619371c9d4SSatish Balay &count));
36220cf1dd8SToby Isaac break;
3639371c9d4SSatish Balay }
3649371c9d4SSatish Balay }
3659371c9d4SSatish Balay break;
366d71ae5a4SJacob Faibussowitsch default:
367d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "PDE operator %d is not supported", op);
36820cf1dd8SToby Isaac }
3699566063dSJacob Faibussowitsch if (useF0) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_0[fidx] *= detJ[cell]*w;\n", &count));
37020cf1dd8SToby Isaac if (useF1) {
37120cf1dd8SToby Isaac switch (dim) {
372d71ae5a4SJacob Faibussowitsch case 1:
373d71ae5a4SJacob Faibussowitsch PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_1[fidx].x *= detJ[cell]*w;\n", &count));
374d71ae5a4SJacob Faibussowitsch break;
375d71ae5a4SJacob Faibussowitsch case 2:
376d71ae5a4SJacob Faibussowitsch PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_1[fidx].x *= detJ[cell]*w; f_1[fidx].y *= detJ[cell]*w;\n", &count));
377d71ae5a4SJacob Faibussowitsch break;
378d71ae5a4SJacob Faibussowitsch case 3:
379d71ae5a4SJacob Faibussowitsch PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_1[fidx].x *= detJ[cell]*w; f_1[fidx].y *= detJ[cell]*w; f_1[fidx].z *= detJ[cell]*w;\n", &count));
380d71ae5a4SJacob Faibussowitsch break;
38120cf1dd8SToby Isaac }
38220cf1dd8SToby Isaac }
38320cf1dd8SToby Isaac /* Thread transpose */
3849566063dSJacob Faibussowitsch PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
38520cf1dd8SToby Isaac " }\n\n"
38620cf1dd8SToby Isaac " /* ==== TRANSPOSE THREADS ==== */\n"
38720cf1dd8SToby Isaac " barrier(CLK_LOCAL_MEM_FENCE);\n\n",
3885f80ce2aSJacob Faibussowitsch &count));
38920cf1dd8SToby Isaac /* Basis phase */
3909566063dSJacob Faibussowitsch PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
39120cf1dd8SToby Isaac " /* Map values at quadrature points to coefficients */\n"
39220cf1dd8SToby Isaac " for (int c = 0; c < N_sbc; ++c) {\n"
39320cf1dd8SToby Isaac " const int cell = c*N_bl*N_q + blbidx; /* Cell number in batch */\n"
39420cf1dd8SToby Isaac "\n"
39520cf1dd8SToby Isaac " e_i = 0.0;\n"
39620cf1dd8SToby Isaac " for (int q = 0; q < N_q; ++q) {\n"
39720cf1dd8SToby Isaac " const int pidx = q*N_bt + bidx;\n"
39820cf1dd8SToby Isaac " const int fidx = (cell*N_q + q)*N_comp + cidx;\n"
39920cf1dd8SToby Isaac " %s%d realSpaceDer;\n\n",
4005f80ce2aSJacob Faibussowitsch &count, numeric_str, dim));
40120cf1dd8SToby Isaac
4029566063dSJacob Faibussowitsch if (useF0) PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " e_i += phi_i[pidx]*f_0[fidx];\n", &count));
40320cf1dd8SToby Isaac if (useF1) {
40420cf1dd8SToby Isaac switch (dim) {
40520cf1dd8SToby Isaac case 2:
4069566063dSJacob Faibussowitsch PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
40720cf1dd8SToby Isaac " realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y;\n"
40820cf1dd8SToby Isaac " e_i += realSpaceDer.x*f_1[fidx].x;\n"
40920cf1dd8SToby Isaac " realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y;\n"
41020cf1dd8SToby Isaac " e_i += realSpaceDer.y*f_1[fidx].y;\n",
4119371c9d4SSatish Balay &count));
4129371c9d4SSatish Balay break;
41320cf1dd8SToby Isaac case 3:
4149566063dSJacob Faibussowitsch PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
41520cf1dd8SToby Isaac " realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+0]*phiDer_i[pidx].z;\n"
41620cf1dd8SToby Isaac " e_i += realSpaceDer.x*f_1[fidx].x;\n"
41720cf1dd8SToby Isaac " realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+1]*phiDer_i[pidx].z;\n"
41820cf1dd8SToby Isaac " e_i += realSpaceDer.y*f_1[fidx].y;\n"
41920cf1dd8SToby Isaac " realSpaceDer.z = invJ[cell*dim*dim+0*dim+2]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+2]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+2]*phiDer_i[pidx].z;\n"
42020cf1dd8SToby Isaac " e_i += realSpaceDer.z*f_1[fidx].z;\n",
4219371c9d4SSatish Balay &count));
4229371c9d4SSatish Balay break;
42320cf1dd8SToby Isaac }
42420cf1dd8SToby Isaac }
4259566063dSJacob Faibussowitsch PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
42620cf1dd8SToby Isaac " }\n"
42720cf1dd8SToby Isaac " /* Write element vector for N_{cbc} cells at a time */\n"
42820cf1dd8SToby Isaac " elemVec[(Goffset + batch*N_bc + c*N_bl*N_q)*N_bt + tidx] = e_i;\n"
42920cf1dd8SToby Isaac " }\n"
43020cf1dd8SToby Isaac " /* ==== Could do one write per batch ==== */\n"
43120cf1dd8SToby Isaac " }\n"
43220cf1dd8SToby Isaac " return;\n"
43320cf1dd8SToby Isaac "}\n",
4345f80ce2aSJacob Faibussowitsch &count));
4353ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
43620cf1dd8SToby Isaac }
43720cf1dd8SToby Isaac
PetscFEOpenCLGetIntegrationKernel(PetscFE fem,PetscBool useAux,cl_program * ocl_prog,cl_kernel * ocl_kernel)438d71ae5a4SJacob Faibussowitsch static PetscErrorCode PetscFEOpenCLGetIntegrationKernel(PetscFE fem, PetscBool useAux, cl_program *ocl_prog, cl_kernel *ocl_kernel)
439d71ae5a4SJacob Faibussowitsch {
44020cf1dd8SToby Isaac PetscFE_OpenCL *ocl = (PetscFE_OpenCL *)fem->data;
44120cf1dd8SToby Isaac PetscInt dim, N_bl;
44220cf1dd8SToby Isaac PetscBool flg;
44320cf1dd8SToby Isaac char *buffer;
44420cf1dd8SToby Isaac size_t len;
44520cf1dd8SToby Isaac char errMsg[8192];
4462da392ccSBarry Smith cl_int err;
44720cf1dd8SToby Isaac
44820cf1dd8SToby Isaac PetscFunctionBegin;
4499566063dSJacob Faibussowitsch PetscCall(PetscFEGetSpatialDimension(fem, &dim));
4509566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(8192, &buffer));
4519566063dSJacob Faibussowitsch PetscCall(PetscFEGetTileSizes(fem, NULL, &N_bl, NULL, NULL));
4529566063dSJacob Faibussowitsch PetscCall(PetscFEOpenCLGenerateIntegrationCode(fem, &buffer, 8192, useAux, N_bl));
4539566063dSJacob Faibussowitsch PetscCall(PetscOptionsHasName(((PetscObject)fem)->options, ((PetscObject)fem)->prefix, "-petscfe_opencl_kernel_print", &flg));
4549566063dSJacob Faibussowitsch if (flg) PetscCall(PetscPrintf(PetscObjectComm((PetscObject)fem), "OpenCL FE Integration Kernel:\n%s\n", buffer));
4559566063dSJacob Faibussowitsch PetscCall(PetscStrlen(buffer, &len));
4569371c9d4SSatish Balay *ocl_prog = clCreateProgramWithSource(ocl->ctx_id, 1, (const char **)&buffer, &len, &err);
4579371c9d4SSatish Balay PetscCall(err);
4582da392ccSBarry Smith err = clBuildProgram(*ocl_prog, 0, NULL, NULL, NULL, NULL);
4592da392ccSBarry Smith if (err != CL_SUCCESS) {
4602f613bf5SBarry Smith err = clGetProgramBuildInfo(*ocl_prog, ocl->dev_id, CL_PROGRAM_BUILD_LOG, 8192 * sizeof(char), &errMsg, NULL);
46198921bdaSJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Build failed! Log:\n %s", errMsg);
46220cf1dd8SToby Isaac }
4639566063dSJacob Faibussowitsch PetscCall(PetscFree(buffer));
464d0609cedSBarry Smith *ocl_kernel = clCreateKernel(*ocl_prog, "integrateElementQuadrature", &err);
4653ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
46620cf1dd8SToby Isaac }
46720cf1dd8SToby Isaac
PetscFEOpenCLCalculateGrid(PetscFE fem,PetscInt N,PetscInt blockSize,size_t * x,size_t * y,size_t * z)468d71ae5a4SJacob Faibussowitsch static PetscErrorCode PetscFEOpenCLCalculateGrid(PetscFE fem, PetscInt N, PetscInt blockSize, size_t *x, size_t *y, size_t *z)
469d71ae5a4SJacob Faibussowitsch {
47020cf1dd8SToby Isaac const PetscInt Nblocks = N / blockSize;
47120cf1dd8SToby Isaac
47220cf1dd8SToby Isaac PetscFunctionBegin;
4735f80ce2aSJacob Faibussowitsch PetscCheck(!(N % blockSize), PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid block size %d for %d elements", blockSize, N);
47420cf1dd8SToby Isaac *z = 1;
475623c9f23SSatish Balay *y = 1;
47620cf1dd8SToby Isaac for (*x = (size_t)(PetscSqrtReal(Nblocks) + 0.5); *x > 0; --*x) {
47720cf1dd8SToby Isaac *y = Nblocks / *x;
478526996e7SStefano Zampini if (*x * *y == (size_t)Nblocks) break;
47920cf1dd8SToby Isaac }
48063a3b9bcSJacob Faibussowitsch PetscCheck(*x * *y == (size_t)Nblocks, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Could not find partition for %" PetscInt_FMT " with block size %" PetscInt_FMT, N, blockSize);
4813ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
48220cf1dd8SToby Isaac }
48320cf1dd8SToby Isaac
PetscFEOpenCLLogResidual(PetscFE fem,PetscLogDouble time,PetscLogDouble flops)484d71ae5a4SJacob Faibussowitsch static PetscErrorCode PetscFEOpenCLLogResidual(PetscFE fem, PetscLogDouble time, PetscLogDouble flops)
485d71ae5a4SJacob Faibussowitsch {
486b665b14eSToby Isaac PetscLogHandler h;
48720cf1dd8SToby Isaac
48820cf1dd8SToby Isaac PetscFunctionBegin;
489b665b14eSToby Isaac PetscCall(PetscLogGetDefaultHandler(&h));
490b665b14eSToby Isaac if (h) {
491b665b14eSToby Isaac PetscEventPerfInfo *eventInfo;
492b665b14eSToby Isaac PetscFE_OpenCL *ocl = (PetscFE_OpenCL *)fem->data;
493b665b14eSToby Isaac
494dff009beSToby Isaac PetscCall(PetscLogHandlerGetEventPerfInfo(h, PETSC_DEFAULT, ocl->residualEvent, &eventInfo));
495b665b14eSToby Isaac eventInfo->count++;
496b665b14eSToby Isaac eventInfo->time += time;
497b665b14eSToby Isaac eventInfo->flops += flops;
498b665b14eSToby Isaac }
4993ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
50020cf1dd8SToby Isaac }
50120cf1dd8SToby Isaac
PetscFEIntegrateResidual_OpenCL(PetscDS prob,PetscFormKey key,PetscInt Ne,PetscFEGeom * cgeom,const PetscScalar coefficients[],const PetscScalar coefficients_t[],PetscDS probAux,const PetscScalar coefficientsAux[],PetscReal t,PetscScalar elemVec[])502d71ae5a4SJacob Faibussowitsch static PetscErrorCode PetscFEIntegrateResidual_OpenCL(PetscDS prob, PetscFormKey key, PetscInt Ne, PetscFEGeom *cgeom, const PetscScalar coefficients[], const PetscScalar coefficients_t[], PetscDS probAux, const PetscScalar coefficientsAux[], PetscReal t, PetscScalar elemVec[])
503d71ae5a4SJacob Faibussowitsch {
50420cf1dd8SToby Isaac /* Nbc = batchSize */
505849189efSMatthew G. Knepley PetscFE fem;
506849189efSMatthew G. Knepley PetscFE_OpenCL *ocl;
507*2192575eSBarry Smith PetscPointFn *f0_func;
508*2192575eSBarry Smith PetscPointFn *f1_func;
50920cf1dd8SToby Isaac PetscQuadrature q;
51020cf1dd8SToby Isaac PetscInt dim, qNc;
51120cf1dd8SToby Isaac PetscInt N_b; /* The number of basis functions */
51220cf1dd8SToby Isaac PetscInt N_comp; /* The number of basis function components */
51320cf1dd8SToby Isaac PetscInt N_bt; /* The total number of scalar basis functions */
51420cf1dd8SToby Isaac PetscInt N_q; /* The number of quadrature points */
51520cf1dd8SToby Isaac PetscInt N_bst; /* The block size, LCM(N_bt, N_q), Notice that a block is not process simultaneously */
51620cf1dd8SToby Isaac PetscInt N_t; /* The number of threads, N_bst * N_bl */
51720cf1dd8SToby Isaac PetscInt N_bl; /* The number of blocks */
51820cf1dd8SToby Isaac PetscInt N_bc; /* The batch size, N_bl*N_q*N_b */
51920cf1dd8SToby Isaac PetscInt N_cb; /* The number of batches */
5206528b96dSMatthew G. Knepley const PetscInt field = key.field;
52120cf1dd8SToby Isaac PetscInt numFlops, f0Flops = 0, f1Flops = 0;
52220cf1dd8SToby Isaac PetscBool useAux = probAux ? PETSC_TRUE : PETSC_FALSE;
52320cf1dd8SToby Isaac PetscBool useField = PETSC_FALSE;
52420cf1dd8SToby Isaac PetscBool useFieldDer = PETSC_TRUE;
52520cf1dd8SToby Isaac PetscBool useF0 = PETSC_TRUE;
52620cf1dd8SToby Isaac PetscBool useF1 = PETSC_TRUE;
52720cf1dd8SToby Isaac /* OpenCL variables */
52820cf1dd8SToby Isaac cl_program ocl_prog;
52920cf1dd8SToby Isaac cl_kernel ocl_kernel;
53020cf1dd8SToby Isaac cl_event ocl_ev; /* The event for tracking kernel execution */
53120cf1dd8SToby Isaac cl_ulong ns_start; /* Nanoseconds counter on GPU at kernel start */
53220cf1dd8SToby Isaac cl_ulong ns_end; /* Nanoseconds counter on GPU at kernel stop */
53320cf1dd8SToby Isaac cl_mem o_jacobianInverses, o_jacobianDeterminants;
53420cf1dd8SToby Isaac cl_mem o_coefficients, o_coefficientsAux, o_elemVec;
53520cf1dd8SToby Isaac float *f_coeff = NULL, *f_coeffAux = NULL, *f_invJ = NULL, *f_detJ = NULL;
53620cf1dd8SToby Isaac double *d_coeff = NULL, *d_coeffAux = NULL, *d_invJ = NULL, *d_detJ = NULL;
53720cf1dd8SToby Isaac PetscReal *r_invJ = NULL, *r_detJ = NULL;
53820cf1dd8SToby Isaac void *oclCoeff, *oclCoeffAux, *oclInvJ, *oclDetJ;
53920cf1dd8SToby Isaac size_t local_work_size[3], global_work_size[3];
54020cf1dd8SToby Isaac size_t realSize, x, y, z;
54120cf1dd8SToby Isaac const PetscReal *points, *weights;
542d0609cedSBarry Smith int err;
54320cf1dd8SToby Isaac
54420cf1dd8SToby Isaac PetscFunctionBegin;
5459566063dSJacob Faibussowitsch PetscCall(PetscDSGetDiscretization(prob, field, (PetscObject *)&fem));
546849189efSMatthew G. Knepley ocl = (PetscFE_OpenCL *)fem->data;
5479371c9d4SSatish Balay if (!Ne) {
5489371c9d4SSatish Balay PetscCall(PetscFEOpenCLLogResidual(fem, 0.0, 0.0));
5493ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
5509371c9d4SSatish Balay }
5519566063dSJacob Faibussowitsch PetscCall(PetscFEGetSpatialDimension(fem, &dim));
5529566063dSJacob Faibussowitsch PetscCall(PetscFEGetQuadrature(fem, &q));
5539566063dSJacob Faibussowitsch PetscCall(PetscQuadratureGetData(q, NULL, &qNc, &N_q, &points, &weights));
55463a3b9bcSJacob Faibussowitsch PetscCheck(qNc == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only supports scalar quadrature, not %" PetscInt_FMT " components", qNc);
5559566063dSJacob Faibussowitsch PetscCall(PetscFEGetDimension(fem, &N_b));
5569566063dSJacob Faibussowitsch PetscCall(PetscFEGetNumComponents(fem, &N_comp));
5579566063dSJacob Faibussowitsch PetscCall(PetscDSGetResidual(prob, field, &f0_func, &f1_func));
5589566063dSJacob Faibussowitsch PetscCall(PetscFEGetTileSizes(fem, NULL, &N_bl, &N_bc, &N_cb));
55920cf1dd8SToby Isaac N_bt = N_b * N_comp;
56020cf1dd8SToby Isaac N_bst = N_bt * N_q;
56120cf1dd8SToby Isaac N_t = N_bst * N_bl;
5625f80ce2aSJacob Faibussowitsch PetscCheck(N_bc * N_comp == N_t, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Number of threads %d should be %d * %d", N_t, N_bc, N_comp);
56320cf1dd8SToby Isaac /* Calculate layout */
56420cf1dd8SToby Isaac if (Ne % (N_cb * N_bc)) { /* Remainder cells */
5659566063dSJacob Faibussowitsch PetscCall(PetscFEIntegrateResidual_Basic(prob, key, Ne, cgeom, coefficients, coefficients_t, probAux, coefficientsAux, t, elemVec));
5663ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
56720cf1dd8SToby Isaac }
5689566063dSJacob Faibussowitsch PetscCall(PetscFEOpenCLCalculateGrid(fem, Ne, N_cb * N_bc, &x, &y, &z));
56920cf1dd8SToby Isaac local_work_size[0] = N_bc * N_comp;
57020cf1dd8SToby Isaac local_work_size[1] = 1;
57120cf1dd8SToby Isaac local_work_size[2] = 1;
57220cf1dd8SToby Isaac global_work_size[0] = x * local_work_size[0];
57320cf1dd8SToby Isaac global_work_size[1] = y * local_work_size[1];
57420cf1dd8SToby Isaac global_work_size[2] = z * local_work_size[2];
57563a3b9bcSJacob Faibussowitsch PetscCall(PetscInfo(fem, "GPU layout grid(%zu,%zu,%zu) block(%zu,%zu,%zu) with %d batches\n", x, y, z, local_work_size[0], local_work_size[1], local_work_size[2], N_cb));
5769566063dSJacob Faibussowitsch PetscCall(PetscInfo(fem, " N_t: %d, N_cb: %d\n", N_t, N_cb));
57720cf1dd8SToby Isaac /* Generate code */
57820cf1dd8SToby Isaac if (probAux) {
57920cf1dd8SToby Isaac PetscSpace P;
58020cf1dd8SToby Isaac PetscInt NfAux, order, f;
58120cf1dd8SToby Isaac
5829566063dSJacob Faibussowitsch PetscCall(PetscDSGetNumFields(probAux, &NfAux));
58320cf1dd8SToby Isaac for (f = 0; f < NfAux; ++f) {
58420cf1dd8SToby Isaac PetscFE feAux;
58520cf1dd8SToby Isaac
5869566063dSJacob Faibussowitsch PetscCall(PetscDSGetDiscretization(probAux, f, (PetscObject *)&feAux));
5879566063dSJacob Faibussowitsch PetscCall(PetscFEGetBasisSpace(feAux, &P));
5889566063dSJacob Faibussowitsch PetscCall(PetscSpaceGetDegree(P, &order, NULL));
5895f80ce2aSJacob Faibussowitsch PetscCheck(order <= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Can only handle P0 coefficient fields");
59020cf1dd8SToby Isaac }
59120cf1dd8SToby Isaac }
5929566063dSJacob Faibussowitsch PetscCall(PetscFEOpenCLGetIntegrationKernel(fem, useAux, &ocl_prog, &ocl_kernel));
59320cf1dd8SToby Isaac /* Create buffers on the device and send data over */
5949566063dSJacob Faibussowitsch PetscCall(PetscDataTypeGetSize(ocl->realType, &realSize));
5955f80ce2aSJacob Faibussowitsch PetscCheck(cgeom->numPoints <= 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only support affine geometry for OpenCL integration right now");
59620cf1dd8SToby Isaac if (sizeof(PetscReal) != realSize) {
59720cf1dd8SToby Isaac switch (ocl->realType) {
5989371c9d4SSatish Balay case PETSC_FLOAT: {
59920cf1dd8SToby Isaac PetscInt c, b, d;
60020cf1dd8SToby Isaac
6019566063dSJacob Faibussowitsch PetscCall(PetscMalloc4(Ne * N_bt, &f_coeff, Ne, &f_coeffAux, Ne * dim * dim, &f_invJ, Ne, &f_detJ));
60220cf1dd8SToby Isaac for (c = 0; c < Ne; ++c) {
6037be5e748SToby Isaac f_detJ[c] = (float)cgeom->detJ[c];
604ad540459SPierre Jolivet for (d = 0; d < dim * dim; ++d) f_invJ[c * dim * dim + d] = (float)cgeom->invJ[c * dim * dim + d];
605ad540459SPierre Jolivet for (b = 0; b < N_bt; ++b) f_coeff[c * N_bt + b] = (float)coefficients[c * N_bt + b];
60620cf1dd8SToby Isaac }
60720cf1dd8SToby Isaac if (coefficientsAux) { /* Assume P0 */
608ad540459SPierre Jolivet for (c = 0; c < Ne; ++c) f_coeffAux[c] = (float)coefficientsAux[c];
60920cf1dd8SToby Isaac }
61020cf1dd8SToby Isaac oclCoeff = (void *)f_coeff;
61120cf1dd8SToby Isaac if (coefficientsAux) {
61220cf1dd8SToby Isaac oclCoeffAux = (void *)f_coeffAux;
61320cf1dd8SToby Isaac } else {
61420cf1dd8SToby Isaac oclCoeffAux = NULL;
61520cf1dd8SToby Isaac }
61620cf1dd8SToby Isaac oclInvJ = (void *)f_invJ;
61720cf1dd8SToby Isaac oclDetJ = (void *)f_detJ;
6189371c9d4SSatish Balay } break;
6199371c9d4SSatish Balay case PETSC_DOUBLE: {
62020cf1dd8SToby Isaac PetscInt c, b, d;
62120cf1dd8SToby Isaac
6229566063dSJacob Faibussowitsch PetscCall(PetscMalloc4(Ne * N_bt, &d_coeff, Ne, &d_coeffAux, Ne * dim * dim, &d_invJ, Ne, &d_detJ));
62320cf1dd8SToby Isaac for (c = 0; c < Ne; ++c) {
6247be5e748SToby Isaac d_detJ[c] = (double)cgeom->detJ[c];
625ad540459SPierre Jolivet for (d = 0; d < dim * dim; ++d) d_invJ[c * dim * dim + d] = (double)cgeom->invJ[c * dim * dim + d];
626ad540459SPierre Jolivet for (b = 0; b < N_bt; ++b) d_coeff[c * N_bt + b] = (double)coefficients[c * N_bt + b];
62720cf1dd8SToby Isaac }
62820cf1dd8SToby Isaac if (coefficientsAux) { /* Assume P0 */
629ad540459SPierre Jolivet for (c = 0; c < Ne; ++c) d_coeffAux[c] = (double)coefficientsAux[c];
63020cf1dd8SToby Isaac }
63120cf1dd8SToby Isaac oclCoeff = (void *)d_coeff;
63220cf1dd8SToby Isaac if (coefficientsAux) {
63320cf1dd8SToby Isaac oclCoeffAux = (void *)d_coeffAux;
63420cf1dd8SToby Isaac } else {
63520cf1dd8SToby Isaac oclCoeffAux = NULL;
63620cf1dd8SToby Isaac }
63720cf1dd8SToby Isaac oclInvJ = (void *)d_invJ;
63820cf1dd8SToby Isaac oclDetJ = (void *)d_detJ;
6399371c9d4SSatish Balay } break;
640d71ae5a4SJacob Faibussowitsch default:
641d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unsupported PETSc type %d", ocl->realType);
64220cf1dd8SToby Isaac }
64320cf1dd8SToby Isaac } else {
64420cf1dd8SToby Isaac PetscInt c, d;
64520cf1dd8SToby Isaac
6469566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(Ne * dim * dim, &r_invJ, Ne, &r_detJ));
64720cf1dd8SToby Isaac for (c = 0; c < Ne; ++c) {
6487be5e748SToby Isaac r_detJ[c] = cgeom->detJ[c];
649ad540459SPierre Jolivet for (d = 0; d < dim * dim; ++d) r_invJ[c * dim * dim + d] = cgeom->invJ[c * dim * dim + d];
65020cf1dd8SToby Isaac }
65120cf1dd8SToby Isaac oclCoeff = (void *)coefficients;
65220cf1dd8SToby Isaac oclCoeffAux = (void *)coefficientsAux;
65320cf1dd8SToby Isaac oclInvJ = (void *)r_invJ;
65420cf1dd8SToby Isaac oclDetJ = (void *)r_detJ;
65520cf1dd8SToby Isaac }
656d0609cedSBarry Smith o_coefficients = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne * N_bt * realSize, oclCoeff, &err);
65720cf1dd8SToby Isaac if (coefficientsAux) {
658d0609cedSBarry Smith o_coefficientsAux = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne * realSize, oclCoeffAux, &err);
65920cf1dd8SToby Isaac } else {
660d0609cedSBarry Smith o_coefficientsAux = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY, Ne * realSize, oclCoeffAux, &err);
66120cf1dd8SToby Isaac }
662d0609cedSBarry Smith o_jacobianInverses = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne * dim * dim * realSize, oclInvJ, &err);
663d0609cedSBarry Smith o_jacobianDeterminants = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne * realSize, oclDetJ, &err);
664d0609cedSBarry Smith o_elemVec = clCreateBuffer(ocl->ctx_id, CL_MEM_WRITE_ONLY, Ne * N_bt * realSize, NULL, &err);
66520cf1dd8SToby Isaac /* Kernel launch */
6669566063dSJacob Faibussowitsch PetscCall(clSetKernelArg(ocl_kernel, 0, sizeof(cl_int), (void *)&N_cb));
6679566063dSJacob Faibussowitsch PetscCall(clSetKernelArg(ocl_kernel, 1, sizeof(cl_mem), (void *)&o_coefficients));
6689566063dSJacob Faibussowitsch PetscCall(clSetKernelArg(ocl_kernel, 2, sizeof(cl_mem), (void *)&o_coefficientsAux));
6699566063dSJacob Faibussowitsch PetscCall(clSetKernelArg(ocl_kernel, 3, sizeof(cl_mem), (void *)&o_jacobianInverses));
6709566063dSJacob Faibussowitsch PetscCall(clSetKernelArg(ocl_kernel, 4, sizeof(cl_mem), (void *)&o_jacobianDeterminants));
6719566063dSJacob Faibussowitsch PetscCall(clSetKernelArg(ocl_kernel, 5, sizeof(cl_mem), (void *)&o_elemVec));
6729566063dSJacob Faibussowitsch PetscCall(clEnqueueNDRangeKernel(ocl->queue_id, ocl_kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &ocl_ev));
67320cf1dd8SToby Isaac /* Read data back from device */
67420cf1dd8SToby Isaac if (sizeof(PetscReal) != realSize) {
67520cf1dd8SToby Isaac switch (ocl->realType) {
6769371c9d4SSatish Balay case PETSC_FLOAT: {
67720cf1dd8SToby Isaac float *elem;
67820cf1dd8SToby Isaac PetscInt c, b;
67920cf1dd8SToby Isaac
6809566063dSJacob Faibussowitsch PetscCall(PetscFree4(f_coeff, f_coeffAux, f_invJ, f_detJ));
6819566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(Ne * N_bt, &elem));
6829566063dSJacob Faibussowitsch PetscCall(clEnqueueReadBuffer(ocl->queue_id, o_elemVec, CL_TRUE, 0, Ne * N_bt * realSize, elem, 0, NULL, NULL));
68320cf1dd8SToby Isaac for (c = 0; c < Ne; ++c) {
684ad540459SPierre Jolivet for (b = 0; b < N_bt; ++b) elemVec[c * N_bt + b] = (PetscScalar)elem[c * N_bt + b];
68520cf1dd8SToby Isaac }
6869566063dSJacob Faibussowitsch PetscCall(PetscFree(elem));
6879371c9d4SSatish Balay } break;
6889371c9d4SSatish Balay case PETSC_DOUBLE: {
68920cf1dd8SToby Isaac double *elem;
69020cf1dd8SToby Isaac PetscInt c, b;
69120cf1dd8SToby Isaac
6929566063dSJacob Faibussowitsch PetscCall(PetscFree4(d_coeff, d_coeffAux, d_invJ, d_detJ));
6939566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(Ne * N_bt, &elem));
6949566063dSJacob Faibussowitsch PetscCall(clEnqueueReadBuffer(ocl->queue_id, o_elemVec, CL_TRUE, 0, Ne * N_bt * realSize, elem, 0, NULL, NULL));
69520cf1dd8SToby Isaac for (c = 0; c < Ne; ++c) {
696ad540459SPierre Jolivet for (b = 0; b < N_bt; ++b) elemVec[c * N_bt + b] = (PetscScalar)elem[c * N_bt + b];
69720cf1dd8SToby Isaac }
6989566063dSJacob Faibussowitsch PetscCall(PetscFree(elem));
6999371c9d4SSatish Balay } break;
700d71ae5a4SJacob Faibussowitsch default:
701d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unsupported PETSc type %d", ocl->realType);
70220cf1dd8SToby Isaac }
70320cf1dd8SToby Isaac } else {
7049566063dSJacob Faibussowitsch PetscCall(PetscFree2(r_invJ, r_detJ));
7059566063dSJacob Faibussowitsch PetscCall(clEnqueueReadBuffer(ocl->queue_id, o_elemVec, CL_TRUE, 0, Ne * N_bt * realSize, elemVec, 0, NULL, NULL));
70620cf1dd8SToby Isaac }
70720cf1dd8SToby Isaac /* Log performance */
7089566063dSJacob Faibussowitsch PetscCall(clGetEventProfilingInfo(ocl_ev, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &ns_start, NULL));
7099566063dSJacob Faibussowitsch PetscCall(clGetEventProfilingInfo(ocl_ev, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &ns_end, NULL));
71020cf1dd8SToby Isaac f0Flops = 0;
71120cf1dd8SToby Isaac switch (ocl->op) {
712d71ae5a4SJacob Faibussowitsch case LAPLACIAN:
713d71ae5a4SJacob Faibussowitsch f1Flops = useAux ? dim : 0;
714d71ae5a4SJacob Faibussowitsch break;
715d71ae5a4SJacob Faibussowitsch case ELASTICITY:
716d71ae5a4SJacob Faibussowitsch f1Flops = 2 * dim * dim;
717d71ae5a4SJacob Faibussowitsch break;
71820cf1dd8SToby Isaac }
7199371c9d4SSatish Balay numFlops = Ne * (N_q * (N_b * N_comp * ((useField ? 2 : 0) + (useFieldDer ? 2 * dim * (dim + 1) : 0))
72020cf1dd8SToby Isaac /*+
72120cf1dd8SToby Isaac N_ba*N_compa*((useFieldAux ? 2 : 0) + (useFieldDerAux ? 2*dim*(dim + 1) : 0))*/
7229371c9d4SSatish Balay + N_comp * ((useF0 ? f0Flops + 2 : 0) + (useF1 ? f1Flops + 2 * dim : 0))) +
72320cf1dd8SToby Isaac N_b * ((useF0 ? 2 : 0) + (useF1 ? 2 * dim * (dim + 1) : 0)));
7249566063dSJacob Faibussowitsch PetscCall(PetscFEOpenCLLogResidual(fem, (ns_end - ns_start) * 1.0e-9, numFlops));
72520cf1dd8SToby Isaac /* Cleanup */
7269566063dSJacob Faibussowitsch PetscCall(clReleaseMemObject(o_coefficients));
7279566063dSJacob Faibussowitsch PetscCall(clReleaseMemObject(o_coefficientsAux));
7289566063dSJacob Faibussowitsch PetscCall(clReleaseMemObject(o_jacobianInverses));
7299566063dSJacob Faibussowitsch PetscCall(clReleaseMemObject(o_jacobianDeterminants));
7309566063dSJacob Faibussowitsch PetscCall(clReleaseMemObject(o_elemVec));
7319566063dSJacob Faibussowitsch PetscCall(clReleaseKernel(ocl_kernel));
7329566063dSJacob Faibussowitsch PetscCall(clReleaseProgram(ocl_prog));
7333ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
73420cf1dd8SToby Isaac }
73520cf1dd8SToby Isaac
736526996e7SStefano Zampini PETSC_INTERN PetscErrorCode PetscFESetUp_Basic(PetscFE);
737ce78bad3SBarry Smith PETSC_INTERN PetscErrorCode PetscFEComputeTabulation_Basic(PetscFE, PetscInt, const PetscReal[], PetscInt, PetscTabulation);
7382e47ffbbSToby Isaac
PetscFEInitialize_OpenCL(PetscFE fem)739d71ae5a4SJacob Faibussowitsch static PetscErrorCode PetscFEInitialize_OpenCL(PetscFE fem)
740d71ae5a4SJacob Faibussowitsch {
74120cf1dd8SToby Isaac PetscFunctionBegin;
74220cf1dd8SToby Isaac fem->ops->setfromoptions = NULL;
74320cf1dd8SToby Isaac fem->ops->setup = PetscFESetUp_Basic;
74420cf1dd8SToby Isaac fem->ops->view = NULL;
74520cf1dd8SToby Isaac fem->ops->destroy = PetscFEDestroy_OpenCL;
74620cf1dd8SToby Isaac fem->ops->getdimension = PetscFEGetDimension_Basic;
747ce78bad3SBarry Smith fem->ops->computetabulation = PetscFEComputeTabulation_Basic;
74820cf1dd8SToby Isaac fem->ops->integrateresidual = PetscFEIntegrateResidual_OpenCL;
74920cf1dd8SToby Isaac fem->ops->integratebdresidual = NULL /* PetscFEIntegrateBdResidual_OpenCL */;
75020cf1dd8SToby Isaac fem->ops->integratejacobianaction = NULL /* PetscFEIntegrateJacobianAction_OpenCL */;
75120cf1dd8SToby Isaac fem->ops->integratejacobian = PetscFEIntegrateJacobian_Basic;
7523ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
75320cf1dd8SToby Isaac }
75420cf1dd8SToby Isaac
75520cf1dd8SToby Isaac /*MC
756dce8aebaSBarry Smith PETSCFEOPENCL = "opencl" - A `PetscFEType` that integrates using a vectorized OpenCL implementation
75720cf1dd8SToby Isaac
75820cf1dd8SToby Isaac Level: intermediate
75920cf1dd8SToby Isaac
760db781477SPatrick Sanan .seealso: `PetscFEType`, `PetscFECreate()`, `PetscFESetType()`
76120cf1dd8SToby Isaac M*/
76220cf1dd8SToby Isaac
PetscFECreate_OpenCL(PetscFE fem)763d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode PetscFECreate_OpenCL(PetscFE fem)
764d71ae5a4SJacob Faibussowitsch {
76520cf1dd8SToby Isaac PetscFE_OpenCL *ocl;
76620cf1dd8SToby Isaac cl_uint num_platforms;
76720cf1dd8SToby Isaac cl_platform_id platform_ids[42];
76820cf1dd8SToby Isaac cl_uint num_devices;
76920cf1dd8SToby Isaac cl_device_id device_ids[42];
7702da392ccSBarry Smith cl_int err;
77120cf1dd8SToby Isaac
77220cf1dd8SToby Isaac PetscFunctionBegin;
77320cf1dd8SToby Isaac PetscValidHeaderSpecific(fem, PETSCFE_CLASSID, 1);
7744dfa11a4SJacob Faibussowitsch PetscCall(PetscNew(&ocl));
77520cf1dd8SToby Isaac fem->data = ocl;
77620cf1dd8SToby Isaac
77720cf1dd8SToby Isaac /* Init Platform */
7789566063dSJacob Faibussowitsch PetscCall(clGetPlatformIDs(42, platform_ids, &num_platforms));
77928b400f6SJacob Faibussowitsch PetscCheck(num_platforms, PetscObjectComm((PetscObject)fem), PETSC_ERR_SUP, "No OpenCL platform found.");
78020cf1dd8SToby Isaac ocl->pf_id = platform_ids[0];
78120cf1dd8SToby Isaac /* Init Device */
7829566063dSJacob Faibussowitsch PetscCall(clGetDeviceIDs(ocl->pf_id, CL_DEVICE_TYPE_ALL, 42, device_ids, &num_devices));
78328b400f6SJacob Faibussowitsch PetscCheck(num_devices, PetscObjectComm((PetscObject)fem), PETSC_ERR_SUP, "No OpenCL device found.");
78420cf1dd8SToby Isaac ocl->dev_id = device_ids[0];
78520cf1dd8SToby Isaac /* Create context with one command queue */
786f4f49eeaSPierre Jolivet ocl->ctx_id = clCreateContext(0, 1, &ocl->dev_id, NULL, NULL, &err);
7879371c9d4SSatish Balay PetscCall(err);
7889371c9d4SSatish Balay ocl->queue_id = clCreateCommandQueue(ocl->ctx_id, ocl->dev_id, CL_QUEUE_PROFILING_ENABLE, &err);
7899371c9d4SSatish Balay PetscCall(err);
79020cf1dd8SToby Isaac /* Types */
79120cf1dd8SToby Isaac ocl->realType = PETSC_FLOAT;
79220cf1dd8SToby Isaac /* Register events */
7939566063dSJacob Faibussowitsch PetscCall(PetscLogEventRegister("OpenCL FEResidual", PETSCFE_CLASSID, &ocl->residualEvent));
79420cf1dd8SToby Isaac /* Equation handling */
79520cf1dd8SToby Isaac ocl->op = LAPLACIAN;
79620cf1dd8SToby Isaac
7979566063dSJacob Faibussowitsch PetscCall(PetscFEInitialize_OpenCL(fem));
7983ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
79920cf1dd8SToby Isaac }
80020cf1dd8SToby Isaac
8012b99622eSMatthew G. Knepley /*@
802dce8aebaSBarry Smith PetscFEOpenCLSetRealType - Set the scalar type for running on the OpenCL accelerator
8032b99622eSMatthew G. Knepley
8042b99622eSMatthew G. Knepley Input Parameters:
805dce8aebaSBarry Smith + fem - The `PetscFE`
8062b99622eSMatthew G. Knepley - realType - The scalar type
8072b99622eSMatthew G. Knepley
8082b99622eSMatthew G. Knepley Level: developer
8092b99622eSMatthew G. Knepley
810dce8aebaSBarry Smith .seealso: `PetscFE`, `PetscFEOpenCLGetRealType()`
8112b99622eSMatthew G. Knepley @*/
PetscFEOpenCLSetRealType(PetscFE fem,PetscDataType realType)812d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscFEOpenCLSetRealType(PetscFE fem, PetscDataType realType)
813d71ae5a4SJacob Faibussowitsch {
81420cf1dd8SToby Isaac PetscFE_OpenCL *ocl = (PetscFE_OpenCL *)fem->data;
81520cf1dd8SToby Isaac
81620cf1dd8SToby Isaac PetscFunctionBegin;
81720cf1dd8SToby Isaac PetscValidHeaderSpecific(fem, PETSCFE_CLASSID, 1);
81820cf1dd8SToby Isaac ocl->realType = realType;
8193ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
82020cf1dd8SToby Isaac }
82120cf1dd8SToby Isaac
8222b99622eSMatthew G. Knepley /*@
823dce8aebaSBarry Smith PetscFEOpenCLGetRealType - Get the scalar type for running on the OpenCL accelerator
8242b99622eSMatthew G. Knepley
8252b99622eSMatthew G. Knepley Input Parameter:
826dce8aebaSBarry Smith . fem - The `PetscFE`
8272b99622eSMatthew G. Knepley
8282b99622eSMatthew G. Knepley Output Parameter:
8292b99622eSMatthew G. Knepley . realType - The scalar type
8302b99622eSMatthew G. Knepley
8312b99622eSMatthew G. Knepley Level: developer
8322b99622eSMatthew G. Knepley
833dce8aebaSBarry Smith .seealso: `PetscFE`, `PetscFEOpenCLSetRealType()`
8342b99622eSMatthew G. Knepley @*/
PetscFEOpenCLGetRealType(PetscFE fem,PetscDataType * realType)835d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscFEOpenCLGetRealType(PetscFE fem, PetscDataType *realType)
836d71ae5a4SJacob Faibussowitsch {
83720cf1dd8SToby Isaac PetscFE_OpenCL *ocl = (PetscFE_OpenCL *)fem->data;
83820cf1dd8SToby Isaac
83920cf1dd8SToby Isaac PetscFunctionBegin;
84020cf1dd8SToby Isaac PetscValidHeaderSpecific(fem, PETSCFE_CLASSID, 1);
8414f572ea9SToby Isaac PetscAssertPointer(realType, 2);
84220cf1dd8SToby Isaac *realType = ocl->realType;
8433ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
84420cf1dd8SToby Isaac }
84520cf1dd8SToby Isaac
84620cf1dd8SToby Isaac #endif /* PETSC_HAVE_OPENCL */
847