1 #include <petsc/private/petscfeimpl.h> /*I "petscfe.h" I*/ 2 3 #if defined(PETSC_HAVE_OPENCL) 4 5 static PetscErrorCode PetscFEDestroy_OpenCL(PetscFE fem) 6 { 7 PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data; 8 PetscErrorCode ierr; 9 10 PetscFunctionBegin; 11 ierr = clReleaseCommandQueue(ocl->queue_id);CHKERRQ(ierr); 12 ocl->queue_id = 0; 13 ierr = clReleaseContext(ocl->ctx_id);CHKERRQ(ierr); 14 ocl->ctx_id = 0; 15 ierr = PetscFree(ocl);CHKERRQ(ierr); 16 PetscFunctionReturn(0); 17 } 18 19 #define STRING_ERROR_CHECK(MSG) do {CHKERRQ(ierr); string_tail += count; if (string_tail == end_of_buffer) SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_SUP, MSG);} while(0) 20 enum {LAPLACIAN = 0, ELASTICITY = 1}; 21 22 /* NOTE: This is now broken for vector problems. Must redo loops to respect vector basis elements */ 23 /* dim Number of spatial dimensions: 2 */ 24 /* N_b Number of basis functions: generated */ 25 /* N_{bt} Number of total basis functions: N_b * N_{comp} */ 26 /* N_q Number of quadrature points: generated */ 27 /* N_{bs} Number of block cells LCM(N_b, N_q) */ 28 /* N_{bst} Number of block cell components LCM(N_{bt}, N_q) */ 29 /* N_{bl} Number of concurrent blocks generated */ 30 /* N_t Number of threads: N_{bl} * N_{bs} */ 31 /* N_{cbc} Number of concurrent basis cells: N_{bl} * N_q */ 32 /* N_{cqc} Number of concurrent quadrature cells: N_{bl} * N_b */ 33 /* N_{sbc} Number of serial basis cells: N_{bs} / N_q */ 34 /* N_{sqc} Number of serial quadrature cells: N_{bs} / N_b */ 35 /* N_{cb} Number of serial cell batches: input */ 36 /* N_c Number of total cells: N_{cb}*N_{t}/N_{comp} */ 37 static PetscErrorCode PetscFEOpenCLGenerateIntegrationCode(PetscFE fem, char **string_buffer, PetscInt buffer_length, PetscBool useAux, PetscInt N_bl) 38 { 39 PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data; 40 PetscQuadrature q; 41 char *string_tail = *string_buffer; 42 char *end_of_buffer = *string_buffer + buffer_length; 43 char float_str[] = "float", double_str[] = "double"; 44 char *numeric_str = &(float_str[0]); 45 PetscInt op = ocl->op; 46 PetscBool useField = PETSC_FALSE; 47 PetscBool useFieldDer = PETSC_TRUE; 48 PetscBool useFieldAux = useAux; 49 PetscBool useFieldDerAux= PETSC_FALSE; 50 PetscBool useF0 = PETSC_TRUE; 51 PetscBool useF1 = PETSC_TRUE; 52 const PetscReal *points, *weights; 53 PetscTabulation T; 54 PetscInt dim, qNc, N_b, N_c, N_q, N_t, p, d, b, c; 55 size_t count; 56 PetscErrorCode ierr; 57 58 PetscFunctionBegin; 59 ierr = PetscFEGetSpatialDimension(fem, &dim);CHKERRQ(ierr); 60 ierr = PetscFEGetDimension(fem, &N_b);CHKERRQ(ierr); 61 ierr = PetscFEGetNumComponents(fem, &N_c);CHKERRQ(ierr); 62 ierr = PetscFEGetQuadrature(fem, &q);CHKERRQ(ierr); 63 ierr = PetscQuadratureGetData(q, NULL, &qNc, &N_q, &points, &weights);CHKERRQ(ierr); 64 if (qNc != 1) SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_SUP, "Only supports scalar quadrature, not %D components\n", qNc); 65 N_t = N_b * N_c * N_q * N_bl; 66 /* Enable device extension for double precision */ 67 if (ocl->realType == PETSC_DOUBLE) { 68 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 69 "#if defined(cl_khr_fp64)\n" 70 "# pragma OPENCL EXTENSION cl_khr_fp64: enable\n" 71 "#elif defined(cl_amd_fp64)\n" 72 "# pragma OPENCL EXTENSION cl_amd_fp64: enable\n" 73 "#endif\n", 74 &count);STRING_ERROR_CHECK("Message to short"); 75 numeric_str = &(double_str[0]); 76 } 77 /* Kernel API */ 78 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 79 "\n" 80 "__kernel void integrateElementQuadrature(int N_cb, __global %s *coefficients, __global %s *coefficientsAux, __global %s *jacobianInverses, __global %s *jacobianDeterminants, __global %s *elemVec)\n" 81 "{\n", 82 &count, numeric_str, numeric_str, numeric_str, numeric_str, numeric_str);STRING_ERROR_CHECK("Message to short"); 83 /* Quadrature */ 84 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 85 " /* Quadrature points\n" 86 " - (x1,y1,x2,y2,...) */\n" 87 " const %s points[%d] = {\n", 88 &count, numeric_str, N_q*dim);STRING_ERROR_CHECK("Message to short"); 89 for (p = 0; p < N_q; ++p) { 90 for (d = 0; d < dim; ++d) { 91 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g,\n", &count, points[p*dim+d]);STRING_ERROR_CHECK("Message to short"); 92 } 93 } 94 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count);STRING_ERROR_CHECK("Message to short"); 95 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 96 " /* Quadrature weights\n" 97 " - (v1,v2,...) */\n" 98 " const %s weights[%d] = {\n", 99 &count, numeric_str, N_q);STRING_ERROR_CHECK("Message to short"); 100 for (p = 0; p < N_q; ++p) { 101 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g,\n", &count, weights[p]);STRING_ERROR_CHECK("Message to short"); 102 } 103 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count);STRING_ERROR_CHECK("Message to short"); 104 /* Basis Functions */ 105 ierr = PetscFEGetCellTabulation(fem, &T);CHKERRQ(ierr); 106 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 107 " /* Nodal basis function evaluations\n" 108 " - basis component is fastest varying, the basis function, then point */\n" 109 " const %s Basis[%d] = {\n", 110 &count, numeric_str, N_q*N_b*N_c);STRING_ERROR_CHECK("Message to short"); 111 for (p = 0; p < N_q; ++p) { 112 for (b = 0; b < N_b; ++b) { 113 for (c = 0; c < N_c; ++c) { 114 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g,\n", &count, T->T[0][(p*N_b + b)*N_c + c]);STRING_ERROR_CHECK("Message to short"); 115 } 116 } 117 } 118 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count);STRING_ERROR_CHECK("Message to short"); 119 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 120 "\n" 121 " /* Nodal basis function derivative evaluations,\n" 122 " - derivative direction is fastest varying, then basis component, then basis function, then point */\n" 123 " const %s%d BasisDerivatives[%d] = {\n", 124 &count, numeric_str, dim, N_q*N_b*N_c);STRING_ERROR_CHECK("Message to short"); 125 for (p = 0; p < N_q; ++p) { 126 for (b = 0; b < N_b; ++b) { 127 for (c = 0; c < N_c; ++c) { 128 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "(%s%d)(", &count, numeric_str, dim);STRING_ERROR_CHECK("Message to short"); 129 for (d = 0; d < dim; ++d) { 130 if (d > 0) { 131 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, ", %g", &count, T->T[1][((p*N_b + b)*dim + d)*N_c + c]);STRING_ERROR_CHECK("Message to short"); 132 } else { 133 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g", &count, T->T[1][((p*N_b + b)*dim + d)*N_c + c]);STRING_ERROR_CHECK("Message to short"); 134 } 135 } 136 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "),\n", &count);STRING_ERROR_CHECK("Message to short"); 137 } 138 } 139 } 140 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count);STRING_ERROR_CHECK("Message to short"); 141 /* Sizes */ 142 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 143 " const int dim = %d; // The spatial dimension\n" 144 " const int N_bl = %d; // The number of concurrent blocks\n" 145 " const int N_b = %d; // The number of basis functions\n" 146 " const int N_comp = %d; // The number of basis function components\n" 147 " const int N_bt = N_b*N_comp; // The total number of scalar basis functions\n" 148 " const int N_q = %d; // The number of quadrature points\n" 149 " const int N_bst = N_bt*N_q; // The block size, LCM(N_b*N_comp, N_q), Notice that a block is not processed simultaneously\n" 150 " const int N_t = N_bst*N_bl; // The number of threads, N_bst * N_bl\n" 151 " const int N_bc = N_t/N_comp; // The number of cells per batch (N_b*N_q*N_bl)\n" 152 " const int N_sbc = N_bst / (N_q * N_comp);\n" 153 " const int N_sqc = N_bst / N_bt;\n" 154 " /*const int N_c = N_cb * N_bc;*/\n" 155 "\n" 156 " /* Calculated indices */\n" 157 " /*const int tidx = get_local_id(0) + get_local_size(0)*get_local_id(1);*/\n" 158 " const int tidx = get_local_id(0);\n" 159 " const int blidx = tidx / N_bst; // Block number for this thread\n" 160 " const int bidx = tidx %% N_bt; // Basis function mapped to this thread\n" 161 " const int cidx = tidx %% N_comp; // Basis component mapped to this thread\n" 162 " const int qidx = tidx %% N_q; // Quadrature point mapped to this thread\n" 163 " const int blbidx = tidx %% N_q + blidx*N_q; // Cell mapped to this thread in the basis phase\n" 164 " const int blqidx = tidx %% N_b + blidx*N_b; // Cell mapped to this thread in the quadrature phase\n" 165 " const int gidx = get_group_id(1)*get_num_groups(0) + get_group_id(0);\n" 166 " const int Goffset = gidx*N_cb*N_bc;\n", 167 &count, dim, N_bl, N_b, N_c, N_q);STRING_ERROR_CHECK("Message to short"); 168 /* Local memory */ 169 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 170 "\n" 171 " /* Quadrature data */\n" 172 " %s w; // $w_q$, Quadrature weight at $x_q$\n" 173 " __local %s phi_i[%d]; //[N_bt*N_q]; // $\\phi_i(x_q)$, Value of the basis function $i$ at $x_q$\n" 174 " __local %s%d phiDer_i[%d]; //[N_bt*N_q]; // $\\frac{\\partial\\phi_i(x_q)}{\\partial x_d}$, Value of the derivative of basis function $i$ in direction $x_d$ at $x_q$\n" 175 " /* Geometric data */\n" 176 " __local %s detJ[%d]; //[N_t]; // $|J(x_q)|$, Jacobian determinant at $x_q$\n" 177 " __local %s invJ[%d];//[N_t*dim*dim]; // $J^{-1}(x_q)$, Jacobian inverse at $x_q$\n", 178 &count, numeric_str, numeric_str, N_b*N_c*N_q, numeric_str, dim, N_b*N_c*N_q, numeric_str, N_t, 179 numeric_str, N_t*dim*dim, numeric_str, N_t*N_b*N_c);STRING_ERROR_CHECK("Message to short"); 180 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 181 " /* FEM data */\n" 182 " __local %s u_i[%d]; //[N_t*N_bt]; // Coefficients $u_i$ of the field $u|_{\\mathcal{T}} = \\sum_i u_i \\phi_i$\n", 183 &count, numeric_str, N_t*N_b*N_c);STRING_ERROR_CHECK("Message to short"); 184 if (useAux) { 185 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 186 " __local %s a_i[%d]; //[N_t]; // Coefficients $a_i$ of the auxiliary field $a|_{\\mathcal{T}} = \\sum_i a_i \\phi^R_i$\n", 187 &count, numeric_str, N_t);STRING_ERROR_CHECK("Message to short"); 188 } 189 if (useF0) { 190 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 191 " /* Intermediate calculations */\n" 192 " __local %s f_0[%d]; //[N_t*N_sqc]; // $f_0(u(x_q), \\nabla u(x_q)) |J(x_q)| w_q$\n", 193 &count, numeric_str, N_t*N_q);STRING_ERROR_CHECK("Message to short"); 194 } 195 if (useF1) { 196 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 197 " __local %s%d f_1[%d]; //[N_t*N_sqc]; // $f_1(u(x_q), \\nabla u(x_q)) |J(x_q)| w_q$\n", 198 &count, numeric_str, dim, N_t*N_q);STRING_ERROR_CHECK("Message to short"); 199 } 200 /* TODO: If using elasticity, put in mu/lambda coefficients */ 201 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 202 " /* Output data */\n" 203 " %s e_i; // Coefficient $e_i$ of the residual\n\n", 204 &count, numeric_str);STRING_ERROR_CHECK("Message to short"); 205 /* One-time loads */ 206 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 207 " /* These should be generated inline */\n" 208 " /* Load quadrature weights */\n" 209 " w = weights[qidx];\n" 210 " /* Load basis tabulation \\phi_i for this cell */\n" 211 " if (tidx < N_bt*N_q) {\n" 212 " phi_i[tidx] = Basis[tidx];\n" 213 " phiDer_i[tidx] = BasisDerivatives[tidx];\n" 214 " }\n\n", 215 &count);STRING_ERROR_CHECK("Message to short"); 216 /* Batch loads */ 217 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 218 " for (int batch = 0; batch < N_cb; ++batch) {\n" 219 " /* Load geometry */\n" 220 " detJ[tidx] = jacobianDeterminants[Goffset+batch*N_bc+tidx];\n" 221 " for (int n = 0; n < dim*dim; ++n) {\n" 222 " const int offset = n*N_t;\n" 223 " invJ[offset+tidx] = jacobianInverses[(Goffset+batch*N_bc)*dim*dim+offset+tidx];\n" 224 " }\n" 225 " /* Load coefficients u_i for this cell */\n" 226 " for (int n = 0; n < N_bt; ++n) {\n" 227 " const int offset = n*N_t;\n" 228 " u_i[offset+tidx] = coefficients[(Goffset*N_bt)+batch*N_t*N_b+offset+tidx];\n" 229 " }\n", 230 &count);STRING_ERROR_CHECK("Message to short"); 231 if (useAux) { 232 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 233 " /* Load coefficients a_i for this cell */\n" 234 " /* TODO: This should not be N_t here, it should be N_bc*N_comp_aux */\n" 235 " a_i[tidx] = coefficientsAux[Goffset+batch*N_t+tidx];\n", 236 &count);STRING_ERROR_CHECK("Message to short"); 237 } 238 /* Quadrature phase */ 239 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 240 " barrier(CLK_LOCAL_MEM_FENCE);\n" 241 "\n" 242 " /* Map coefficients to values at quadrature points */\n" 243 " for (int c = 0; c < N_sqc; ++c) {\n" 244 " const int cell = c*N_bl*N_b + blqidx;\n" 245 " const int fidx = (cell*N_q + qidx)*N_comp + cidx;\n", 246 &count);STRING_ERROR_CHECK("Message to short"); 247 if (useField) { 248 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 249 " %s u[%d]; //[N_comp]; // $u(x_q)$, Value of the field at $x_q$\n", 250 &count, numeric_str, N_c);STRING_ERROR_CHECK("Message to short"); 251 } 252 if (useFieldDer) { 253 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 254 " %s%d gradU[%d]; //[N_comp]; // $\\nabla u(x_q)$, Value of the field gradient at $x_q$\n", 255 &count, numeric_str, dim, N_c);STRING_ERROR_CHECK("Message to short"); 256 } 257 if (useFieldAux) { 258 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 259 " %s a[%d]; //[1]; // $a(x_q)$, Value of the auxiliary fields at $x_q$\n", 260 &count, numeric_str, 1);STRING_ERROR_CHECK("Message to short"); 261 } 262 if (useFieldDerAux) { 263 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 264 " %s%d gradA[%d]; //[1]; // $\\nabla a(x_q)$, Value of the auxiliary field gradient at $x_q$\n", 265 &count, numeric_str, dim, 1);STRING_ERROR_CHECK("Message to short"); 266 } 267 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 268 "\n" 269 " for (int comp = 0; comp < N_comp; ++comp) {\n", 270 &count);STRING_ERROR_CHECK("Message to short"); 271 if (useField) {ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " u[comp] = 0.0;\n", &count);STRING_ERROR_CHECK("Message to short");} 272 if (useFieldDer) { 273 switch (dim) { 274 case 1: 275 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradU[comp].x = 0.0;\n", &count);STRING_ERROR_CHECK("Message to short");break; 276 case 2: 277 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradU[comp].x = 0.0; gradU[comp].y = 0.0;\n", &count);STRING_ERROR_CHECK("Message to short");break; 278 case 3: 279 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradU[comp].x = 0.0; gradU[comp].y = 0.0; gradU[comp].z = 0.0;\n", &count);STRING_ERROR_CHECK("Message to short");break; 280 } 281 } 282 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 283 " }\n", 284 &count);STRING_ERROR_CHECK("Message to short"); 285 if (useFieldAux) { 286 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " a[0] = 0.0;\n", &count);STRING_ERROR_CHECK("Message to short"); 287 } 288 if (useFieldDerAux) { 289 switch (dim) { 290 case 1: 291 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradA[0].x = 0.0;\n", &count);STRING_ERROR_CHECK("Message to short");break; 292 case 2: 293 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradA[0].x = 0.0; gradA[0].y = 0.0;\n", &count);STRING_ERROR_CHECK("Message to short");break; 294 case 3: 295 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradA[0].x = 0.0; gradA[0].y = 0.0; gradA[0].z = 0.0;\n", &count);STRING_ERROR_CHECK("Message to short");break; 296 } 297 } 298 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 299 " /* Get field and derivatives at this quadrature point */\n" 300 " for (int i = 0; i < N_b; ++i) {\n" 301 " for (int comp = 0; comp < N_comp; ++comp) {\n" 302 " const int b = i*N_comp+comp;\n" 303 " const int pidx = qidx*N_bt + b;\n" 304 " const int uidx = cell*N_bt + b;\n" 305 " %s%d realSpaceDer;\n\n", 306 &count, numeric_str, dim);STRING_ERROR_CHECK("Message to short"); 307 if (useField) {ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail," u[comp] += u_i[uidx]*phi_i[pidx];\n", &count);STRING_ERROR_CHECK("Message to short");} 308 if (useFieldDer) { 309 switch (dim) { 310 case 2: 311 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 312 " realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y;\n" 313 " gradU[comp].x += u_i[uidx]*realSpaceDer.x;\n" 314 " realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y;\n" 315 " gradU[comp].y += u_i[uidx]*realSpaceDer.y;\n", 316 &count);STRING_ERROR_CHECK("Message to short");break; 317 case 3: 318 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 319 " realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+0]*phiDer_i[pidx].z;\n" 320 " gradU[comp].x += u_i[uidx]*realSpaceDer.x;\n" 321 " realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+1]*phiDer_i[pidx].z;\n" 322 " gradU[comp].y += u_i[uidx]*realSpaceDer.y;\n" 323 " realSpaceDer.z = invJ[cell*dim*dim+0*dim+2]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+2]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+2]*phiDer_i[pidx].z;\n" 324 " gradU[comp].z += u_i[uidx]*realSpaceDer.z;\n", 325 &count);STRING_ERROR_CHECK("Message to short");break; 326 } 327 } 328 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 329 " }\n" 330 " }\n", 331 &count);STRING_ERROR_CHECK("Message to short"); 332 if (useFieldAux) { 333 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail," a[0] += a_i[cell];\n", &count);STRING_ERROR_CHECK("Message to short"); 334 } 335 /* Calculate residual at quadrature points: Should be generated by an weak form egine */ 336 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 337 " /* Process values at quadrature points */\n", 338 &count);STRING_ERROR_CHECK("Message to short"); 339 switch (op) { 340 case LAPLACIAN: 341 if (useF0) {ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_0[fidx] = 4.0;\n", &count);STRING_ERROR_CHECK("Message to short");} 342 if (useF1) { 343 if (useAux) {ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_1[fidx] = a[0]*gradU[cidx];\n", &count);STRING_ERROR_CHECK("Message to short");} 344 else {ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_1[fidx] = gradU[cidx];\n", &count);STRING_ERROR_CHECK("Message to short");} 345 } 346 break; 347 case ELASTICITY: 348 if (useF0) {ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_0[fidx] = 4.0;\n", &count);STRING_ERROR_CHECK("Message to short");} 349 if (useF1) { 350 switch (dim) { 351 case 2: 352 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 353 " switch (cidx) {\n" 354 " case 0:\n" 355 " f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[0].x + gradU[0].x);\n" 356 " f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[0].y + gradU[1].x);\n" 357 " break;\n" 358 " case 1:\n" 359 " f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[1].x + gradU[0].y);\n" 360 " f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[1].y + gradU[1].y);\n" 361 " }\n", 362 &count);STRING_ERROR_CHECK("Message to short");break; 363 case 3: 364 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 365 " switch (cidx) {\n" 366 " case 0:\n" 367 " f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[0].x + gradU[0].x);\n" 368 " f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[0].y + gradU[1].x);\n" 369 " f_1[fidx].z = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[0].z + gradU[2].x);\n" 370 " break;\n" 371 " case 1:\n" 372 " f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[1].x + gradU[0].y);\n" 373 " f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[1].y + gradU[1].y);\n" 374 " f_1[fidx].z = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[1].y + gradU[2].y);\n" 375 " break;\n" 376 " case 2:\n" 377 " f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[2].x + gradU[0].z);\n" 378 " f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[2].y + gradU[1].z);\n" 379 " f_1[fidx].z = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[2].y + gradU[2].z);\n" 380 " }\n", 381 &count);STRING_ERROR_CHECK("Message to short");break; 382 }} 383 break; 384 default: 385 SETERRQ1(PETSC_COMM_WORLD, PETSC_ERR_SUP, "PDE operator %d is not supported", op); 386 } 387 if (useF0) {ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail," f_0[fidx] *= detJ[cell]*w;\n", &count);STRING_ERROR_CHECK("Message to short");} 388 if (useF1) { 389 switch (dim) { 390 case 1: 391 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail," f_1[fidx].x *= detJ[cell]*w;\n", &count);STRING_ERROR_CHECK("Message to short");break; 392 case 2: 393 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail," f_1[fidx].x *= detJ[cell]*w; f_1[fidx].y *= detJ[cell]*w;\n", &count);STRING_ERROR_CHECK("Message to short");break; 394 case 3: 395 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail," f_1[fidx].x *= detJ[cell]*w; f_1[fidx].y *= detJ[cell]*w; f_1[fidx].z *= detJ[cell]*w;\n", &count);STRING_ERROR_CHECK("Message to short");break; 396 } 397 } 398 /* Thread transpose */ 399 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 400 " }\n\n" 401 " /* ==== TRANSPOSE THREADS ==== */\n" 402 " barrier(CLK_LOCAL_MEM_FENCE);\n\n", 403 &count);STRING_ERROR_CHECK("Message to short"); 404 /* Basis phase */ 405 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 406 " /* Map values at quadrature points to coefficients */\n" 407 " for (int c = 0; c < N_sbc; ++c) {\n" 408 " const int cell = c*N_bl*N_q + blbidx; /* Cell number in batch */\n" 409 "\n" 410 " e_i = 0.0;\n" 411 " for (int q = 0; q < N_q; ++q) {\n" 412 " const int pidx = q*N_bt + bidx;\n" 413 " const int fidx = (cell*N_q + q)*N_comp + cidx;\n" 414 " %s%d realSpaceDer;\n\n", 415 &count, numeric_str, dim);STRING_ERROR_CHECK("Message to short"); 416 417 if (useF0) {ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail," e_i += phi_i[pidx]*f_0[fidx];\n", &count);STRING_ERROR_CHECK("Message to short");} 418 if (useF1) { 419 switch (dim) { 420 case 2: 421 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 422 " realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y;\n" 423 " e_i += realSpaceDer.x*f_1[fidx].x;\n" 424 " realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y;\n" 425 " e_i += realSpaceDer.y*f_1[fidx].y;\n", 426 &count);STRING_ERROR_CHECK("Message to short");break; 427 case 3: 428 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 429 " realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+0]*phiDer_i[pidx].z;\n" 430 " e_i += realSpaceDer.x*f_1[fidx].x;\n" 431 " realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+1]*phiDer_i[pidx].z;\n" 432 " e_i += realSpaceDer.y*f_1[fidx].y;\n" 433 " realSpaceDer.z = invJ[cell*dim*dim+0*dim+2]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+2]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+2]*phiDer_i[pidx].z;\n" 434 " e_i += realSpaceDer.z*f_1[fidx].z;\n", 435 &count);STRING_ERROR_CHECK("Message to short");break; 436 } 437 } 438 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, 439 " }\n" 440 " /* Write element vector for N_{cbc} cells at a time */\n" 441 " elemVec[(Goffset + batch*N_bc + c*N_bl*N_q)*N_bt + tidx] = e_i;\n" 442 " }\n" 443 " /* ==== Could do one write per batch ==== */\n" 444 " }\n" 445 " return;\n" 446 "}\n", 447 &count);STRING_ERROR_CHECK("Message to short"); 448 PetscFunctionReturn(0); 449 } 450 451 static PetscErrorCode PetscFEOpenCLGetIntegrationKernel(PetscFE fem, PetscBool useAux, cl_program *ocl_prog, cl_kernel *ocl_kernel) 452 { 453 PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data; 454 PetscInt dim, N_bl; 455 PetscBool flg; 456 char *buffer; 457 size_t len; 458 char errMsg[8192]; 459 cl_int ierr2; 460 PetscErrorCode ierr; 461 462 PetscFunctionBegin; 463 ierr = PetscFEGetSpatialDimension(fem, &dim);CHKERRQ(ierr); 464 ierr = PetscMalloc1(8192, &buffer);CHKERRQ(ierr); 465 ierr = PetscFEGetTileSizes(fem, NULL, &N_bl, NULL, NULL);CHKERRQ(ierr); 466 ierr = PetscFEOpenCLGenerateIntegrationCode(fem, &buffer, 8192, useAux, N_bl);CHKERRQ(ierr); 467 ierr = PetscOptionsHasName(((PetscObject)fem)->options,((PetscObject)fem)->prefix, "-petscfe_opencl_kernel_print", &flg);CHKERRQ(ierr); 468 if (flg) {ierr = PetscPrintf(PetscObjectComm((PetscObject) fem), "OpenCL FE Integration Kernel:\n%s\n", buffer);CHKERRQ(ierr);} 469 len = strlen(buffer); 470 *ocl_prog = clCreateProgramWithSource(ocl->ctx_id, 1, (const char **) &buffer, &len, &ierr2);CHKERRQ(ierr2); 471 ierr = clBuildProgram(*ocl_prog, 0, NULL, NULL, NULL, NULL); 472 if (ierr != CL_SUCCESS) { 473 ierr = clGetProgramBuildInfo(*ocl_prog, ocl->dev_id, CL_PROGRAM_BUILD_LOG, 8192*sizeof(char), &errMsg, NULL);CHKERRQ(ierr); 474 SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Build failed! Log:\n %s", errMsg); 475 } 476 ierr = PetscFree(buffer);CHKERRQ(ierr); 477 *ocl_kernel = clCreateKernel(*ocl_prog, "integrateElementQuadrature", &ierr);CHKERRQ(ierr); 478 PetscFunctionReturn(0); 479 } 480 481 static PetscErrorCode PetscFEOpenCLCalculateGrid(PetscFE fem, PetscInt N, PetscInt blockSize, size_t *x, size_t *y, size_t *z) 482 { 483 const PetscInt Nblocks = N/blockSize; 484 485 PetscFunctionBegin; 486 if (N % blockSize) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid block size %d for %d elements", blockSize, N); 487 *z = 1; 488 *y = 1; 489 for (*x = (size_t) (PetscSqrtReal(Nblocks) + 0.5); *x > 0; --*x) { 490 *y = Nblocks / *x; 491 if (*x * *y == Nblocks) break; 492 } 493 if (*x * *y != Nblocks) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Could not find partition for %d with block size %d", N, blockSize); 494 PetscFunctionReturn(0); 495 } 496 497 static PetscErrorCode PetscFEOpenCLLogResidual(PetscFE fem, PetscLogDouble time, PetscLogDouble flops) 498 { 499 PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data; 500 PetscStageLog stageLog; 501 PetscEventPerfLog eventLog = NULL; 502 int stage; 503 PetscErrorCode ierr; 504 505 PetscFunctionBegin; 506 ierr = PetscLogGetStageLog(&stageLog);CHKERRQ(ierr); 507 ierr = PetscStageLogGetCurrent(stageLog, &stage);CHKERRQ(ierr); 508 ierr = PetscStageLogGetEventPerfLog(stageLog, stage, &eventLog);CHKERRQ(ierr); 509 /* Log performance info */ 510 eventLog->eventInfo[ocl->residualEvent].count++; 511 eventLog->eventInfo[ocl->residualEvent].time += time; 512 eventLog->eventInfo[ocl->residualEvent].flops += flops; 513 PetscFunctionReturn(0); 514 } 515 516 static PetscErrorCode PetscFEIntegrateResidual_OpenCL(PetscDS prob, PetscInt field, PetscInt Ne, PetscFEGeom *cgeom, 517 const PetscScalar coefficients[], const PetscScalar coefficients_t[], PetscDS probAux, const PetscScalar coefficientsAux[], PetscReal t, PetscScalar elemVec[]) 518 { 519 /* Nbc = batchSize */ 520 PetscFE fem; 521 PetscFE_OpenCL *ocl; 522 PetscPointFunc f0_func; 523 PetscPointFunc f1_func; 524 PetscQuadrature q; 525 PetscInt dim, qNc; 526 PetscInt N_b; /* The number of basis functions */ 527 PetscInt N_comp; /* The number of basis function components */ 528 PetscInt N_bt; /* The total number of scalar basis functions */ 529 PetscInt N_q; /* The number of quadrature points */ 530 PetscInt N_bst; /* The block size, LCM(N_bt, N_q), Notice that a block is not process simultaneously */ 531 PetscInt N_t; /* The number of threads, N_bst * N_bl */ 532 PetscInt N_bl; /* The number of blocks */ 533 PetscInt N_bc; /* The batch size, N_bl*N_q*N_b */ 534 PetscInt N_cb; /* The number of batches */ 535 PetscInt numFlops, f0Flops = 0, f1Flops = 0; 536 PetscBool useAux = probAux ? PETSC_TRUE : PETSC_FALSE; 537 PetscBool useField = PETSC_FALSE; 538 PetscBool useFieldDer = PETSC_TRUE; 539 PetscBool useF0 = PETSC_TRUE; 540 PetscBool useF1 = PETSC_TRUE; 541 /* OpenCL variables */ 542 cl_program ocl_prog; 543 cl_kernel ocl_kernel; 544 cl_event ocl_ev; /* The event for tracking kernel execution */ 545 cl_ulong ns_start; /* Nanoseconds counter on GPU at kernel start */ 546 cl_ulong ns_end; /* Nanoseconds counter on GPU at kernel stop */ 547 cl_mem o_jacobianInverses, o_jacobianDeterminants; 548 cl_mem o_coefficients, o_coefficientsAux, o_elemVec; 549 float *f_coeff = NULL, *f_coeffAux = NULL, *f_invJ = NULL, *f_detJ = NULL; 550 double *d_coeff = NULL, *d_coeffAux = NULL, *d_invJ = NULL, *d_detJ = NULL; 551 PetscReal *r_invJ = NULL, *r_detJ = NULL; 552 void *oclCoeff, *oclCoeffAux, *oclInvJ, *oclDetJ; 553 size_t local_work_size[3], global_work_size[3]; 554 size_t realSize, x, y, z; 555 const PetscReal *points, *weights; 556 PetscErrorCode ierr; 557 558 PetscFunctionBegin; 559 ierr = PetscDSGetDiscretization(prob, field, (PetscObject *) &fem);CHKERRQ(ierr); 560 ocl = (PetscFE_OpenCL *) fem->data; 561 if (!Ne) {ierr = PetscFEOpenCLLogResidual(fem, 0.0, 0.0);CHKERRQ(ierr); PetscFunctionReturn(0);} 562 ierr = PetscFEGetSpatialDimension(fem, &dim);CHKERRQ(ierr); 563 ierr = PetscFEGetQuadrature(fem, &q);CHKERRQ(ierr); 564 ierr = PetscQuadratureGetData(q, NULL, &qNc, &N_q, &points, &weights);CHKERRQ(ierr); 565 if (qNc != 1) SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_SUP, "Only supports scalar quadrature, not %D components\n", qNc); 566 ierr = PetscFEGetDimension(fem, &N_b);CHKERRQ(ierr); 567 ierr = PetscFEGetNumComponents(fem, &N_comp);CHKERRQ(ierr); 568 ierr = PetscDSGetResidual(prob, field, &f0_func, &f1_func);CHKERRQ(ierr); 569 ierr = PetscFEGetTileSizes(fem, NULL, &N_bl, &N_bc, &N_cb);CHKERRQ(ierr); 570 N_bt = N_b*N_comp; 571 N_bst = N_bt*N_q; 572 N_t = N_bst*N_bl; 573 if (N_bc*N_comp != N_t) SETERRQ3(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Number of threads %d should be %d * %d", N_t, N_bc, N_comp); 574 /* Calculate layout */ 575 if (Ne % (N_cb*N_bc)) { /* Remainder cells */ 576 ierr = PetscFEIntegrateResidual_Basic(prob, field, Ne, cgeom, coefficients, coefficients_t, probAux, coefficientsAux, t, elemVec);CHKERRQ(ierr); 577 PetscFunctionReturn(0); 578 } 579 ierr = PetscFEOpenCLCalculateGrid(fem, Ne, N_cb*N_bc, &x, &y, &z);CHKERRQ(ierr); 580 local_work_size[0] = N_bc*N_comp; 581 local_work_size[1] = 1; 582 local_work_size[2] = 1; 583 global_work_size[0] = x * local_work_size[0]; 584 global_work_size[1] = y * local_work_size[1]; 585 global_work_size[2] = z * local_work_size[2]; 586 ierr = PetscInfo7(fem, "GPU layout grid(%d,%d,%d) block(%d,%d,%d) with %d batches\n", x, y, z, local_work_size[0], local_work_size[1], local_work_size[2], N_cb);CHKERRQ(ierr); 587 ierr = PetscInfo2(fem, " N_t: %d, N_cb: %d\n", N_t, N_cb); 588 /* Generate code */ 589 if (probAux) { 590 PetscSpace P; 591 PetscInt NfAux, order, f; 592 593 ierr = PetscDSGetNumFields(probAux, &NfAux);CHKERRQ(ierr); 594 for (f = 0; f < NfAux; ++f) { 595 PetscFE feAux; 596 597 ierr = PetscDSGetDiscretization(probAux, f, (PetscObject *) &feAux);CHKERRQ(ierr); 598 ierr = PetscFEGetBasisSpace(feAux, &P);CHKERRQ(ierr); 599 ierr = PetscSpaceGetDegree(P, &order, NULL);CHKERRQ(ierr); 600 if (order > 0) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Can only handle P0 coefficient fields"); 601 } 602 } 603 ierr = PetscFEOpenCLGetIntegrationKernel(fem, useAux, &ocl_prog, &ocl_kernel);CHKERRQ(ierr); 604 /* Create buffers on the device and send data over */ 605 ierr = PetscDataTypeGetSize(ocl->realType, &realSize);CHKERRQ(ierr); 606 if (cgeom->numPoints > 1) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Only support affine geometry for OpenCL integration right now"); 607 if (sizeof(PetscReal) != realSize) { 608 switch (ocl->realType) { 609 case PETSC_FLOAT: 610 { 611 PetscInt c, b, d; 612 613 ierr = PetscMalloc4(Ne*N_bt,&f_coeff,Ne,&f_coeffAux,Ne*dim*dim,&f_invJ,Ne,&f_detJ);CHKERRQ(ierr); 614 for (c = 0; c < Ne; ++c) { 615 f_detJ[c] = (float) cgeom->detJ[c]; 616 for (d = 0; d < dim*dim; ++d) { 617 f_invJ[c*dim*dim+d] = (float) cgeom->invJ[c * dim * dim + d]; 618 } 619 for (b = 0; b < N_bt; ++b) { 620 f_coeff[c*N_bt+b] = (float) coefficients[c*N_bt+b]; 621 } 622 } 623 if (coefficientsAux) { /* Assume P0 */ 624 for (c = 0; c < Ne; ++c) { 625 f_coeffAux[c] = (float) coefficientsAux[c]; 626 } 627 } 628 oclCoeff = (void *) f_coeff; 629 if (coefficientsAux) { 630 oclCoeffAux = (void *) f_coeffAux; 631 } else { 632 oclCoeffAux = NULL; 633 } 634 oclInvJ = (void *) f_invJ; 635 oclDetJ = (void *) f_detJ; 636 } 637 break; 638 case PETSC_DOUBLE: 639 { 640 PetscInt c, b, d; 641 642 ierr = PetscMalloc4(Ne*N_bt,&d_coeff,Ne,&d_coeffAux,Ne*dim*dim,&d_invJ,Ne,&d_detJ);CHKERRQ(ierr); 643 for (c = 0; c < Ne; ++c) { 644 d_detJ[c] = (double) cgeom->detJ[c]; 645 for (d = 0; d < dim*dim; ++d) { 646 d_invJ[c*dim*dim+d] = (double) cgeom->invJ[c * dim * dim + d]; 647 } 648 for (b = 0; b < N_bt; ++b) { 649 d_coeff[c*N_bt+b] = (double) coefficients[c*N_bt+b]; 650 } 651 } 652 if (coefficientsAux) { /* Assume P0 */ 653 for (c = 0; c < Ne; ++c) { 654 d_coeffAux[c] = (double) coefficientsAux[c]; 655 } 656 } 657 oclCoeff = (void *) d_coeff; 658 if (coefficientsAux) { 659 oclCoeffAux = (void *) d_coeffAux; 660 } else { 661 oclCoeffAux = NULL; 662 } 663 oclInvJ = (void *) d_invJ; 664 oclDetJ = (void *) d_detJ; 665 } 666 break; 667 default: 668 SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unsupported PETSc type %d", ocl->realType); 669 } 670 } else { 671 PetscInt c, d; 672 673 ierr = PetscMalloc2(Ne*dim*dim,&r_invJ,Ne,&r_detJ);CHKERRQ(ierr); 674 for (c = 0; c < Ne; ++c) { 675 r_detJ[c] = cgeom->detJ[c]; 676 for (d = 0; d < dim*dim; ++d) { 677 r_invJ[c*dim*dim+d] = cgeom->invJ[c * dim * dim + d]; 678 } 679 } 680 oclCoeff = (void *) coefficients; 681 oclCoeffAux = (void *) coefficientsAux; 682 oclInvJ = (void *) r_invJ; 683 oclDetJ = (void *) r_detJ; 684 } 685 o_coefficients = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne*N_bt * realSize, oclCoeff, &ierr);CHKERRQ(ierr); 686 if (coefficientsAux) { 687 o_coefficientsAux = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne * realSize, oclCoeffAux, &ierr);CHKERRQ(ierr); 688 } else { 689 o_coefficientsAux = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY, Ne * realSize, oclCoeffAux, &ierr);CHKERRQ(ierr); 690 } 691 o_jacobianInverses = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne*dim*dim * realSize, oclInvJ, &ierr);CHKERRQ(ierr); 692 o_jacobianDeterminants = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne * realSize, oclDetJ, &ierr);CHKERRQ(ierr); 693 o_elemVec = clCreateBuffer(ocl->ctx_id, CL_MEM_WRITE_ONLY, Ne*N_bt * realSize, NULL, &ierr);CHKERRQ(ierr); 694 /* Kernel launch */ 695 ierr = clSetKernelArg(ocl_kernel, 0, sizeof(cl_int), (void*) &N_cb);CHKERRQ(ierr); 696 ierr = clSetKernelArg(ocl_kernel, 1, sizeof(cl_mem), (void*) &o_coefficients);CHKERRQ(ierr); 697 ierr = clSetKernelArg(ocl_kernel, 2, sizeof(cl_mem), (void*) &o_coefficientsAux);CHKERRQ(ierr); 698 ierr = clSetKernelArg(ocl_kernel, 3, sizeof(cl_mem), (void*) &o_jacobianInverses);CHKERRQ(ierr); 699 ierr = clSetKernelArg(ocl_kernel, 4, sizeof(cl_mem), (void*) &o_jacobianDeterminants);CHKERRQ(ierr); 700 ierr = clSetKernelArg(ocl_kernel, 5, sizeof(cl_mem), (void*) &o_elemVec);CHKERRQ(ierr); 701 ierr = clEnqueueNDRangeKernel(ocl->queue_id, ocl_kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &ocl_ev);CHKERRQ(ierr); 702 /* Read data back from device */ 703 if (sizeof(PetscReal) != realSize) { 704 switch (ocl->realType) { 705 case PETSC_FLOAT: 706 { 707 float *elem; 708 PetscInt c, b; 709 710 ierr = PetscFree4(f_coeff,f_coeffAux,f_invJ,f_detJ);CHKERRQ(ierr); 711 ierr = PetscMalloc1(Ne*N_bt, &elem);CHKERRQ(ierr); 712 ierr = clEnqueueReadBuffer(ocl->queue_id, o_elemVec, CL_TRUE, 0, Ne*N_bt * realSize, elem, 0, NULL, NULL);CHKERRQ(ierr); 713 for (c = 0; c < Ne; ++c) { 714 for (b = 0; b < N_bt; ++b) { 715 elemVec[c*N_bt+b] = (PetscScalar) elem[c*N_bt+b]; 716 } 717 } 718 ierr = PetscFree(elem);CHKERRQ(ierr); 719 } 720 break; 721 case PETSC_DOUBLE: 722 { 723 double *elem; 724 PetscInt c, b; 725 726 ierr = PetscFree4(d_coeff,d_coeffAux,d_invJ,d_detJ);CHKERRQ(ierr); 727 ierr = PetscMalloc1(Ne*N_bt, &elem);CHKERRQ(ierr); 728 ierr = clEnqueueReadBuffer(ocl->queue_id, o_elemVec, CL_TRUE, 0, Ne*N_bt * realSize, elem, 0, NULL, NULL);CHKERRQ(ierr); 729 for (c = 0; c < Ne; ++c) { 730 for (b = 0; b < N_bt; ++b) { 731 elemVec[c*N_bt+b] = (PetscScalar) elem[c*N_bt+b]; 732 } 733 } 734 ierr = PetscFree(elem);CHKERRQ(ierr); 735 } 736 break; 737 default: 738 SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unsupported PETSc type %d", ocl->realType); 739 } 740 } else { 741 ierr = PetscFree2(r_invJ,r_detJ);CHKERRQ(ierr); 742 ierr = clEnqueueReadBuffer(ocl->queue_id, o_elemVec, CL_TRUE, 0, Ne*N_bt * realSize, elemVec, 0, NULL, NULL);CHKERRQ(ierr); 743 } 744 /* Log performance */ 745 ierr = clGetEventProfilingInfo(ocl_ev, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &ns_start, NULL);CHKERRQ(ierr); 746 ierr = clGetEventProfilingInfo(ocl_ev, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &ns_end, NULL);CHKERRQ(ierr); 747 f0Flops = 0; 748 switch (ocl->op) { 749 case LAPLACIAN: 750 f1Flops = useAux ? dim : 0;break; 751 case ELASTICITY: 752 f1Flops = 2*dim*dim;break; 753 } 754 numFlops = Ne*( 755 N_q*( 756 N_b*N_comp*((useField ? 2 : 0) + (useFieldDer ? 2*dim*(dim + 1) : 0)) 757 /*+ 758 N_ba*N_compa*((useFieldAux ? 2 : 0) + (useFieldDerAux ? 2*dim*(dim + 1) : 0))*/ 759 + 760 N_comp*((useF0 ? f0Flops + 2 : 0) + (useF1 ? f1Flops + 2*dim : 0))) 761 + 762 N_b*((useF0 ? 2 : 0) + (useF1 ? 2*dim*(dim + 1) : 0))); 763 ierr = PetscFEOpenCLLogResidual(fem, (ns_end - ns_start)*1.0e-9, numFlops);CHKERRQ(ierr); 764 /* Cleanup */ 765 ierr = clReleaseMemObject(o_coefficients);CHKERRQ(ierr); 766 ierr = clReleaseMemObject(o_coefficientsAux);CHKERRQ(ierr); 767 ierr = clReleaseMemObject(o_jacobianInverses);CHKERRQ(ierr); 768 ierr = clReleaseMemObject(o_jacobianDeterminants);CHKERRQ(ierr); 769 ierr = clReleaseMemObject(o_elemVec);CHKERRQ(ierr); 770 ierr = clReleaseKernel(ocl_kernel);CHKERRQ(ierr); 771 ierr = clReleaseProgram(ocl_prog);CHKERRQ(ierr); 772 PetscFunctionReturn(0); 773 } 774 775 PETSC_EXTERN PetscErrorCode PetscFESetUp_Basic(PetscFE); 776 PETSC_EXTERN PetscErrorCode PetscFECreateTabulation_Basic(PetscFE, PetscInt, const PetscReal [], PetscInt, PetscTabulation); 777 778 static PetscErrorCode PetscFEInitialize_OpenCL(PetscFE fem) 779 { 780 PetscFunctionBegin; 781 fem->ops->setfromoptions = NULL; 782 fem->ops->setup = PetscFESetUp_Basic; 783 fem->ops->view = NULL; 784 fem->ops->destroy = PetscFEDestroy_OpenCL; 785 fem->ops->getdimension = PetscFEGetDimension_Basic; 786 fem->ops->createtabulation = PetscFECreateTabulation_Basic; 787 fem->ops->integrateresidual = PetscFEIntegrateResidual_OpenCL; 788 fem->ops->integratebdresidual = NULL/* PetscFEIntegrateBdResidual_OpenCL */; 789 fem->ops->integratejacobianaction = NULL/* PetscFEIntegrateJacobianAction_OpenCL */; 790 fem->ops->integratejacobian = PetscFEIntegrateJacobian_Basic; 791 PetscFunctionReturn(0); 792 } 793 794 /*MC 795 PETSCFEOPENCL = "opencl" - A PetscFE object that integrates using a vectorized OpenCL implementation 796 797 Level: intermediate 798 799 .seealso: PetscFEType, PetscFECreate(), PetscFESetType() 800 M*/ 801 802 PETSC_EXTERN PetscErrorCode PetscFECreate_OpenCL(PetscFE fem) 803 { 804 PetscFE_OpenCL *ocl; 805 cl_uint num_platforms; 806 cl_platform_id platform_ids[42]; 807 cl_uint num_devices; 808 cl_device_id device_ids[42]; 809 cl_int ierr2; 810 PetscErrorCode ierr; 811 812 PetscFunctionBegin; 813 PetscValidHeaderSpecific(fem, PETSCFE_CLASSID, 1); 814 ierr = PetscNewLog(fem,&ocl);CHKERRQ(ierr); 815 fem->data = ocl; 816 817 /* Init Platform */ 818 ierr = clGetPlatformIDs(42, platform_ids, &num_platforms);CHKERRQ(ierr); 819 if (!num_platforms) SETERRQ(PetscObjectComm((PetscObject) fem), PETSC_ERR_SUP, "No OpenCL platform found."); 820 ocl->pf_id = platform_ids[0]; 821 /* Init Device */ 822 ierr = clGetDeviceIDs(ocl->pf_id, CL_DEVICE_TYPE_ALL, 42, device_ids, &num_devices);CHKERRQ(ierr); 823 if (!num_devices) SETERRQ(PetscObjectComm((PetscObject) fem), PETSC_ERR_SUP, "No OpenCL device found."); 824 ocl->dev_id = device_ids[0]; 825 /* Create context with one command queue */ 826 ocl->ctx_id = clCreateContext(0, 1, &(ocl->dev_id), NULL, NULL, &ierr2);CHKERRQ(ierr2); 827 ocl->queue_id = clCreateCommandQueue(ocl->ctx_id, ocl->dev_id, CL_QUEUE_PROFILING_ENABLE, &ierr2);CHKERRQ(ierr2); 828 /* Types */ 829 ocl->realType = PETSC_FLOAT; 830 /* Register events */ 831 ierr = PetscLogEventRegister("OpenCL FEResidual", PETSCFE_CLASSID, &ocl->residualEvent);CHKERRQ(ierr); 832 /* Equation handling */ 833 ocl->op = LAPLACIAN; 834 835 ierr = PetscFEInitialize_OpenCL(fem);CHKERRQ(ierr); 836 PetscFunctionReturn(0); 837 } 838 839 /*@ 840 PetscFEOpenCLSetRealType - Set the scalar type for running on the accelerator 841 842 Input Parameters: 843 + fem - The PetscFE 844 - realType - The scalar type 845 846 Level: developer 847 848 .seealso: PetscFEOpenCLGetRealType() 849 @*/ 850 PetscErrorCode PetscFEOpenCLSetRealType(PetscFE fem, PetscDataType realType) 851 { 852 PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data; 853 854 PetscFunctionBegin; 855 PetscValidHeaderSpecific(fem, PETSCFE_CLASSID, 1); 856 ocl->realType = realType; 857 PetscFunctionReturn(0); 858 } 859 860 /*@ 861 PetscFEOpenCLGetRealType - Get the scalar type for running on the accelerator 862 863 Input Parameter: 864 . fem - The PetscFE 865 866 Output Parameter: 867 . realType - The scalar type 868 869 Level: developer 870 871 .seealso: PetscFEOpenCLSetRealType() 872 @*/ 873 PetscErrorCode PetscFEOpenCLGetRealType(PetscFE fem, PetscDataType *realType) 874 { 875 PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data; 876 877 PetscFunctionBegin; 878 PetscValidHeaderSpecific(fem, PETSCFE_CLASSID, 1); 879 PetscValidPointer(realType, 2); 880 *realType = ocl->realType; 881 PetscFunctionReturn(0); 882 } 883 884 #endif /* PETSC_HAVE_OPENCL */ 885