1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 3 // All Rights reserved. See files LICENSE and NOTICE for details. 4 // 5 // This file is part of CEED, a collection of benchmarks, miniapps, software 6 // libraries and APIs for efficient high-order finite element and spectral 7 // element discretizations for exascale applications. For more information and 8 // source code availability see http://github.com/ceed. 9 // 10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11 // a collaborative effort of two U.S. Department of Energy organizations (Office 12 // of Science and the National Nuclear Security Administration) responsible for 13 // the planning and preparation of a capable exascale ecosystem, including 14 // software, applications, hardware, advanced system engineering and early 15 // testbed platforms, in support of the nation's exascale computing imperative. 16 17 #include <ceed/ceed.h> 18 #include <ceed/backend.h> 19 #include <hip/hip_runtime.h> 20 #include "ceed-hip-ref.h" 21 #include "../hip/ceed-hip-compile.h" 22 23 //------------------------------------------------------------------------------ 24 // Tensor Basis Kernels 25 //------------------------------------------------------------------------------ 26 // *INDENT-OFF* 27 static const char *basiskernels = QUOTE( 28 29 //------------------------------------------------------------------------------ 30 // Interp 31 //------------------------------------------------------------------------------ 32 extern "C" __global__ void interp(const CeedInt nelem, const int transpose, 33 const CeedScalar *__restrict__ interp1d, 34 const CeedScalar *__restrict__ u, 35 CeedScalar *__restrict__ v) { 36 const CeedInt i = threadIdx.x; 37 38 __shared__ CeedScalar s_mem[BASIS_Q1D * BASIS_P1D + 2 * BASIS_BUF_LEN]; 39 CeedScalar *s_interp1d = s_mem; 40 CeedScalar *s_buf1 = s_mem + BASIS_Q1D * BASIS_P1D; 41 CeedScalar *s_buf2 = s_buf1 + BASIS_BUF_LEN; 42 for (CeedInt k = i; k < BASIS_Q1D * BASIS_P1D; k += blockDim.x) { 43 s_interp1d[k] = interp1d[k]; 44 } 45 46 const CeedInt P = transpose ? BASIS_Q1D : BASIS_P1D; 47 const CeedInt Q = transpose ? BASIS_P1D : BASIS_Q1D; 48 const CeedInt stride0 = transpose ? 1 : BASIS_P1D; 49 const CeedInt stride1 = transpose ? BASIS_P1D : 1; 50 const CeedInt u_stride = transpose ? BASIS_NQPT : BASIS_ELEMSIZE; 51 const CeedInt v_stride = transpose ? BASIS_ELEMSIZE : BASIS_NQPT; 52 const CeedInt u_comp_stride = nelem * (transpose ? BASIS_NQPT : BASIS_ELEMSIZE); 53 const CeedInt v_comp_stride = nelem * (transpose ? BASIS_ELEMSIZE : BASIS_NQPT); 54 const CeedInt u_size = transpose ? BASIS_NQPT : BASIS_ELEMSIZE; 55 56 // Apply basis element by element 57 for (CeedInt elem = blockIdx.x; elem < nelem; elem += gridDim.x) { 58 for (CeedInt comp = 0; comp < BASIS_NCOMP; ++comp) { 59 const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride; 60 CeedScalar *cur_v = v + elem * v_stride + comp * v_comp_stride; 61 for (CeedInt k = i; k < u_size; k += blockDim.x) { 62 s_buf1[k] = cur_u[k]; 63 } 64 CeedInt pre = u_size; 65 CeedInt post = 1; 66 for (CeedInt d = 0; d < BASIS_DIM; d++) { 67 __syncthreads(); 68 // Update buffers used 69 pre /= P; 70 const CeedScalar *in = d % 2 ? s_buf2 : s_buf1; 71 CeedScalar *out = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buf1 : s_buf2); 72 73 // Contract along middle index 74 const CeedInt writeLen = pre * post * Q; 75 for (CeedInt k = i; k < writeLen; k += blockDim.x) { 76 const CeedInt c = k % post; 77 const CeedInt j = (k / post) % Q; 78 const CeedInt a = k / (post * Q); 79 80 CeedScalar vk = 0; 81 for (CeedInt b = 0; b < P; b++) 82 vk += s_interp1d[j*stride0 + b*stride1] * in[(a*P + b)*post + c]; 83 84 out[k] = vk; 85 } 86 87 post *= Q; 88 } 89 } 90 } 91 } 92 93 //------------------------------------------------------------------------------ 94 // Grad 95 //------------------------------------------------------------------------------ 96 extern "C" __global__ void grad(const CeedInt nelem, const int transpose, 97 const CeedScalar *__restrict__ interp1d, 98 const CeedScalar *__restrict__ grad1d, 99 const CeedScalar *__restrict__ u, 100 CeedScalar *__restrict__ v) { 101 const CeedInt i = threadIdx.x; 102 103 __shared__ CeedScalar s_mem[2 * (BASIS_Q1D * BASIS_P1D + BASIS_BUF_LEN)]; 104 CeedScalar *s_interp1d = s_mem; 105 CeedScalar *s_grad1d = s_interp1d + BASIS_Q1D * BASIS_P1D; 106 CeedScalar *s_buf1 = s_grad1d + BASIS_Q1D * BASIS_P1D; 107 CeedScalar *s_buf2 = s_buf1 + BASIS_BUF_LEN; 108 for (CeedInt k = i; k < BASIS_Q1D * BASIS_P1D; k += blockDim.x) { 109 s_interp1d[k] = interp1d[k]; 110 s_grad1d[k] = grad1d[k]; 111 } 112 113 const CeedInt P = transpose ? BASIS_Q1D : BASIS_P1D; 114 const CeedInt Q = transpose ? BASIS_P1D : BASIS_Q1D; 115 const CeedInt stride0 = transpose ? 1 : BASIS_P1D; 116 const CeedInt stride1 = transpose ? BASIS_P1D : 1; 117 const CeedInt u_stride = transpose ? BASIS_NQPT : BASIS_ELEMSIZE; 118 const CeedInt v_stride = transpose ? BASIS_ELEMSIZE : BASIS_NQPT; 119 const CeedInt u_comp_stride = nelem * (transpose ? BASIS_NQPT : BASIS_ELEMSIZE); 120 const CeedInt v_comp_stride = nelem * (transpose ? BASIS_ELEMSIZE : BASIS_NQPT); 121 const CeedInt u_dim_stride = transpose ? nelem * BASIS_NQPT * BASIS_NCOMP : 0; 122 const CeedInt v_dim_stride = transpose ? 0 : nelem * BASIS_NQPT * BASIS_NCOMP; 123 124 // Apply basis element by element 125 for (CeedInt elem = blockIdx.x; elem < nelem; elem += gridDim.x) { 126 for (CeedInt comp = 0; comp < BASIS_NCOMP; ++comp) { 127 128 // dim*dim contractions for grad 129 for (CeedInt dim1 = 0; dim1 < BASIS_DIM; dim1++) { 130 CeedInt pre = transpose ? BASIS_NQPT : BASIS_ELEMSIZE; 131 CeedInt post = 1; 132 const CeedScalar *cur_u = u + elem * u_stride + dim1 * u_dim_stride + 133 comp * u_comp_stride; 134 CeedScalar *cur_v = v + elem * v_stride + dim1 * v_dim_stride + comp * 135 v_comp_stride; 136 for (CeedInt dim2 = 0; dim2 < BASIS_DIM; dim2++) { 137 __syncthreads(); 138 // Update buffers used 139 pre /= P; 140 const CeedScalar *op = dim1 == dim2 ? s_grad1d : s_interp1d; 141 const CeedScalar *in = dim2 == 0 ? cur_u : (dim2 % 2 ? s_buf2 : s_buf1); 142 CeedScalar *out = dim2 == BASIS_DIM - 1 ? cur_v : (dim2 % 2 ? s_buf1 : s_buf2); 143 144 // Contract along middle index 145 const CeedInt writeLen = pre * post * Q; 146 for (CeedInt k = i; k < writeLen; k += blockDim.x) { 147 const CeedInt c = k % post; 148 const CeedInt j = (k / post) % Q; 149 const CeedInt a = k / (post * Q); 150 CeedScalar vk = 0; 151 for (CeedInt b = 0; b < P; b++) 152 vk += op[j * stride0 + b * stride1] * in[(a * P + b) * post + c]; 153 154 if (transpose && dim2 == BASIS_DIM - 1) 155 out[k] += vk; 156 else 157 out[k] = vk; 158 } 159 160 post *= Q; 161 } 162 } 163 } 164 } 165 } 166 167 //------------------------------------------------------------------------------ 168 // 1D quadrature weights 169 //------------------------------------------------------------------------------ 170 __device__ void weight1d(const CeedInt nelem, const CeedScalar *qweight1d, 171 CeedScalar *w) { 172 CeedScalar w1d[BASIS_Q1D]; 173 for (int i = 0; i < BASIS_Q1D; ++i) 174 w1d[i] = qweight1d[i]; 175 176 for (int e = blockIdx.x * blockDim.x + threadIdx.x; 177 e < nelem; 178 e += blockDim.x * gridDim.x) 179 for (int i = 0; i < BASIS_Q1D; ++i) { 180 const int ind = e*BASIS_Q1D + i; // sequential 181 w[ind] = w1d[i]; 182 } 183 } 184 185 //------------------------------------------------------------------------------ 186 // 2D quadrature weights 187 //------------------------------------------------------------------------------ 188 __device__ void weight2d(const CeedInt nelem, const CeedScalar *qweight1d, 189 CeedScalar *w) { 190 CeedScalar w1d[BASIS_Q1D]; 191 for (int i = 0; i < BASIS_Q1D; ++i) 192 w1d[i] = qweight1d[i]; 193 194 for (int e = blockIdx.x * blockDim.x + threadIdx.x; 195 e < nelem; 196 e += blockDim.x * gridDim.x) 197 for (int i = 0; i < BASIS_Q1D; ++i) 198 for (int j = 0; j < BASIS_Q1D; ++j) { 199 const int ind = e*BASIS_Q1D*BASIS_Q1D + i + j*BASIS_Q1D; // sequential 200 w[ind] = w1d[i]*w1d[j]; 201 } 202 } 203 204 //------------------------------------------------------------------------------ 205 // 3D quadrature weights 206 //------------------------------------------------------------------------------ 207 __device__ void weight3d(const CeedInt nelem, const CeedScalar *qweight1d, 208 CeedScalar *w) { 209 CeedScalar w1d[BASIS_Q1D]; 210 for (int i = 0; i < BASIS_Q1D; ++i) 211 w1d[i] = qweight1d[i]; 212 213 for (int e = blockIdx.x * blockDim.x + threadIdx.x; 214 e < nelem; 215 e += blockDim.x * gridDim.x) 216 for (int i = 0; i < BASIS_Q1D; ++i) 217 for (int j = 0; j < BASIS_Q1D; ++j) 218 for (int k = 0; k < BASIS_Q1D; ++k) { 219 const int ind = e*BASIS_Q1D*BASIS_Q1D*BASIS_Q1D + i + j*BASIS_Q1D + 220 k*BASIS_Q1D*BASIS_Q1D; // sequential 221 w[ind] = w1d[i]*w1d[j]*w1d[k]; 222 } 223 } 224 225 //------------------------------------------------------------------------------ 226 // Quadrature weights 227 //------------------------------------------------------------------------------ 228 extern "C" __global__ void weight(const CeedInt nelem, 229 const CeedScalar *__restrict__ qweight1d, 230 CeedScalar *__restrict__ v) { 231 if (BASIS_DIM==1) 232 weight1d(nelem, qweight1d, v); 233 else if (BASIS_DIM==2) 234 weight2d(nelem, qweight1d, v); 235 else if (BASIS_DIM==3) 236 weight3d(nelem, qweight1d, v); 237 } 238 239 ); 240 241 //------------------------------------------------------------------------------ 242 // Non-Tensor Basis Kernels 243 //------------------------------------------------------------------------------ 244 static const char *kernelsNonTensorRef = QUOTE( 245 246 //------------------------------------------------------------------------------ 247 // Interp 248 //------------------------------------------------------------------------------ 249 extern "C" __global__ void interp(const CeedInt nelem, const int transpose, 250 const CeedScalar *d_B, 251 const CeedScalar *__restrict__ d_U, 252 CeedScalar *__restrict__ d_V) { 253 const int tid = threadIdx.x; 254 255 const CeedScalar *U; 256 CeedScalar V; 257 //TODO load B in shared memory if blockDim.z > 1? 258 259 for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; 260 elem += gridDim.x*blockDim.z) { 261 for (int comp = 0; comp < BASIS_NCOMP; comp++) { 262 if (!transpose) { // run with Q threads 263 U = d_U + elem*P + comp*nelem*P; 264 V = 0.0; 265 for (int i = 0; i < P; ++i) 266 V += d_B[i + tid*P]*U[i]; 267 268 d_V[elem*Q + comp*nelem*Q + tid] = V; 269 } else { // run with P threads 270 U = d_U + elem*Q + comp*nelem*Q; 271 V = 0.0; 272 for (int i = 0; i < Q; ++i) 273 V += d_B[tid + i*P]*U[i]; 274 275 d_V[elem*P + comp*nelem*P + tid] = V; 276 } 277 } 278 } 279 } 280 281 //------------------------------------------------------------------------------ 282 // Grad 283 //------------------------------------------------------------------------------ 284 extern "C" __global__ void grad(const CeedInt nelem, const int transpose, 285 const CeedScalar *d_G, 286 const CeedScalar *__restrict__ d_U, 287 CeedScalar *__restrict__ d_V) { 288 const int tid = threadIdx.x; 289 290 const CeedScalar *U; 291 //TODO load G in shared memory if blockDim.z > 1? 292 293 for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; 294 elem += gridDim.x*blockDim.z) { 295 for (int comp=0; comp<BASIS_NCOMP; comp++) { 296 if (!transpose) { // run with Q threads 297 CeedScalar V[BASIS_DIM]; 298 U = d_U + elem*P + comp*nelem*P; 299 for (int dim = 0; dim < BASIS_DIM; dim++) 300 V[dim] = 0.0; 301 302 for (int i = 0; i < P; ++i) { 303 const CeedScalar val = U[i]; 304 for(int dim = 0; dim < BASIS_DIM; dim++) 305 V[dim] += d_G[i + tid*P + dim*P*Q]*val; 306 } 307 for (int dim = 0; dim < BASIS_DIM; dim++) { 308 d_V[elem*Q + comp*nelem*Q + dim*BASIS_NCOMP*nelem*Q + tid] = V[dim]; 309 } 310 } else { // run with P threads 311 CeedScalar V = 0.0; 312 for (int dim = 0; dim < BASIS_DIM; dim++) { 313 U = d_U + elem*Q + comp*nelem*Q +dim*BASIS_NCOMP*nelem*Q; 314 for (int i = 0; i < Q; ++i) 315 V += d_G[tid + i*P + dim*P*Q]*U[i]; 316 } 317 d_V[elem*P + comp*nelem*P + tid] = V; 318 } 319 } 320 } 321 } 322 323 //------------------------------------------------------------------------------ 324 // Weight 325 //------------------------------------------------------------------------------ 326 extern "C" __global__ void weight(const CeedInt nelem, 327 const CeedScalar *__restrict__ qweight, 328 CeedScalar *__restrict__ d_V) { 329 const int tid = threadIdx.x; 330 //TODO load qweight in shared memory if blockDim.z > 1? 331 for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; 332 elem += gridDim.x*blockDim.z) { 333 d_V[elem*Q + tid] = qweight[tid]; 334 } 335 } 336 337 ); 338 // *INDENT-ON* 339 340 //------------------------------------------------------------------------------ 341 // Basis apply - tensor 342 //------------------------------------------------------------------------------ 343 int CeedBasisApply_Hip(CeedBasis basis, const CeedInt nelem, 344 CeedTransposeMode tmode, 345 CeedEvalMode emode, CeedVector u, CeedVector v) { 346 int ierr; 347 Ceed ceed; 348 ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 349 Ceed_Hip *ceed_Hip; 350 ierr = CeedGetData(ceed, &ceed_Hip); CeedChkBackend(ierr); 351 CeedBasis_Hip *data; 352 ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr); 353 const CeedInt transpose = tmode == CEED_TRANSPOSE; 354 const int maxblocksize = 64; 355 356 // Read vectors 357 const CeedScalar *d_u; 358 CeedScalar *d_v; 359 if (emode != CEED_EVAL_WEIGHT) { 360 ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr); 361 } 362 ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr); 363 364 // Clear v for transpose operation 365 if (tmode == CEED_TRANSPOSE) { 366 CeedInt length; 367 ierr = CeedVectorGetLength(v, &length); CeedChkBackend(ierr); 368 ierr = hipMemset(d_v, 0, length * sizeof(CeedScalar)); 369 CeedChk_Hip(ceed,ierr); 370 } 371 372 // Basis action 373 switch (emode) { 374 case CEED_EVAL_INTERP: { 375 void *interpargs[] = {(void *) &nelem, (void *) &transpose, 376 &data->d_interp1d, &d_u, &d_v 377 }; 378 CeedInt Q1d, dim; 379 ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr); 380 ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); 381 CeedInt blocksize = CeedIntPow(Q1d, dim); 382 blocksize = blocksize > maxblocksize ? maxblocksize : blocksize; 383 384 ierr = CeedRunKernelHip(ceed, data->interp, nelem, blocksize, interpargs); 385 CeedChkBackend(ierr); 386 } break; 387 case CEED_EVAL_GRAD: { 388 void *gradargs[] = {(void *) &nelem, (void *) &transpose, &data->d_interp1d, 389 &data->d_grad1d, &d_u, &d_v 390 }; 391 CeedInt blocksize = maxblocksize; 392 393 ierr = CeedRunKernelHip(ceed, data->grad, nelem, blocksize, gradargs); 394 CeedChkBackend(ierr); 395 } break; 396 case CEED_EVAL_WEIGHT: { 397 void *weightargs[] = {(void *) &nelem, (void *) &data->d_qweight1d, &d_v}; 398 const int blocksize = 64; 399 int gridsize = nelem/blocksize; 400 if (blocksize * gridsize < nelem) 401 gridsize += 1; 402 403 ierr = CeedRunKernelHip(ceed, data->weight, gridsize, blocksize, 404 weightargs); CeedChkBackend(ierr); 405 } break; 406 // LCOV_EXCL_START 407 // Evaluate the divergence to/from the quadrature points 408 case CEED_EVAL_DIV: 409 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); 410 // Evaluate the curl to/from the quadrature points 411 case CEED_EVAL_CURL: 412 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); 413 // Take no action, BasisApply should not have been called 414 case CEED_EVAL_NONE: 415 return CeedError(ceed, CEED_ERROR_BACKEND, 416 "CEED_EVAL_NONE does not make sense in this context"); 417 // LCOV_EXCL_STOP 418 } 419 420 // Restore vectors 421 if (emode != CEED_EVAL_WEIGHT) { 422 ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr); 423 } 424 ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr); 425 return CEED_ERROR_SUCCESS; 426 } 427 428 //------------------------------------------------------------------------------ 429 // Basis apply - non-tensor 430 //------------------------------------------------------------------------------ 431 int CeedBasisApplyNonTensor_Hip(CeedBasis basis, const CeedInt nelem, 432 CeedTransposeMode tmode, CeedEvalMode emode, 433 CeedVector u, CeedVector v) { 434 int ierr; 435 Ceed ceed; 436 ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 437 Ceed_Hip *ceed_Hip; 438 ierr = CeedGetData(ceed, &ceed_Hip); CeedChkBackend(ierr); 439 CeedBasisNonTensor_Hip *data; 440 ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr); 441 CeedInt nnodes, nqpt; 442 ierr = CeedBasisGetNumQuadraturePoints(basis, &nqpt); CeedChkBackend(ierr); 443 ierr = CeedBasisGetNumNodes(basis, &nnodes); CeedChkBackend(ierr); 444 const CeedInt transpose = tmode == CEED_TRANSPOSE; 445 int elemsPerBlock = 1; 446 int grid = nelem/elemsPerBlock+((nelem/elemsPerBlock*elemsPerBlock<nelem)?1:0); 447 448 // Read vectors 449 const CeedScalar *d_u; 450 CeedScalar *d_v; 451 if (emode != CEED_EVAL_WEIGHT) { 452 ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr); 453 } 454 ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr); 455 456 // Clear v for transpose operation 457 if (tmode == CEED_TRANSPOSE) { 458 CeedInt length; 459 ierr = CeedVectorGetLength(v, &length); CeedChkBackend(ierr); 460 ierr = hipMemset(d_v, 0, length * sizeof(CeedScalar)); 461 CeedChk_Hip(ceed, ierr); 462 } 463 464 // Apply basis operation 465 switch (emode) { 466 case CEED_EVAL_INTERP: { 467 void *interpargs[] = {(void *) &nelem, (void *) &transpose, 468 &data->d_interp, &d_u, &d_v 469 }; 470 if (!transpose) { 471 ierr = CeedRunKernelDimHip(ceed, data->interp, grid, nqpt, 1, 472 elemsPerBlock, interpargs); CeedChkBackend(ierr); 473 } else { 474 ierr = CeedRunKernelDimHip(ceed, data->interp, grid, nnodes, 1, 475 elemsPerBlock, interpargs); CeedChkBackend(ierr); 476 } 477 } break; 478 case CEED_EVAL_GRAD: { 479 void *gradargs[] = {(void *) &nelem, (void *) &transpose, &data->d_grad, 480 &d_u, &d_v 481 }; 482 if (!transpose) { 483 ierr = CeedRunKernelDimHip(ceed, data->grad, grid, nqpt, 1, 484 elemsPerBlock, gradargs); CeedChkBackend(ierr); 485 } else { 486 ierr = CeedRunKernelDimHip(ceed, data->grad, grid, nnodes, 1, 487 elemsPerBlock, gradargs); CeedChkBackend(ierr); 488 } 489 } break; 490 case CEED_EVAL_WEIGHT: { 491 void *weightargs[] = {(void *) &nelem, (void *) &data->d_qweight, &d_v}; 492 ierr = CeedRunKernelDimHip(ceed, data->weight, grid, nqpt, 1, 493 elemsPerBlock, weightargs); CeedChkBackend(ierr); 494 } break; 495 // LCOV_EXCL_START 496 // Evaluate the divergence to/from the quadrature points 497 case CEED_EVAL_DIV: 498 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); 499 // Evaluate the curl to/from the quadrature points 500 case CEED_EVAL_CURL: 501 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); 502 // Take no action, BasisApply should not have been called 503 case CEED_EVAL_NONE: 504 return CeedError(ceed, CEED_ERROR_BACKEND, 505 "CEED_EVAL_NONE does not make sense in this context"); 506 // LCOV_EXCL_STOP 507 } 508 509 // Restore vectors 510 if (emode != CEED_EVAL_WEIGHT) { 511 ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr); 512 } 513 ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr); 514 return CEED_ERROR_SUCCESS; 515 } 516 517 //------------------------------------------------------------------------------ 518 // Destroy tensor basis 519 //------------------------------------------------------------------------------ 520 static int CeedBasisDestroy_Hip(CeedBasis basis) { 521 int ierr; 522 Ceed ceed; 523 ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 524 525 CeedBasis_Hip *data; 526 ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr); 527 528 CeedChk_Hip(ceed, hipModuleUnload(data->module)); 529 530 ierr = hipFree(data->d_qweight1d); CeedChk_Hip(ceed,ierr); 531 ierr = hipFree(data->d_interp1d); CeedChk_Hip(ceed,ierr); 532 ierr = hipFree(data->d_grad1d); CeedChk_Hip(ceed,ierr); 533 534 ierr = CeedFree(&data); CeedChkBackend(ierr); 535 return CEED_ERROR_SUCCESS; 536 } 537 538 //------------------------------------------------------------------------------ 539 // Destroy non-tensor basis 540 //------------------------------------------------------------------------------ 541 static int CeedBasisDestroyNonTensor_Hip(CeedBasis basis) { 542 int ierr; 543 Ceed ceed; 544 ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 545 546 CeedBasisNonTensor_Hip *data; 547 ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr); 548 549 CeedChk_Hip(ceed, hipModuleUnload(data->module)); 550 551 ierr = hipFree(data->d_qweight); CeedChk_Hip(ceed, ierr); 552 ierr = hipFree(data->d_interp); CeedChk_Hip(ceed, ierr); 553 ierr = hipFree(data->d_grad); CeedChk_Hip(ceed, ierr); 554 555 ierr = CeedFree(&data); CeedChkBackend(ierr); 556 return CEED_ERROR_SUCCESS; 557 } 558 559 //------------------------------------------------------------------------------ 560 // Create tensor 561 //------------------------------------------------------------------------------ 562 int CeedBasisCreateTensorH1_Hip(CeedInt dim, CeedInt P1d, CeedInt Q1d, 563 const CeedScalar *interp1d, 564 const CeedScalar *grad1d, 565 const CeedScalar *qref1d, 566 const CeedScalar *qweight1d, 567 CeedBasis basis) { 568 int ierr; 569 Ceed ceed; 570 ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 571 CeedBasis_Hip *data; 572 ierr = CeedCalloc(1, &data); CeedChkBackend(ierr); 573 574 // Copy data to GPU 575 const CeedInt qBytes = Q1d * sizeof(CeedScalar); 576 ierr = hipMalloc((void **)&data->d_qweight1d, qBytes); CeedChk_Hip(ceed,ierr); 577 ierr = hipMemcpy(data->d_qweight1d, qweight1d, qBytes, 578 hipMemcpyHostToDevice); CeedChk_Hip(ceed,ierr); 579 580 const CeedInt iBytes = qBytes * P1d; 581 ierr = hipMalloc((void **)&data->d_interp1d, iBytes); CeedChk_Hip(ceed,ierr); 582 ierr = hipMemcpy(data->d_interp1d, interp1d, iBytes, 583 hipMemcpyHostToDevice); CeedChk_Hip(ceed,ierr); 584 585 ierr = hipMalloc((void **)&data->d_grad1d, iBytes); CeedChk_Hip(ceed,ierr); 586 ierr = hipMemcpy(data->d_grad1d, grad1d, iBytes, 587 hipMemcpyHostToDevice); CeedChk_Hip(ceed,ierr); 588 589 // Complie basis kernels 590 CeedInt ncomp; 591 ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChkBackend(ierr); 592 ierr = CeedCompileHip(ceed, basiskernels, &data->module, 7, 593 "BASIS_Q1D", Q1d, 594 "BASIS_P1D", P1d, 595 "BASIS_BUF_LEN", ncomp * CeedIntPow(Q1d > P1d ? 596 Q1d : P1d, dim), 597 "BASIS_DIM", dim, 598 "BASIS_NCOMP", ncomp, 599 "BASIS_ELEMSIZE", CeedIntPow(P1d, dim), 600 "BASIS_NQPT", CeedIntPow(Q1d, dim) 601 ); CeedChkBackend(ierr); 602 ierr = CeedGetKernelHip(ceed, data->module, "interp", &data->interp); 603 CeedChkBackend(ierr); 604 ierr = CeedGetKernelHip(ceed, data->module, "grad", &data->grad); 605 CeedChkBackend(ierr); 606 ierr = CeedGetKernelHip(ceed, data->module, "weight", &data->weight); 607 CeedChkBackend(ierr); 608 ierr = CeedBasisSetData(basis, data); CeedChkBackend(ierr); 609 610 ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply", 611 CeedBasisApply_Hip); CeedChkBackend(ierr); 612 ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", 613 CeedBasisDestroy_Hip); CeedChkBackend(ierr); 614 return CEED_ERROR_SUCCESS; 615 } 616 617 //------------------------------------------------------------------------------ 618 // Create non-tensor 619 //------------------------------------------------------------------------------ 620 int CeedBasisCreateH1_Hip(CeedElemTopology topo, CeedInt dim, CeedInt nnodes, 621 CeedInt nqpts, const CeedScalar *interp, 622 const CeedScalar *grad, const CeedScalar *qref, 623 const CeedScalar *qweight, CeedBasis basis) { 624 int ierr; 625 Ceed ceed; 626 ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 627 CeedBasisNonTensor_Hip *data; 628 ierr = CeedCalloc(1, &data); CeedChkBackend(ierr); 629 630 // Copy basis data to GPU 631 const CeedInt qBytes = nqpts * sizeof(CeedScalar); 632 ierr = hipMalloc((void **)&data->d_qweight, qBytes); CeedChk_Hip(ceed, ierr); 633 ierr = hipMemcpy(data->d_qweight, qweight, qBytes, 634 hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); 635 636 const CeedInt iBytes = qBytes * nnodes; 637 ierr = hipMalloc((void **)&data->d_interp, iBytes); CeedChk_Hip(ceed, ierr); 638 ierr = hipMemcpy(data->d_interp, interp, iBytes, 639 hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); 640 641 const CeedInt gBytes = qBytes * nnodes * dim; 642 ierr = hipMalloc((void **)&data->d_grad, gBytes); CeedChk_Hip(ceed, ierr); 643 ierr = hipMemcpy(data->d_grad, grad, gBytes, 644 hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); 645 646 // Compile basis kernels 647 CeedInt ncomp; 648 ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChkBackend(ierr); 649 ierr = CeedCompileHip(ceed, kernelsNonTensorRef, &data->module, 4, 650 "Q", nqpts, 651 "P", nnodes, 652 "BASIS_DIM", dim, 653 "BASIS_NCOMP", ncomp 654 ); CeedChk_Hip(ceed, ierr); 655 ierr = CeedGetKernelHip(ceed, data->module, "interp", &data->interp); 656 CeedChk_Hip(ceed, ierr); 657 ierr = CeedGetKernelHip(ceed, data->module, "grad", &data->grad); 658 CeedChk_Hip(ceed, ierr); 659 ierr = CeedGetKernelHip(ceed, data->module, "weight", &data->weight); 660 CeedChk_Hip(ceed, ierr); 661 662 ierr = CeedBasisSetData(basis, data); CeedChkBackend(ierr); 663 664 // Register backend functions 665 ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply", 666 CeedBasisApplyNonTensor_Hip); CeedChkBackend(ierr); 667 ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", 668 CeedBasisDestroyNonTensor_Hip); CeedChkBackend(ierr); 669 return CEED_ERROR_SUCCESS; 670 } 671 //------------------------------------------------------------------------------ 672