13d8e8822SJeremy L Thompson // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 23d8e8822SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 37f5b9731SStan Tomov // 43d8e8822SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 57f5b9731SStan Tomov // 63d8e8822SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 77f5b9731SStan Tomov 8*49aac155SJeremy L Thompson #include <ceed.h> 9ec3da8bcSJed Brown #include <ceed/backend.h> 10f6af633fSnbeams #include <ceed/jit-tools.h> 11f6af633fSnbeams #include <string.h> 122b730f8bSJeremy L Thompson 137f5b9731SStan Tomov #include "ceed-magma.h" 14e5f091ebSnbeams #ifdef CEED_MAGMA_USE_HIP 15f6af633fSnbeams #include "../hip/ceed-hip-common.h" 16f6af633fSnbeams #include "../hip/ceed-hip-compile.h" 17f6af633fSnbeams #else 18f6af633fSnbeams #include "../cuda/ceed-cuda-common.h" 19f6af633fSnbeams #include "../cuda/ceed-cuda-compile.h" 20f6af633fSnbeams #endif 217f5b9731SStan Tomov 227f5b9731SStan Tomov #ifdef __cplusplus 237f5b9731SStan Tomov CEED_INTERN "C" 247f5b9731SStan Tomov #endif 252b730f8bSJeremy L Thompson int 262b730f8bSJeremy L Thompson CeedBasisApply_Magma(CeedBasis basis, CeedInt nelem, CeedTransposeMode tmode, CeedEvalMode emode, CeedVector U, CeedVector V) { 277f5b9731SStan Tomov Ceed ceed; 282b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 29e0582403Sabdelfattah83 CeedInt dim, ncomp, ndof; 302b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 312b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetNumComponents(basis, &ncomp)); 322b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetNumNodes(basis, &ndof)); 33e0582403Sabdelfattah83 34e0582403Sabdelfattah83 Ceed_Magma *data; 352b730f8bSJeremy L Thompson CeedCallBackend(CeedGetData(ceed, &data)); 36e0582403Sabdelfattah83 377f5b9731SStan Tomov const CeedScalar *u; 387f5b9731SStan Tomov CeedScalar *v; 39868539c2SNatalie Beams if (emode != CEED_EVAL_WEIGHT) { 402b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &u)); 417f5b9731SStan Tomov } else if (emode != CEED_EVAL_WEIGHT) { 427f5b9731SStan Tomov // LCOV_EXCL_START 432b730f8bSJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 447f5b9731SStan Tomov // LCOV_EXCL_STOP 457f5b9731SStan Tomov } 462b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorGetArrayWrite(V, CEED_MEM_DEVICE, &v)); 477f5b9731SStan Tomov 487f5b9731SStan Tomov CeedBasis_Magma *impl; 492b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &impl)); 507f5b9731SStan Tomov 517f5b9731SStan Tomov CeedInt P1d, Q1d; 522b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P1d)); 532b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q1d)); 547f5b9731SStan Tomov 552b730f8bSJeremy L Thompson CeedDebug256(ceed, 4, "[CeedBasisApply_Magma] vsize=%" CeedInt_FMT ", comp = %" CeedInt_FMT, ncomp * CeedIntPow(P1d, dim), ncomp); 567f5b9731SStan Tomov 577f5b9731SStan Tomov if (tmode == CEED_TRANSPOSE) { 581f9221feSJeremy L Thompson CeedSize length; 592b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorGetLength(V, &length)); 6080a9ef05SNatalie Beams if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) { 612b730f8bSJeremy L Thompson magmablas_slaset(MagmaFull, length, 1, 0., 0., (float *)v, length, data->queue); 6280a9ef05SNatalie Beams } else { 632b730f8bSJeremy L Thompson magmablas_dlaset(MagmaFull, length, 1, 0., 0., (double *)v, length, data->queue); 6480a9ef05SNatalie Beams } 65e0582403Sabdelfattah83 ceed_magma_queue_sync(data->queue); 667f5b9731SStan Tomov } 67f6af633fSnbeams 683513a710Sjeremylt switch (emode) { 693513a710Sjeremylt case CEED_EVAL_INTERP: { 707f5b9731SStan Tomov CeedInt P = P1d, Q = Q1d; 717f5b9731SStan Tomov if (tmode == CEED_TRANSPOSE) { 722b730f8bSJeremy L Thompson P = Q1d; 732b730f8bSJeremy L Thompson Q = P1d; 747f5b9731SStan Tomov } 757f5b9731SStan Tomov 767f5b9731SStan Tomov // Define element sizes for dofs/quad 777f5b9731SStan Tomov CeedInt elquadsize = CeedIntPow(Q1d, dim); 787f5b9731SStan Tomov CeedInt eldofssize = CeedIntPow(P1d, dim); 797f5b9731SStan Tomov 807f5b9731SStan Tomov // E-vector ordering -------------- Q-vector ordering 81868539c2SNatalie Beams // component component 82868539c2SNatalie Beams // elem elem 837f5b9731SStan Tomov // node node 847f5b9731SStan Tomov 857f5b9731SStan Tomov // --- Define strides for NOTRANSPOSE mode: --- 867f5b9731SStan Tomov // Input (u) is E-vector, output (v) is Q-vector 877f5b9731SStan Tomov 887f5b9731SStan Tomov // Element strides 89868539c2SNatalie Beams CeedInt u_elstride = eldofssize; 907f5b9731SStan Tomov CeedInt v_elstride = elquadsize; 917f5b9731SStan Tomov // Component strides 92868539c2SNatalie Beams CeedInt u_compstride = nelem * eldofssize; 937f5b9731SStan Tomov CeedInt v_compstride = nelem * elquadsize; 947f5b9731SStan Tomov 957f5b9731SStan Tomov // --- Swap strides for TRANSPOSE mode: --- 967f5b9731SStan Tomov if (tmode == CEED_TRANSPOSE) { 977f5b9731SStan Tomov // Input (u) is Q-vector, output (v) is E-vector 987f5b9731SStan Tomov // Element strides 99868539c2SNatalie Beams v_elstride = eldofssize; 1007f5b9731SStan Tomov u_elstride = elquadsize; 1017f5b9731SStan Tomov // Component strides 102868539c2SNatalie Beams v_compstride = nelem * eldofssize; 1037f5b9731SStan Tomov u_compstride = nelem * elquadsize; 1047f5b9731SStan Tomov } 1057f5b9731SStan Tomov 106f6af633fSnbeams CeedInt nthreads = 1; 107f6af633fSnbeams CeedInt ntcol = 1; 108f6af633fSnbeams CeedInt shmem = 0; 109f6af633fSnbeams CeedInt maxPQ = CeedIntMax(P, Q); 110f6af633fSnbeams 111f6af633fSnbeams switch (dim) { 112f6af633fSnbeams case 1: 113f6af633fSnbeams nthreads = maxPQ; 114f6af633fSnbeams ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_1D); 115f6af633fSnbeams shmem += sizeof(CeedScalar) * ntcol * (ncomp * (1 * P + 1 * Q)); 116f6af633fSnbeams shmem += sizeof(CeedScalar) * (P * Q); 117f6af633fSnbeams break; 118f6af633fSnbeams case 2: 119f6af633fSnbeams nthreads = maxPQ; 120f6af633fSnbeams ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_2D); 121f6af633fSnbeams shmem += P * Q * sizeof(CeedScalar); // for sT 1222b730f8bSJeremy L Thompson shmem += ntcol * (P * maxPQ * sizeof(CeedScalar)); // for reforming rU we need PxP, and for the intermediate output we need PxQ 123f6af633fSnbeams break; 124f6af633fSnbeams case 3: 125f6af633fSnbeams nthreads = maxPQ * maxPQ; 126f6af633fSnbeams ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_3D); 127f6af633fSnbeams shmem += sizeof(CeedScalar) * (P * Q); // for sT 1282b730f8bSJeremy L Thompson shmem += sizeof(CeedScalar) * ntcol * 1292b730f8bSJeremy L Thompson (CeedIntMax(P * P * maxPQ, 130f6af633fSnbeams P * Q * Q)); // rU needs P^2xP, the intermediate output needs max(P^2xQ,PQ^2) 131f6af633fSnbeams } 132f6af633fSnbeams CeedInt grid = (nelem + ntcol - 1) / ntcol; 1332b730f8bSJeremy L Thompson void *args[] = {&impl->dinterp1d, &u, &u_elstride, &u_compstride, &v, &v_elstride, &v_compstride, &nelem}; 134f6af633fSnbeams 135f6af633fSnbeams if (tmode == CEED_TRANSPOSE) { 1362b730f8bSJeremy L Thompson CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_interp_tr, grid, nthreads, ntcol, 1, shmem, args)); 137f6af633fSnbeams } else { 1382b730f8bSJeremy L Thompson CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_interp, grid, nthreads, ntcol, 1, shmem, args)); 139f6af633fSnbeams } 1402b730f8bSJeremy L Thompson } break; 1413513a710Sjeremylt case CEED_EVAL_GRAD: { 1427f5b9731SStan Tomov CeedInt P = P1d, Q = Q1d; 1437f5b9731SStan Tomov // In CEED_NOTRANSPOSE mode: 1447f5b9731SStan Tomov // u is (P^dim x nc), column-major layout (nc = ncomp) 1457f5b9731SStan Tomov // v is (Q^dim x nc x dim), column-major layout (nc = ncomp) 1467f5b9731SStan Tomov // In CEED_TRANSPOSE mode, the sizes of u and v are switched. 1477f5b9731SStan Tomov if (tmode == CEED_TRANSPOSE) { 1487f5b9731SStan Tomov P = Q1d, Q = P1d; 1497f5b9731SStan Tomov } 1507f5b9731SStan Tomov 1517f5b9731SStan Tomov // Define element sizes for dofs/quad 1527f5b9731SStan Tomov CeedInt elquadsize = CeedIntPow(Q1d, dim); 1537f5b9731SStan Tomov CeedInt eldofssize = CeedIntPow(P1d, dim); 1547f5b9731SStan Tomov 1557f5b9731SStan Tomov // E-vector ordering -------------- Q-vector ordering 1567f5b9731SStan Tomov // dim 157868539c2SNatalie Beams // component component 158868539c2SNatalie Beams // elem elem 1597f5b9731SStan Tomov // node node 1607f5b9731SStan Tomov 1617f5b9731SStan Tomov // --- Define strides for NOTRANSPOSE mode: --- 1627f5b9731SStan Tomov // Input (u) is E-vector, output (v) is Q-vector 1637f5b9731SStan Tomov 1647f5b9731SStan Tomov // Element strides 165868539c2SNatalie Beams CeedInt u_elstride = eldofssize; 1667f5b9731SStan Tomov CeedInt v_elstride = elquadsize; 1677f5b9731SStan Tomov // Component strides 168868539c2SNatalie Beams CeedInt u_compstride = nelem * eldofssize; 1697f5b9731SStan Tomov CeedInt v_compstride = nelem * elquadsize; 1707f5b9731SStan Tomov // Dimension strides 1717f5b9731SStan Tomov CeedInt u_dimstride = 0; 1727f5b9731SStan Tomov CeedInt v_dimstride = nelem * elquadsize * ncomp; 1737f5b9731SStan Tomov 1747f5b9731SStan Tomov // --- Swap strides for TRANSPOSE mode: --- 1757f5b9731SStan Tomov if (tmode == CEED_TRANSPOSE) { 1767f5b9731SStan Tomov // Input (u) is Q-vector, output (v) is E-vector 1777f5b9731SStan Tomov // Element strides 178868539c2SNatalie Beams v_elstride = eldofssize; 1797f5b9731SStan Tomov u_elstride = elquadsize; 1807f5b9731SStan Tomov // Component strides 181868539c2SNatalie Beams v_compstride = nelem * eldofssize; 1827f5b9731SStan Tomov u_compstride = nelem * elquadsize; 1837f5b9731SStan Tomov // Dimension strides 1847f5b9731SStan Tomov v_dimstride = 0; 1857f5b9731SStan Tomov u_dimstride = nelem * elquadsize * ncomp; 1867f5b9731SStan Tomov } 1877f5b9731SStan Tomov 188f6af633fSnbeams CeedInt nthreads = 1; 189f6af633fSnbeams CeedInt ntcol = 1; 190f6af633fSnbeams CeedInt shmem = 0; 191f6af633fSnbeams CeedInt maxPQ = CeedIntMax(P, Q); 192f6af633fSnbeams 193f6af633fSnbeams switch (dim) { 194f6af633fSnbeams case 1: 195f6af633fSnbeams nthreads = maxPQ; 196f6af633fSnbeams ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_1D); 197f6af633fSnbeams shmem += sizeof(CeedScalar) * ntcol * (ncomp * (1 * P + 1 * Q)); 198f6af633fSnbeams shmem += sizeof(CeedScalar) * (P * Q); 199f6af633fSnbeams break; 200f6af633fSnbeams case 2: 201f6af633fSnbeams nthreads = maxPQ; 202f6af633fSnbeams ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_2D); 203f6af633fSnbeams shmem += sizeof(CeedScalar) * 2 * P * Q; // for sTinterp and sTgrad 2042b730f8bSJeremy L Thompson shmem += sizeof(CeedScalar) * ntcol * (P * maxPQ); // for reforming rU we need PxP, and for the intermediate output we need PxQ 205f6af633fSnbeams break; 206f6af633fSnbeams case 3: 207f6af633fSnbeams nthreads = maxPQ * maxPQ; 208f6af633fSnbeams ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_3D); 209f6af633fSnbeams shmem += sizeof(CeedScalar) * 2 * P * Q; // for sTinterp and sTgrad 2102b730f8bSJeremy L Thompson shmem += sizeof(CeedScalar) * ntcol * 2112b730f8bSJeremy L Thompson CeedIntMax(P * P * P, 2122b730f8bSJeremy L Thompson (P * P * Q) + (P * Q * Q)); // rU needs P^2xP, the intermediate outputs need (P^2.Q + P.Q^2) 213f6af633fSnbeams } 214f6af633fSnbeams CeedInt grid = (nelem + ntcol - 1) / ntcol; 2152b730f8bSJeremy L Thompson void *args[] = {&impl->dinterp1d, &impl->dgrad1d, &u, &u_elstride, &u_compstride, &u_dimstride, &v, 2162b730f8bSJeremy L Thompson &v_elstride, &v_compstride, &v_dimstride, &nelem}; 217f6af633fSnbeams 218f6af633fSnbeams if (tmode == CEED_TRANSPOSE) { 2192b730f8bSJeremy L Thompson CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_grad_tr, grid, nthreads, ntcol, 1, shmem, args)); 220f6af633fSnbeams } else { 2212b730f8bSJeremy L Thompson CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_grad, grid, nthreads, ntcol, 1, shmem, args)); 222f6af633fSnbeams } 2232b730f8bSJeremy L Thompson } break; 2243513a710Sjeremylt case CEED_EVAL_WEIGHT: { 2257f5b9731SStan Tomov if (tmode == CEED_TRANSPOSE) 2267f5b9731SStan Tomov // LCOV_EXCL_START 2272b730f8bSJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE"); 2287f5b9731SStan Tomov // LCOV_EXCL_STOP 2297f5b9731SStan Tomov CeedInt Q = Q1d; 230f6af633fSnbeams CeedInt eldofssize = CeedIntPow(Q, dim); 231f6af633fSnbeams CeedInt nthreads = 1; 232f6af633fSnbeams CeedInt ntcol = 1; 233f6af633fSnbeams CeedInt shmem = 0; 234f6af633fSnbeams 235f6af633fSnbeams switch (dim) { 236f6af633fSnbeams case 1: 237f6af633fSnbeams nthreads = Q; 238f6af633fSnbeams ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_1D); 239f6af633fSnbeams shmem += sizeof(CeedScalar) * Q; // for dqweight1d 240f6af633fSnbeams shmem += sizeof(CeedScalar) * ntcol * Q; // for output 241f6af633fSnbeams break; 242f6af633fSnbeams case 2: 243f6af633fSnbeams nthreads = Q; 244f6af633fSnbeams ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_2D); 245f6af633fSnbeams shmem += sizeof(CeedScalar) * Q; // for dqweight1d 246f6af633fSnbeams break; 247f6af633fSnbeams case 3: 248f6af633fSnbeams nthreads = Q * Q; 249f6af633fSnbeams ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_3D); 250f6af633fSnbeams shmem += sizeof(CeedScalar) * Q; // for dqweight1d 251f6af633fSnbeams } 252f6af633fSnbeams CeedInt grid = (nelem + ntcol - 1) / ntcol; 253f6af633fSnbeams void *args[] = {&impl->dqweight1d, &v, &eldofssize, &nelem}; 254f6af633fSnbeams 2552b730f8bSJeremy L Thompson CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_weight, grid, nthreads, ntcol, 1, shmem, args)); 2562b730f8bSJeremy L Thompson } break; 2573513a710Sjeremylt // LCOV_EXCL_START 2583513a710Sjeremylt case CEED_EVAL_DIV: 259e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); 2603513a710Sjeremylt case CEED_EVAL_CURL: 261e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); 2623513a710Sjeremylt case CEED_EVAL_NONE: 2632b730f8bSJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context"); 2643513a710Sjeremylt // LCOV_EXCL_STOP 2653513a710Sjeremylt } 2667f5b9731SStan Tomov 267e0582403Sabdelfattah83 // must sync to ensure completeness 268e0582403Sabdelfattah83 ceed_magma_queue_sync(data->queue); 269e0582403Sabdelfattah83 2707f5b9731SStan Tomov if (emode != CEED_EVAL_WEIGHT) { 2712b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorRestoreArrayRead(U, &u)); 2727f5b9731SStan Tomov } 2732b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorRestoreArray(V, &v)); 274e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 2757f5b9731SStan Tomov } 2767f5b9731SStan Tomov 2777f5b9731SStan Tomov #ifdef __cplusplus 2787f5b9731SStan Tomov CEED_INTERN "C" 2797f5b9731SStan Tomov #endif 2802b730f8bSJeremy L Thompson int 281023b8a51Sabdelfattah83 CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt nelem, CeedTransposeMode tmode, CeedEvalMode emode, CeedVector U, CeedVector V) { 282868539c2SNatalie Beams Ceed ceed; 2832b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 284e0582403Sabdelfattah83 285e0582403Sabdelfattah83 Ceed_Magma *data; 2862b730f8bSJeremy L Thompson CeedCallBackend(CeedGetData(ceed, &data)); 287e0582403Sabdelfattah83 288023b8a51Sabdelfattah83 magma_int_t arch = magma_getdevice_arch(); 289023b8a51Sabdelfattah83 290868539c2SNatalie Beams CeedInt dim, ncomp, ndof, nqpt; 2912b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 2922b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetNumComponents(basis, &ncomp)); 2932b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetNumNodes(basis, &ndof)); 2942b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &nqpt)); 295868539c2SNatalie Beams const CeedScalar *du; 296868539c2SNatalie Beams CeedScalar *dv; 297868539c2SNatalie Beams if (emode != CEED_EVAL_WEIGHT) { 2982b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &du)); 299868539c2SNatalie Beams } else if (emode != CEED_EVAL_WEIGHT) { 300868539c2SNatalie Beams // LCOV_EXCL_START 3012b730f8bSJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 302868539c2SNatalie Beams // LCOV_EXCL_STOP 303868539c2SNatalie Beams } 3042b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorGetArrayWrite(V, CEED_MEM_DEVICE, &dv)); 305868539c2SNatalie Beams 306868539c2SNatalie Beams CeedBasisNonTensor_Magma *impl; 3072b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &impl)); 308868539c2SNatalie Beams 3092b730f8bSJeremy L Thompson CeedDebug256(ceed, 4, "[CeedBasisApplyNonTensor_Magma] vsize=%" CeedInt_FMT ", comp = %" CeedInt_FMT, ncomp * ndof, ncomp); 310868539c2SNatalie Beams 311868539c2SNatalie Beams if (tmode == CEED_TRANSPOSE) { 3121f9221feSJeremy L Thompson CeedSize length; 3132b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorGetLength(V, &length)); 31480a9ef05SNatalie Beams if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) { 3152b730f8bSJeremy L Thompson magmablas_slaset(MagmaFull, length, 1, 0., 0., (float *)dv, length, data->queue); 31680a9ef05SNatalie Beams } else { 3172b730f8bSJeremy L Thompson magmablas_dlaset(MagmaFull, length, 1, 0., 0., (double *)dv, length, data->queue); 31880a9ef05SNatalie Beams } 319e0582403Sabdelfattah83 ceed_magma_queue_sync(data->queue); 320868539c2SNatalie Beams } 32180a9ef05SNatalie Beams 322023b8a51Sabdelfattah83 CeedInt P = ndof, Q = nqpt, N = nelem * ncomp; 323023b8a51Sabdelfattah83 CeedInt NB = 1; 324023b8a51Sabdelfattah83 CeedMagmaFunction *interp, *grad; 325868539c2SNatalie Beams 326023b8a51Sabdelfattah83 CeedInt Narray[MAGMA_NONTENSOR_KERNEL_INSTANCES] = {MAGMA_NONTENSOR_N_VALUES}; 327023b8a51Sabdelfattah83 CeedInt iN = 0; 328023b8a51Sabdelfattah83 CeedInt diff = abs(Narray[iN] - N); 329023b8a51Sabdelfattah83 for (CeedInt in = iN + 1; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) { 330023b8a51Sabdelfattah83 CeedInt idiff = abs(Narray[in] - N); 331023b8a51Sabdelfattah83 if (idiff < diff) { 332023b8a51Sabdelfattah83 iN = in; 333023b8a51Sabdelfattah83 diff = idiff; 334868539c2SNatalie Beams } 33580a9ef05SNatalie Beams } 33680a9ef05SNatalie Beams 337023b8a51Sabdelfattah83 NB = nontensor_rtc_get_nb(arch, 'd', emode, tmode, P, Narray[iN], Q); 338023b8a51Sabdelfattah83 interp = (tmode == CEED_TRANSPOSE) ? &impl->magma_interp_tr_nontensor[iN] : &impl->magma_interp_nontensor[iN]; 339023b8a51Sabdelfattah83 grad = (tmode == CEED_TRANSPOSE) ? &impl->magma_grad_tr_nontensor[iN] : &impl->magma_grad_nontensor[iN]; 34080a9ef05SNatalie Beams 34180a9ef05SNatalie Beams switch (emode) { 34280a9ef05SNatalie Beams case CEED_EVAL_INTERP: { 34380a9ef05SNatalie Beams CeedInt P = ndof, Q = nqpt; 344023b8a51Sabdelfattah83 if (P < MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P && Q < MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q) { 345023b8a51Sabdelfattah83 CeedInt M = (tmode == CEED_TRANSPOSE) ? P : Q; 346023b8a51Sabdelfattah83 CeedInt K = (tmode == CEED_TRANSPOSE) ? Q : P; 347023b8a51Sabdelfattah83 CeedInt ntcol = MAGMA_NONTENSOR_BASIS_NTCOL(M); 348023b8a51Sabdelfattah83 CeedInt shmem = 0, shmemA = 0, shmemB = 0; 349023b8a51Sabdelfattah83 shmemB += ntcol * K * NB * sizeof(CeedScalar); 350023b8a51Sabdelfattah83 shmemA += (tmode == CEED_TRANSPOSE) ? 0 : K * M * sizeof(CeedScalar); 351023b8a51Sabdelfattah83 shmem = (tmode == CEED_TRANSPOSE) ? (shmemA + shmemB) : CeedIntMax(shmemA, shmemB); 352023b8a51Sabdelfattah83 353023b8a51Sabdelfattah83 CeedInt grid = MAGMA_CEILDIV(MAGMA_CEILDIV(N, NB), ntcol); 354023b8a51Sabdelfattah83 magma_trans_t transA = (tmode == CEED_TRANSPOSE) ? MagmaNoTrans : MagmaTrans; 355023b8a51Sabdelfattah83 magma_trans_t transB = MagmaNoTrans; 356023b8a51Sabdelfattah83 CeedScalar alpha = 1.0, beta = 0.0; 357023b8a51Sabdelfattah83 358023b8a51Sabdelfattah83 void *args[] = {&transA, &transB, &N, &alpha, &impl->dinterp, &P, &du, &K, &beta, &dv, &M}; 359023b8a51Sabdelfattah83 CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, *interp, grid, M, ntcol, 1, shmem, args)); 360023b8a51Sabdelfattah83 } else { 36180a9ef05SNatalie Beams if (tmode == CEED_TRANSPOSE) 362023b8a51Sabdelfattah83 magma_gemm_nontensor(MagmaNoTrans, MagmaNoTrans, P, nelem * ncomp, Q, 1.0, impl->dinterp, P, du, Q, 0.0, dv, P, data->queue); 363023b8a51Sabdelfattah83 else magma_gemm_nontensor(MagmaTrans, MagmaNoTrans, Q, nelem * ncomp, P, 1.0, impl->dinterp, P, du, P, 0.0, dv, Q, data->queue); 364023b8a51Sabdelfattah83 } 3652b730f8bSJeremy L Thompson } break; 36680a9ef05SNatalie Beams 36780a9ef05SNatalie Beams case CEED_EVAL_GRAD: { 36880a9ef05SNatalie Beams CeedInt P = ndof, Q = nqpt; 369023b8a51Sabdelfattah83 if (P < MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P && Q < MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q) { 370023b8a51Sabdelfattah83 CeedInt M = (tmode == CEED_TRANSPOSE) ? P : Q; 371023b8a51Sabdelfattah83 CeedInt K = (tmode == CEED_TRANSPOSE) ? Q : P; 372023b8a51Sabdelfattah83 CeedInt ntcol = MAGMA_NONTENSOR_BASIS_NTCOL(M); 373023b8a51Sabdelfattah83 CeedInt shmem = 0, shmemA = 0, shmemB = 0; 374023b8a51Sabdelfattah83 shmemB += ntcol * K * NB * sizeof(CeedScalar); 375023b8a51Sabdelfattah83 shmemA += (tmode == CEED_TRANSPOSE) ? 0 : K * M * sizeof(CeedScalar); 376023b8a51Sabdelfattah83 shmem = shmemA + shmemB; 377023b8a51Sabdelfattah83 378023b8a51Sabdelfattah83 CeedInt grid = MAGMA_CEILDIV(MAGMA_CEILDIV(N, NB), ntcol); 379023b8a51Sabdelfattah83 magma_trans_t transA = (tmode == CEED_TRANSPOSE) ? MagmaNoTrans : MagmaTrans; 380023b8a51Sabdelfattah83 magma_trans_t transB = MagmaNoTrans; 381023b8a51Sabdelfattah83 382023b8a51Sabdelfattah83 void *args[] = {&transA, &transB, &N, &impl->dgrad, &P, &du, &K, &dv, &M}; 383023b8a51Sabdelfattah83 CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, *grad, grid, M, ntcol, 1, shmem, args)); 384023b8a51Sabdelfattah83 } else { 38580a9ef05SNatalie Beams if (tmode == CEED_TRANSPOSE) { 38680a9ef05SNatalie Beams CeedScalar beta = 0.0; 38780a9ef05SNatalie Beams for (int d = 0; d < dim; d++) { 3882b730f8bSJeremy L Thompson if (d > 0) beta = 1.0; 389023b8a51Sabdelfattah83 magma_gemm_nontensor(MagmaNoTrans, MagmaNoTrans, P, nelem * ncomp, Q, 1.0, impl->dgrad + d * P * Q, P, du + d * nelem * ncomp * Q, Q, 390023b8a51Sabdelfattah83 beta, dv, P, data->queue); 39180a9ef05SNatalie Beams } 39280a9ef05SNatalie Beams } else { 39380a9ef05SNatalie Beams for (int d = 0; d < dim; d++) 394023b8a51Sabdelfattah83 magma_gemm_nontensor(MagmaTrans, MagmaNoTrans, Q, nelem * ncomp, P, 1.0, impl->dgrad + d * P * Q, P, du, P, 0.0, 395023b8a51Sabdelfattah83 dv + d * nelem * ncomp * Q, Q, data->queue); 396023b8a51Sabdelfattah83 } 397868539c2SNatalie Beams } 3982b730f8bSJeremy L Thompson } break; 399868539c2SNatalie Beams 400868539c2SNatalie Beams case CEED_EVAL_WEIGHT: { 401868539c2SNatalie Beams if (tmode == CEED_TRANSPOSE) 402868539c2SNatalie Beams // LCOV_EXCL_START 4032b730f8bSJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE"); 404868539c2SNatalie Beams // LCOV_EXCL_STOP 405868539c2SNatalie Beams 406868539c2SNatalie Beams int elemsPerBlock = 1; // basis->Q1d < 7 ? optElems[basis->Q1d] : 1; 4072b730f8bSJeremy L Thompson int grid = nelem / elemsPerBlock + ((nelem / elemsPerBlock * elemsPerBlock < nelem) ? 1 : 0); 4082b730f8bSJeremy L Thompson magma_weight_nontensor(grid, nqpt, nelem, nqpt, impl->dqweight, dv, data->queue); 4092b730f8bSJeremy L Thompson } break; 410868539c2SNatalie Beams 411868539c2SNatalie Beams // LCOV_EXCL_START 412868539c2SNatalie Beams case CEED_EVAL_DIV: 413e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); 414868539c2SNatalie Beams case CEED_EVAL_CURL: 415e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); 416868539c2SNatalie Beams case CEED_EVAL_NONE: 4172b730f8bSJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context"); 418868539c2SNatalie Beams // LCOV_EXCL_STOP 419868539c2SNatalie Beams } 420868539c2SNatalie Beams 421e0582403Sabdelfattah83 // must sync to ensure completeness 422e0582403Sabdelfattah83 ceed_magma_queue_sync(data->queue); 423e0582403Sabdelfattah83 424868539c2SNatalie Beams if (emode != CEED_EVAL_WEIGHT) { 4252b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorRestoreArrayRead(U, &du)); 426868539c2SNatalie Beams } 4272b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorRestoreArray(V, &dv)); 428e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 429868539c2SNatalie Beams } 430868539c2SNatalie Beams 431868539c2SNatalie Beams #ifdef __cplusplus 432868539c2SNatalie Beams CEED_INTERN "C" 433868539c2SNatalie Beams #endif 4342b730f8bSJeremy L Thompson int 4352b730f8bSJeremy L Thompson CeedBasisDestroy_Magma(CeedBasis basis) { 4367f5b9731SStan Tomov CeedBasis_Magma *impl; 4372b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &impl)); 4387f5b9731SStan Tomov 4392b730f8bSJeremy L Thompson CeedCallBackend(magma_free(impl->dqref1d)); 4402b730f8bSJeremy L Thompson CeedCallBackend(magma_free(impl->dinterp1d)); 4412b730f8bSJeremy L Thompson CeedCallBackend(magma_free(impl->dgrad1d)); 4422b730f8bSJeremy L Thompson CeedCallBackend(magma_free(impl->dqweight1d)); 443f6af633fSnbeams Ceed ceed; 4442b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 445e5f091ebSnbeams #ifdef CEED_MAGMA_USE_HIP 4462b730f8bSJeremy L Thompson CeedCallHip(ceed, hipModuleUnload(impl->module)); 447f6af633fSnbeams #else 4482b730f8bSJeremy L Thompson CeedCallCuda(ceed, cuModuleUnload(impl->module)); 449f6af633fSnbeams #endif 4507f5b9731SStan Tomov 4512b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&impl)); 4527f5b9731SStan Tomov 453e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 4547f5b9731SStan Tomov } 4557f5b9731SStan Tomov 4567f5b9731SStan Tomov #ifdef __cplusplus 4577f5b9731SStan Tomov CEED_INTERN "C" 4587f5b9731SStan Tomov #endif 4592b730f8bSJeremy L Thompson int 4602b730f8bSJeremy L Thompson CeedBasisDestroyNonTensor_Magma(CeedBasis basis) { 461868539c2SNatalie Beams CeedBasisNonTensor_Magma *impl; 4622b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &impl)); 463868539c2SNatalie Beams 4642b730f8bSJeremy L Thompson CeedCallBackend(magma_free(impl->dqref)); 4652b730f8bSJeremy L Thompson CeedCallBackend(magma_free(impl->dinterp)); 4662b730f8bSJeremy L Thompson CeedCallBackend(magma_free(impl->dgrad)); 4672b730f8bSJeremy L Thompson CeedCallBackend(magma_free(impl->dqweight)); 468023b8a51Sabdelfattah83 Ceed ceed; 469023b8a51Sabdelfattah83 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 470023b8a51Sabdelfattah83 #ifdef CEED_MAGMA_USE_HIP 471023b8a51Sabdelfattah83 for (CeedInt in = 0; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) { 472023b8a51Sabdelfattah83 CeedCallHip(ceed, hipModuleUnload(impl->module[in])); 473023b8a51Sabdelfattah83 } 474023b8a51Sabdelfattah83 #else 475023b8a51Sabdelfattah83 for (CeedInt in = 0; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) { 476023b8a51Sabdelfattah83 CeedCallCuda(ceed, cuModuleUnload(impl->module[in])); 477023b8a51Sabdelfattah83 } 478023b8a51Sabdelfattah83 #endif 4792b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&impl)); 480868539c2SNatalie Beams 481e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 482868539c2SNatalie Beams } 483868539c2SNatalie Beams 484868539c2SNatalie Beams #ifdef __cplusplus 485868539c2SNatalie Beams CEED_INTERN "C" 486868539c2SNatalie Beams #endif 4872b730f8bSJeremy L Thompson int 4882b730f8bSJeremy L Thompson CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P1d, CeedInt Q1d, const CeedScalar *interp1d, const CeedScalar *grad1d, 4892b730f8bSJeremy L Thompson const CeedScalar *qref1d, const CeedScalar *qweight1d, CeedBasis basis) { 4907f5b9731SStan Tomov CeedBasis_Magma *impl; 4912b730f8bSJeremy L Thompson CeedCallBackend(CeedCalloc(1, &impl)); 4927f5b9731SStan Tomov Ceed ceed; 4932b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 4947f5b9731SStan Tomov 495c9f8acf2SJeremy L Thompson // Check for supported parameters 496c9f8acf2SJeremy L Thompson CeedInt ncomp = 0; 4972b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetNumComponents(basis, &ncomp)); 498e0582403Sabdelfattah83 Ceed_Magma *data; 4992b730f8bSJeremy L Thompson CeedCallBackend(CeedGetData(ceed, &data)); 500e0582403Sabdelfattah83 501f6af633fSnbeams // Compile kernels 502f6af633fSnbeams char *magma_common_path; 503f6af633fSnbeams char *interp_path, *grad_path, *weight_path; 504f6af633fSnbeams char *basis_kernel_source; 505023b8a51Sabdelfattah83 CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma_common_defs.h", &magma_common_path)); 506f6af633fSnbeams CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source -----\n"); 5072b730f8bSJeremy L Thompson CeedCallBackend(CeedLoadSourceToBuffer(ceed, magma_common_path, &basis_kernel_source)); 508023b8a51Sabdelfattah83 CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma_common_tensor.h", &magma_common_path)); 509023b8a51Sabdelfattah83 CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, magma_common_path, &basis_kernel_source)); 510f6af633fSnbeams char *interp_name_base = "ceed/jit-source/magma/interp"; 511f6af633fSnbeams CeedInt interp_name_len = strlen(interp_name_base) + 6; 512f6af633fSnbeams char interp_name[interp_name_len]; 5132b730f8bSJeremy L Thompson snprintf(interp_name, interp_name_len, "%s-%" CeedInt_FMT "d.h", interp_name_base, dim); 5142b730f8bSJeremy L Thompson CeedCallBackend(CeedGetJitAbsolutePath(ceed, interp_name, &interp_path)); 5152b730f8bSJeremy L Thompson CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, interp_path, &basis_kernel_source)); 516f6af633fSnbeams char *grad_name_base = "ceed/jit-source/magma/grad"; 517f6af633fSnbeams CeedInt grad_name_len = strlen(grad_name_base) + 6; 518f6af633fSnbeams char grad_name[grad_name_len]; 5192b730f8bSJeremy L Thompson snprintf(grad_name, grad_name_len, "%s-%" CeedInt_FMT "d.h", grad_name_base, dim); 5202b730f8bSJeremy L Thompson CeedCallBackend(CeedGetJitAbsolutePath(ceed, grad_name, &grad_path)); 5212b730f8bSJeremy L Thompson CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, grad_path, &basis_kernel_source)); 522f6af633fSnbeams char *weight_name_base = "ceed/jit-source/magma/weight"; 523f6af633fSnbeams CeedInt weight_name_len = strlen(weight_name_base) + 6; 524f6af633fSnbeams char weight_name[weight_name_len]; 5252b730f8bSJeremy L Thompson snprintf(weight_name, weight_name_len, "%s-%" CeedInt_FMT "d.h", weight_name_base, dim); 5262b730f8bSJeremy L Thompson CeedCallBackend(CeedGetJitAbsolutePath(ceed, weight_name, &weight_path)); 5272b730f8bSJeremy L Thompson CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, weight_path, &basis_kernel_source)); 5282b730f8bSJeremy L Thompson CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source Complete! -----\n"); 529f6af633fSnbeams // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip 530f6af633fSnbeams // data 531f6af633fSnbeams Ceed delegate; 5322b730f8bSJeremy L Thompson CeedCallBackend(CeedGetDelegate(ceed, &delegate)); 5332b730f8bSJeremy L Thompson CeedCallBackend(CeedCompileMagma(delegate, basis_kernel_source, &impl->module, 5, "DIM", dim, "NCOMP", ncomp, "P", P1d, "Q", Q1d, "MAXPQ", 5342b730f8bSJeremy L Thompson CeedIntMax(P1d, Q1d))); 535f6af633fSnbeams 536f6af633fSnbeams // Kernel setup 537f6af633fSnbeams switch (dim) { 538f6af633fSnbeams case 1: 5392b730f8bSJeremy L Thompson CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_1d_kernel", &impl->magma_interp)); 5402b730f8bSJeremy L Thompson CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_1d_kernel", &impl->magma_interp_tr)); 5412b730f8bSJeremy L Thompson CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_1d_kernel", &impl->magma_grad)); 5422b730f8bSJeremy L Thompson CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_1d_kernel", &impl->magma_grad_tr)); 5432b730f8bSJeremy L Thompson CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_1d_kernel", &impl->magma_weight)); 544f6af633fSnbeams break; 545f6af633fSnbeams case 2: 5462b730f8bSJeremy L Thompson CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_2d_kernel", &impl->magma_interp)); 5472b730f8bSJeremy L Thompson CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_2d_kernel", &impl->magma_interp_tr)); 5482b730f8bSJeremy L Thompson CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_2d_kernel", &impl->magma_grad)); 5492b730f8bSJeremy L Thompson CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_2d_kernel", &impl->magma_grad_tr)); 5502b730f8bSJeremy L Thompson CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_2d_kernel", &impl->magma_weight)); 551f6af633fSnbeams break; 552f6af633fSnbeams case 3: 5532b730f8bSJeremy L Thompson CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_3d_kernel", &impl->magma_interp)); 5542b730f8bSJeremy L Thompson CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_3d_kernel", &impl->magma_interp_tr)); 5552b730f8bSJeremy L Thompson CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_3d_kernel", &impl->magma_grad)); 5562b730f8bSJeremy L Thompson CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_3d_kernel", &impl->magma_grad_tr)); 5572b730f8bSJeremy L Thompson CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_3d_kernel", &impl->magma_weight)); 558f6af633fSnbeams } 559f6af633fSnbeams 5602b730f8bSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Magma)); 5612b730f8bSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Magma)); 5627f5b9731SStan Tomov 5637f5b9731SStan Tomov // Copy qref1d to the GPU 5642b730f8bSJeremy L Thompson CeedCallBackend(magma_malloc((void **)&impl->dqref1d, Q1d * sizeof(qref1d[0]))); 5652b730f8bSJeremy L Thompson magma_setvector(Q1d, sizeof(qref1d[0]), qref1d, 1, impl->dqref1d, 1, data->queue); 5667f5b9731SStan Tomov 5677f5b9731SStan Tomov // Copy interp1d to the GPU 5682b730f8bSJeremy L Thompson CeedCallBackend(magma_malloc((void **)&impl->dinterp1d, Q1d * P1d * sizeof(interp1d[0]))); 5692b730f8bSJeremy L Thompson magma_setvector(Q1d * P1d, sizeof(interp1d[0]), interp1d, 1, impl->dinterp1d, 1, data->queue); 5707f5b9731SStan Tomov 5717f5b9731SStan Tomov // Copy grad1d to the GPU 5722b730f8bSJeremy L Thompson CeedCallBackend(magma_malloc((void **)&impl->dgrad1d, Q1d * P1d * sizeof(grad1d[0]))); 5732b730f8bSJeremy L Thompson magma_setvector(Q1d * P1d, sizeof(grad1d[0]), grad1d, 1, impl->dgrad1d, 1, data->queue); 5747f5b9731SStan Tomov 5757f5b9731SStan Tomov // Copy qweight1d to the GPU 5762b730f8bSJeremy L Thompson CeedCallBackend(magma_malloc((void **)&impl->dqweight1d, Q1d * sizeof(qweight1d[0]))); 5772b730f8bSJeremy L Thompson magma_setvector(Q1d, sizeof(qweight1d[0]), qweight1d, 1, impl->dqweight1d, 1, data->queue); 5787f5b9731SStan Tomov 5792b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisSetData(basis, impl)); 5802b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&magma_common_path)); 5812b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&interp_path)); 5822b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&grad_path)); 5832b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&weight_path)); 5842b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&basis_kernel_source)); 585f6af633fSnbeams 586e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 5877f5b9731SStan Tomov } 5887f5b9731SStan Tomov 5897f5b9731SStan Tomov #ifdef __cplusplus 5907f5b9731SStan Tomov CEED_INTERN "C" 5917f5b9731SStan Tomov #endif 5922b730f8bSJeremy L Thompson int 5932b730f8bSJeremy L Thompson CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt ndof, CeedInt nqpts, const CeedScalar *interp, const CeedScalar *grad, 5942b730f8bSJeremy L Thompson const CeedScalar *qref, const CeedScalar *qweight, CeedBasis basis) { 595868539c2SNatalie Beams CeedBasisNonTensor_Magma *impl; 5967f5b9731SStan Tomov Ceed ceed; 5972b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 5987f5b9731SStan Tomov 599e0582403Sabdelfattah83 Ceed_Magma *data; 6002b730f8bSJeremy L Thompson CeedCallBackend(CeedGetData(ceed, &data)); 601023b8a51Sabdelfattah83 magma_int_t arch = magma_getdevice_arch(); 6022b730f8bSJeremy L Thompson CeedCallBackend(CeedCalloc(1, &impl)); 603023b8a51Sabdelfattah83 // Compile kernels 604023b8a51Sabdelfattah83 char *magma_common_path; 605023b8a51Sabdelfattah83 char *interp_path, *grad_path; 606023b8a51Sabdelfattah83 char *basis_kernel_source; 607023b8a51Sabdelfattah83 CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma_common_defs.h", &magma_common_path)); 608023b8a51Sabdelfattah83 CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source -----\n"); 609023b8a51Sabdelfattah83 CeedCallBackend(CeedLoadSourceToBuffer(ceed, magma_common_path, &basis_kernel_source)); 610023b8a51Sabdelfattah83 CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma_common_nontensor.h", &magma_common_path)); 611023b8a51Sabdelfattah83 CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, magma_common_path, &basis_kernel_source)); 612023b8a51Sabdelfattah83 CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/interp-nontensor.h", &interp_path)); 613023b8a51Sabdelfattah83 CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, interp_path, &basis_kernel_source)); 614023b8a51Sabdelfattah83 CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/grad-nontensor.h", &grad_path)); 615023b8a51Sabdelfattah83 CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, grad_path, &basis_kernel_source)); 616023b8a51Sabdelfattah83 617023b8a51Sabdelfattah83 // tuning parameters for nb 618023b8a51Sabdelfattah83 CeedInt nb_interp_n[MAGMA_NONTENSOR_KERNEL_INSTANCES]; 619023b8a51Sabdelfattah83 CeedInt nb_interp_t[MAGMA_NONTENSOR_KERNEL_INSTANCES]; 620023b8a51Sabdelfattah83 CeedInt nb_grad_n[MAGMA_NONTENSOR_KERNEL_INSTANCES]; 621023b8a51Sabdelfattah83 CeedInt nb_grad_t[MAGMA_NONTENSOR_KERNEL_INSTANCES]; 622023b8a51Sabdelfattah83 CeedInt P = ndof, Q = nqpts; 623023b8a51Sabdelfattah83 CeedInt Narray[MAGMA_NONTENSOR_KERNEL_INSTANCES] = {MAGMA_NONTENSOR_N_VALUES}; 624023b8a51Sabdelfattah83 for (CeedInt in = 0; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) { 625023b8a51Sabdelfattah83 nb_interp_n[in] = nontensor_rtc_get_nb(arch, 'd', CEED_EVAL_INTERP, CEED_NOTRANSPOSE, P, Narray[in], Q); 626023b8a51Sabdelfattah83 nb_interp_t[in] = nontensor_rtc_get_nb(arch, 'd', CEED_EVAL_INTERP, CEED_TRANSPOSE, P, Narray[in], Q); 627023b8a51Sabdelfattah83 nb_grad_n[in] = nontensor_rtc_get_nb(arch, 'd', CEED_EVAL_GRAD, CEED_NOTRANSPOSE, P, Narray[in], Q); 628023b8a51Sabdelfattah83 nb_grad_t[in] = nontensor_rtc_get_nb(arch, 'd', CEED_EVAL_GRAD, CEED_TRANSPOSE, P, Narray[in], Q); 629023b8a51Sabdelfattah83 } 630023b8a51Sabdelfattah83 631023b8a51Sabdelfattah83 // compile 632023b8a51Sabdelfattah83 Ceed delegate; 633023b8a51Sabdelfattah83 CeedCallBackend(CeedGetDelegate(ceed, &delegate)); 634023b8a51Sabdelfattah83 for (CeedInt in = 0; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) { 635023b8a51Sabdelfattah83 CeedCallBackend(CeedCompileMagma(delegate, basis_kernel_source, &impl->module[in], 7, "DIM", dim, "P", P, "Q", Q, "NB_INTERP_N", nb_interp_n[in], 636023b8a51Sabdelfattah83 "NB_INTERP_T", nb_interp_t[in], "NB_GRAD_N", nb_grad_n[in], "NB_GRAD_T", nb_grad_t[in])); 637023b8a51Sabdelfattah83 } 638023b8a51Sabdelfattah83 639023b8a51Sabdelfattah83 // get kernels 640023b8a51Sabdelfattah83 for (CeedInt in = 0; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) { 641023b8a51Sabdelfattah83 CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[in], "magma_interp_nontensor_n", &impl->magma_interp_nontensor[in])); 642023b8a51Sabdelfattah83 CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[in], "magma_interp_nontensor_t", &impl->magma_interp_tr_nontensor[in])); 643023b8a51Sabdelfattah83 CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[in], "magma_grad_nontensor_n", &impl->magma_grad_nontensor[in])); 644023b8a51Sabdelfattah83 CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[in], "magma_grad_nontensor_t", &impl->magma_grad_tr_nontensor[in])); 645023b8a51Sabdelfattah83 } 646023b8a51Sabdelfattah83 647023b8a51Sabdelfattah83 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Magma)); 648023b8a51Sabdelfattah83 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Magma)); 649868539c2SNatalie Beams 650868539c2SNatalie Beams // Copy qref to the GPU 6512b730f8bSJeremy L Thompson CeedCallBackend(magma_malloc((void **)&impl->dqref, nqpts * sizeof(qref[0]))); 652e0582403Sabdelfattah83 magma_setvector(nqpts, sizeof(qref[0]), qref, 1, impl->dqref, 1, data->queue); 653868539c2SNatalie Beams 654868539c2SNatalie Beams // Copy interp to the GPU 6552b730f8bSJeremy L Thompson CeedCallBackend(magma_malloc((void **)&impl->dinterp, nqpts * ndof * sizeof(interp[0]))); 6562b730f8bSJeremy L Thompson magma_setvector(nqpts * ndof, sizeof(interp[0]), interp, 1, impl->dinterp, 1, data->queue); 657868539c2SNatalie Beams 658868539c2SNatalie Beams // Copy grad to the GPU 6592b730f8bSJeremy L Thompson CeedCallBackend(magma_malloc((void **)&impl->dgrad, nqpts * ndof * dim * sizeof(grad[0]))); 6602b730f8bSJeremy L Thompson magma_setvector(nqpts * ndof * dim, sizeof(grad[0]), grad, 1, impl->dgrad, 1, data->queue); 661868539c2SNatalie Beams 662868539c2SNatalie Beams // Copy qweight to the GPU 6632b730f8bSJeremy L Thompson CeedCallBackend(magma_malloc((void **)&impl->dqweight, nqpts * sizeof(qweight[0]))); 6642b730f8bSJeremy L Thompson magma_setvector(nqpts, sizeof(qweight[0]), qweight, 1, impl->dqweight, 1, data->queue); 665868539c2SNatalie Beams 666023b8a51Sabdelfattah83 CeedCallBackend(CeedBasisSetData(basis, impl)); 667023b8a51Sabdelfattah83 CeedCallBackend(CeedFree(&magma_common_path)); 668023b8a51Sabdelfattah83 CeedCallBackend(CeedFree(&interp_path)); 669023b8a51Sabdelfattah83 CeedCallBackend(CeedFree(&grad_path)); 670023b8a51Sabdelfattah83 CeedCallBackend(CeedFree(&basis_kernel_source)); 671e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 6727f5b9731SStan Tomov } 673