xref: /libCEED/rust/libceed-sys/c-src/backends/magma/ceed-magma-basis.c (revision 13f886e92f3445bcbdaccd4d0093947b9a882fe5)
1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #include <ceed/ceed.h>
9 #include <ceed/backend.h>
10 #include <ceed/jit-tools.h>
11 #include <string.h>
12 #include "ceed-magma.h"
13 #ifdef CEED_MAGMA_USE_HIP
14 #include "../hip/ceed-hip-common.h"
15 #include "../hip/ceed-hip-compile.h"
16 #else
17 #include "../cuda/ceed-cuda-common.h"
18 #include "../cuda/ceed-cuda-compile.h"
19 #endif
20 
#ifdef __cplusplus
CEED_INTERN "C"
#endif
//------------------------------------------------------------------------------
// Apply a tensor-product basis operation with JIT-compiled MAGMA kernels.
//
//   basis - tensor basis created by CeedBasisCreateTensorH1_Magma
//   nelem - number of elements to process
//   tmode - CEED_NOTRANSPOSE maps E-vector -> Q-vector; CEED_TRANSPOSE reverses
//   emode - CEED_EVAL_INTERP, CEED_EVAL_GRAD, or CEED_EVAL_WEIGHT
//   U     - input device vector (not read for CEED_EVAL_WEIGHT)
//   V     - output device vector
//
// Returns CEED_ERROR_SUCCESS or a CEED_ERROR_BACKEND error.
//------------------------------------------------------------------------------
int CeedBasisApply_Magma(CeedBasis basis, CeedInt nelem,
                         CeedTransposeMode tmode, CeedEvalMode emode,
                         CeedVector U, CeedVector V) {
  int ierr;
  Ceed ceed;
  ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
  CeedInt dim, ncomp, ndof;
  ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
  ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChkBackend(ierr);
  ierr = CeedBasisGetNumNodes(basis, &ndof); CeedChkBackend(ierr);

  // Backend data holds the MAGMA queue used for all launches below
  Ceed_Magma *data;
  ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr);

  const CeedScalar *u;
  CeedScalar *v;
  if (emode != CEED_EVAL_WEIGHT) {
    ierr = CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &u); CeedChkBackend(ierr);
  } else if (emode != CEED_EVAL_WEIGHT) {
    // NOTE(review): this branch is unreachable -- the condition repeats the
    // one above. The error was presumably meant to fire when no input vector
    // is supplied for a mode that needs one; confirm the intended condition.
    // LCOV_EXCL_START
    return CeedError(ceed, CEED_ERROR_BACKEND,
                     "An input vector is required for this CeedEvalMode");
    // LCOV_EXCL_STOP
  }
  ierr = CeedVectorGetArrayWrite(V, CEED_MEM_DEVICE, &v); CeedChkBackend(ierr);

  CeedBasis_Magma *impl;
  ierr = CeedBasisGetData(basis, &impl); CeedChkBackend(ierr);

  CeedInt P1d, Q1d;
  ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
  ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr);

  CeedDebug256(ceed, 4, "[CeedBasisApply_Magma] vsize=%" CeedInt_FMT
               ", comp = %" CeedInt_FMT, ncomp*CeedIntPow(P1d, dim), ncomp);

  // Transpose kernels accumulate into v, so it must be zeroed first.
  // NOTE(review): `length` is CeedSize but magmablas_?laset takes MAGMA's
  // integer type -- confirm no truncation for very long vectors.
  if (tmode == CEED_TRANSPOSE) {
    CeedSize length;
    ierr = CeedVectorGetLength(V, &length); CeedChkBackend(ierr);
    if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
      magmablas_slaset(MagmaFull, length, 1, 0., 0., (float *) v, length,
                       data->queue);
    } else {
      magmablas_dlaset(MagmaFull, length, 1, 0., 0., (double *) v, length,
                       data->queue);
    }
    ceed_magma_queue_sync( data->queue );
  }

  switch (emode) {
  case CEED_EVAL_INTERP: {
    // P = input 1D size, Q = output 1D size for the launched kernel
    CeedInt P = P1d, Q = Q1d;
    if (tmode == CEED_TRANSPOSE) {
      P = Q1d; Q = P1d;
    }

    // Define element sizes for dofs/quad
    CeedInt elquadsize = CeedIntPow(Q1d, dim);
    CeedInt eldofssize = CeedIntPow(P1d, dim);

    // E-vector ordering -------------- Q-vector ordering
    //  component                        component
    //    elem                             elem
    //       node                            node

    // ---  Define strides for NOTRANSPOSE mode: ---
    // Input (u) is E-vector, output (v) is Q-vector

    // Element strides
    CeedInt u_elstride = eldofssize;
    CeedInt v_elstride = elquadsize;
    // Component strides
    CeedInt u_compstride = nelem * eldofssize;
    CeedInt v_compstride = nelem * elquadsize;

    // ---  Swap strides for TRANSPOSE mode: ---
    if (tmode == CEED_TRANSPOSE) {
      // Input (u) is Q-vector, output (v) is E-vector
      // Element strides
      v_elstride = eldofssize;
      u_elstride = elquadsize;
      // Component strides
      v_compstride = nelem * eldofssize;
      u_compstride = nelem * elquadsize;
    }

    // Launch geometry and dynamic shared memory, sized per dimension
    CeedInt nthreads = 1;
    CeedInt ntcol = 1;
    CeedInt shmem = 0;
    CeedInt maxPQ = CeedIntMax(P, Q);

    switch (dim) {
    case 1:
      nthreads = maxPQ;
      ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_1D);
      shmem += sizeof(CeedScalar) * ntcol * ( ncomp * (1*P + 1*Q) );
      shmem += sizeof(CeedScalar) * (P*Q);
      break;
    case 2:
      nthreads = maxPQ;
      ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_2D);
      shmem += P*Q    *sizeof(CeedScalar);  // for sT
      shmem += ntcol * ( P*maxPQ*sizeof(
                           CeedScalar) );  // for reforming rU we need PxP, and for the intermediate output we need PxQ
      break;
    case 3:
      nthreads = maxPQ*maxPQ;
      ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_3D);
      shmem += sizeof(CeedScalar)* (P*Q);  // for sT
      shmem += sizeof(CeedScalar)* ntcol * (CeedIntMax(P*P*maxPQ,
                                            P*Q*Q));  // rU needs P^2xP, the intermediate output needs max(P^2xQ,PQ^2)
    }
    // ntcol elements are processed per thread block
    CeedInt grid = (nelem + ntcol-1) / ntcol;
    void *args[] = {&impl->dinterp1d,
                    &u, &u_elstride, &u_compstride,
                    &v, &v_elstride, &v_compstride,
                    &nelem
                   };

    if (tmode == CEED_TRANSPOSE) {
      ierr = CeedRunKernelDimSharedMagma(ceed, impl->magma_interp_tr, grid,
                                         nthreads, ntcol, 1, shmem,
                                         args); CeedChkBackend(ierr);
    } else {
      ierr = CeedRunKernelDimSharedMagma(ceed, impl->magma_interp, grid,
                                         nthreads, ntcol, 1, shmem,
                                         args); CeedChkBackend(ierr);
    }
  }
  break;
  case CEED_EVAL_GRAD: {
    CeedInt P = P1d, Q = Q1d;
    // In CEED_NOTRANSPOSE mode:
    // u is (P^dim x nc), column-major layout (nc = ncomp)
    // v is (Q^dim x nc x dim), column-major layout (nc = ncomp)
    // In CEED_TRANSPOSE mode, the sizes of u and v are switched.
    if (tmode == CEED_TRANSPOSE) {
      P = Q1d, Q = P1d;
    }

    // Define element sizes for dofs/quad
    CeedInt elquadsize = CeedIntPow(Q1d, dim);
    CeedInt eldofssize = CeedIntPow(P1d, dim);

    // E-vector ordering -------------- Q-vector ordering
    //                                  dim
    //  component                        component
    //    elem                              elem
    //       node                            node

    // ---  Define strides for NOTRANSPOSE mode: ---
    // Input (u) is E-vector, output (v) is Q-vector

    // Element strides
    CeedInt u_elstride = eldofssize;
    CeedInt v_elstride = elquadsize;
    // Component strides
    CeedInt u_compstride = nelem * eldofssize;
    CeedInt v_compstride = nelem * elquadsize;
    // Dimension strides
    CeedInt u_dimstride = 0;
    CeedInt v_dimstride = nelem * elquadsize * ncomp;

    // ---  Swap strides for TRANSPOSE mode: ---
    if (tmode == CEED_TRANSPOSE) {
      // Input (u) is Q-vector, output (v) is E-vector
      // Element strides
      v_elstride = eldofssize;
      u_elstride = elquadsize;
      // Component strides
      v_compstride = nelem * eldofssize;
      u_compstride = nelem * elquadsize;
      // Dimension strides
      v_dimstride = 0;
      u_dimstride = nelem * elquadsize * ncomp;

    }

    // Launch geometry and dynamic shared memory, sized per dimension
    CeedInt nthreads = 1;
    CeedInt ntcol = 1;
    CeedInt shmem = 0;
    CeedInt maxPQ = CeedIntMax(P, Q);

    switch (dim) {
    case 1:
      nthreads = maxPQ;
      ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_1D);
      shmem += sizeof(CeedScalar) * ntcol * (ncomp * (1*P + 1*Q));
      shmem += sizeof(CeedScalar) * (P*Q);
      break;
    case 2:
      nthreads = maxPQ;
      ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_2D);
      shmem += sizeof(CeedScalar) * 2*P*Q;  // for sTinterp and sTgrad
      shmem += sizeof(CeedScalar) * ntcol *
               (P*maxPQ);  // for reforming rU we need PxP, and for the intermediate output we need PxQ
      break;
    case 3:
      nthreads = maxPQ * maxPQ;
      ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_3D);
      shmem += sizeof(CeedScalar) * 2*P*Q;  // for sTinterp and sTgrad
      shmem += sizeof(CeedScalar) * ntcol * CeedIntMax(P*P*P,
               (P*P*Q) +
               (P*Q*Q));  // rU needs P^2xP, the intermediate outputs need (P^2.Q + P.Q^2)
    }
    // ntcol elements are processed per thread block
    CeedInt grid = (nelem + ntcol-1) / ntcol;
    void *args[] = {&impl->dinterp1d, &impl->dgrad1d,
                    &u, &u_elstride, &u_compstride, &u_dimstride,
                    &v, &v_elstride, &v_compstride, &v_dimstride,
                    &nelem
                   };

    if (tmode == CEED_TRANSPOSE) {
      ierr = CeedRunKernelDimSharedMagma(ceed, impl->magma_grad_tr, grid,
                                         nthreads, ntcol, 1, shmem,
                                         args); CeedChkBackend(ierr);
    } else {
      ierr = CeedRunKernelDimSharedMagma(ceed, impl->magma_grad, grid,
                                         nthreads, ntcol, 1, shmem,
                                         args); CeedChkBackend(ierr);
    }
  }
  break;
  case CEED_EVAL_WEIGHT: {
    if (tmode == CEED_TRANSPOSE)
      // LCOV_EXCL_START
      return CeedError(ceed, CEED_ERROR_BACKEND,
                       "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE");
    // LCOV_EXCL_STOP
    CeedInt Q = Q1d;
    // Despite the name, this is the number of quadrature points per element
    CeedInt eldofssize = CeedIntPow(Q, dim);
    CeedInt nthreads = 1;
    CeedInt ntcol = 1;
    CeedInt shmem = 0;

    switch (dim) {
    case 1:
      nthreads = Q;
      ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_1D);
      shmem += sizeof(CeedScalar) * Q;  // for dqweight1d
      shmem += sizeof(CeedScalar) * ntcol * Q; // for output
      break;
    case 2:
      nthreads = Q;
      ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_2D);
      shmem += sizeof(CeedScalar) * Q;  // for dqweight1d
      break;
    case 3:
      nthreads = Q * Q;
      ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_3D);
      shmem += sizeof(CeedScalar) * Q;  // for dqweight1d
    }
    // ntcol elements are processed per thread block
    CeedInt grid = (nelem + ntcol-1) / ntcol;
    void *args[] = {&impl->dqweight1d, &v, &eldofssize, &nelem};

    ierr = CeedRunKernelDimSharedMagma(ceed, impl->magma_weight, grid,
                                       nthreads, ntcol, 1, shmem,
                                       args); CeedChkBackend(ierr);
  }
  break;
  // LCOV_EXCL_START
  case CEED_EVAL_DIV:
    return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported");
  case CEED_EVAL_CURL:
    return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported");
  case CEED_EVAL_NONE:
    return CeedError(ceed, CEED_ERROR_BACKEND,
                     "CEED_EVAL_NONE does not make sense in this context");
    // LCOV_EXCL_STOP
  }

  // must sync to ensure completeness
  ceed_magma_queue_sync( data->queue );

  if (emode!=CEED_EVAL_WEIGHT) {
    ierr = CeedVectorRestoreArrayRead(U, &u); CeedChkBackend(ierr);
  }
  ierr = CeedVectorRestoreArray(V, &v); CeedChkBackend(ierr);
  return CEED_ERROR_SUCCESS;
}
304 
305 #ifdef __cplusplus
306 CEED_INTERN "C"
307 #endif
308 int CeedBasisApplyNonTensor_f64_Magma(CeedBasis basis, CeedInt nelem,
309                                       CeedTransposeMode tmode, CeedEvalMode emode,
310                                       CeedVector U, CeedVector V) {
311   int ierr;
312   Ceed ceed;
313   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
314 
315   Ceed_Magma *data;
316   ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr);
317 
318   CeedInt dim, ncomp, ndof, nqpt;
319   ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
320   ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChkBackend(ierr);
321   ierr = CeedBasisGetNumNodes(basis, &ndof); CeedChkBackend(ierr);
322   ierr = CeedBasisGetNumQuadraturePoints(basis, &nqpt); CeedChkBackend(ierr);
323   const CeedScalar *du;
324   CeedScalar *dv;
325   if (emode != CEED_EVAL_WEIGHT) {
326     ierr = CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &du); CeedChkBackend(ierr);
327   } else if (emode != CEED_EVAL_WEIGHT) {
328     // LCOV_EXCL_START
329     return CeedError(ceed, CEED_ERROR_BACKEND,
330                      "An input vector is required for this CeedEvalMode");
331     // LCOV_EXCL_STOP
332   }
333   ierr = CeedVectorGetArrayWrite(V, CEED_MEM_DEVICE, &dv); CeedChkBackend(ierr);
334 
335   CeedBasisNonTensor_Magma *impl;
336   ierr = CeedBasisGetData(basis, &impl); CeedChkBackend(ierr);
337 
338   CeedDebug256(ceed, 4, "[CeedBasisApplyNonTensor_Magma] vsize=%" CeedInt_FMT
339                ", comp = %" CeedInt_FMT, ncomp*ndof, ncomp);
340 
341   if (tmode == CEED_TRANSPOSE) {
342     CeedSize length;
343     ierr = CeedVectorGetLength(V, &length);
344     if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
345       magmablas_slaset(MagmaFull, length, 1, 0., 0., (float *) dv, length,
346                        data->queue);
347     } else {
348       magmablas_dlaset(MagmaFull, length, 1, 0., 0., (double *) dv, length,
349                        data->queue);
350     }
351     ceed_magma_queue_sync( data->queue );
352   }
353 
354   switch (emode) {
355   case CEED_EVAL_INTERP: {
356     CeedInt P = ndof, Q = nqpt;
357     if (tmode == CEED_TRANSPOSE)
358       magma_dgemm_nontensor(MagmaNoTrans, MagmaNoTrans,
359                             P, nelem*ncomp, Q,
360                             1.0, (double *)impl->dinterp, P,
361                             (double *)du, Q,
362                             0.0, (double *)dv, P, data->queue);
363     else
364       magma_dgemm_nontensor(MagmaTrans, MagmaNoTrans,
365                             Q, nelem*ncomp, P,
366                             1.0, (double *)impl->dinterp, P,
367                             (double *)du, P,
368                             0.0, (double *)dv, Q, data->queue);
369   }
370   break;
371 
372   case CEED_EVAL_GRAD: {
373     CeedInt P = ndof, Q = nqpt;
374     if (tmode == CEED_TRANSPOSE) {
375       CeedScalar beta = 0.0;
376       for(int d=0; d<dim; d++) {
377         if (d>0)
378           beta = 1.0;
379         magma_dgemm_nontensor(MagmaNoTrans, MagmaNoTrans,
380                               P, nelem*ncomp, Q,
381                               1.0, (double *)(impl->dgrad + d*P*Q), P,
382                               (double *)(du + d*nelem*ncomp*Q), Q,
383                               beta, (double *)dv, P, data->queue);
384       }
385     } else {
386       for(int d=0; d< dim; d++)
387         magma_dgemm_nontensor(MagmaTrans, MagmaNoTrans,
388                               Q, nelem*ncomp, P,
389                               1.0, (double *)(impl->dgrad + d*P*Q), P,
390                               (double *)du, P,
391                               0.0, (double *)(dv + d*nelem*ncomp*Q), Q, data->queue);
392     }
393   }
394   break;
395 
396   case CEED_EVAL_WEIGHT: {
397     if (tmode == CEED_TRANSPOSE)
398       // LCOV_EXCL_START
399       return CeedError(ceed, CEED_ERROR_BACKEND,
400                        "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE");
401     // LCOV_EXCL_STOP
402 
403     int elemsPerBlock = 1;//basis->Q1d < 7 ? optElems[basis->Q1d] : 1;
404     int grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem)?
405                                        1 : 0 );
406     magma_weight_nontensor(grid, nqpt, nelem, nqpt, impl->dqweight, dv,
407                            data->queue);
408     CeedChkBackend(ierr);
409   }
410   break;
411 
412   // LCOV_EXCL_START
413   case CEED_EVAL_DIV:
414     return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported");
415   case CEED_EVAL_CURL:
416     return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported");
417   case CEED_EVAL_NONE:
418     return CeedError(ceed, CEED_ERROR_BACKEND,
419                      "CEED_EVAL_NONE does not make sense in this context");
420     // LCOV_EXCL_STOP
421   }
422 
423   // must sync to ensure completeness
424   ceed_magma_queue_sync( data->queue );
425 
426   if (emode!=CEED_EVAL_WEIGHT) {
427     ierr = CeedVectorRestoreArrayRead(U, &du); CeedChkBackend(ierr);
428   }
429   ierr = CeedVectorRestoreArray(V, &dv); CeedChkBackend(ierr);
430   return CEED_ERROR_SUCCESS;
431 }
432 
433 int CeedBasisApplyNonTensor_f32_Magma(CeedBasis basis, CeedInt nelem,
434                                       CeedTransposeMode tmode, CeedEvalMode emode,
435                                       CeedVector U, CeedVector V) {
436   int ierr;
437   Ceed ceed;
438   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
439 
440   Ceed_Magma *data;
441   ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr);
442 
443   CeedInt dim, ncomp, ndof, nqpt;
444   ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
445   ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChkBackend(ierr);
446   ierr = CeedBasisGetNumNodes(basis, &ndof); CeedChkBackend(ierr);
447   ierr = CeedBasisGetNumQuadraturePoints(basis, &nqpt); CeedChkBackend(ierr);
448   const CeedScalar *du;
449   CeedScalar *dv;
450   if (emode != CEED_EVAL_WEIGHT) {
451     ierr = CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &du); CeedChkBackend(ierr);
452   } else if (emode != CEED_EVAL_WEIGHT) {
453     // LCOV_EXCL_START
454     return CeedError(ceed, CEED_ERROR_BACKEND,
455                      "An input vector is required for this CeedEvalMode");
456     // LCOV_EXCL_STOP
457   }
458   ierr = CeedVectorGetArrayWrite(V, CEED_MEM_DEVICE, &dv); CeedChkBackend(ierr);
459 
460   CeedBasisNonTensor_Magma *impl;
461   ierr = CeedBasisGetData(basis, &impl); CeedChkBackend(ierr);
462 
463   CeedDebug256(ceed, 4, "[CeedBasisApplyNonTensor_Magma] vsize=%" CeedInt_FMT
464                ", comp = %" CeedInt_FMT, ncomp*ndof, ncomp);
465 
466   if (tmode == CEED_TRANSPOSE) {
467     CeedSize length;
468     ierr = CeedVectorGetLength(V, &length);
469     if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
470       magmablas_slaset(MagmaFull, length, 1, 0., 0., (float *) dv, length,
471                        data->queue);
472     } else {
473       magmablas_dlaset(MagmaFull, length, 1, 0., 0., (double *) dv, length,
474                        data->queue);
475     }
476     ceed_magma_queue_sync( data->queue );
477   }
478 
479   switch (emode) {
480   case CEED_EVAL_INTERP: {
481     CeedInt P = ndof, Q = nqpt;
482     if (tmode == CEED_TRANSPOSE)
483       magma_sgemm_nontensor(MagmaNoTrans, MagmaNoTrans,
484                             P, nelem*ncomp, Q,
485                             1.0, (float *)impl->dinterp, P,
486                             (float *)du, Q,
487                             0.0, (float *)dv, P, data->queue);
488     else
489       magma_sgemm_nontensor(MagmaTrans, MagmaNoTrans,
490                             Q, nelem*ncomp, P,
491                             1.0, (float *)impl->dinterp, P,
492                             (float *)du, P,
493                             0.0, (float *)dv, Q, data->queue);
494   }
495   break;
496 
497   case CEED_EVAL_GRAD: {
498     CeedInt P = ndof, Q = nqpt;
499     if (tmode == CEED_TRANSPOSE) {
500       CeedScalar beta = 0.0;
501       for(int d=0; d<dim; d++) {
502         if (d>0)
503           beta = 1.0;
504         magma_sgemm_nontensor(MagmaNoTrans, MagmaNoTrans,
505                               P, nelem*ncomp, Q,
506                               1.0, (float *)(impl->dgrad + d*P*Q), P,
507                               (float *)(du + d*nelem*ncomp*Q), Q,
508                               beta, (float *)dv, P, data->queue);
509       }
510     } else {
511       for(int d=0; d< dim; d++)
512         magma_sgemm_nontensor(MagmaTrans, MagmaNoTrans,
513                               Q, nelem*ncomp, P,
514                               1.0, (float *)(impl->dgrad + d*P*Q), P,
515                               (float *)du, P,
516                               0.0, (float *)(dv + d*nelem*ncomp*Q), Q, data->queue);
517     }
518   }
519   break;
520 
521   case CEED_EVAL_WEIGHT: {
522     if (tmode == CEED_TRANSPOSE)
523       // LCOV_EXCL_START
524       return CeedError(ceed, CEED_ERROR_BACKEND,
525                        "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE");
526     // LCOV_EXCL_STOP
527 
528     int elemsPerBlock = 1;//basis->Q1d < 7 ? optElems[basis->Q1d] : 1;
529     int grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem)?
530                                        1 : 0 );
531     magma_weight_nontensor(grid, nqpt, nelem, nqpt, impl->dqweight, dv,
532                            data->queue);
533     CeedChkBackend(ierr);
534   }
535   break;
536 
537   // LCOV_EXCL_START
538   case CEED_EVAL_DIV:
539     return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported");
540   case CEED_EVAL_CURL:
541     return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported");
542   case CEED_EVAL_NONE:
543     return CeedError(ceed, CEED_ERROR_BACKEND,
544                      "CEED_EVAL_NONE does not make sense in this context");
545     // LCOV_EXCL_STOP
546   }
547 
548   // must sync to ensure completeness
549   ceed_magma_queue_sync( data->queue );
550 
551   if (emode!=CEED_EVAL_WEIGHT) {
552     ierr = CeedVectorRestoreArrayRead(U, &du); CeedChkBackend(ierr);
553   }
554   ierr = CeedVectorRestoreArray(V, &dv); CeedChkBackend(ierr);
555   return CEED_ERROR_SUCCESS;
556 }
557 
558 #ifdef __cplusplus
559 CEED_INTERN "C"
560 #endif
561 int CeedBasisDestroy_Magma(CeedBasis basis) {
562   int ierr;
563   CeedBasis_Magma *impl;
564   ierr = CeedBasisGetData(basis, &impl); CeedChkBackend(ierr);
565 
566   ierr = magma_free(impl->dqref1d); CeedChkBackend(ierr);
567   ierr = magma_free(impl->dinterp1d); CeedChkBackend(ierr);
568   ierr = magma_free(impl->dgrad1d); CeedChkBackend(ierr);
569   ierr = magma_free(impl->dqweight1d); CeedChkBackend(ierr);
570   Ceed ceed;
571   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
572   #ifdef CEED_MAGMA_USE_HIP
573   ierr = hipModuleUnload(impl->module); CeedChk_Hip(ceed, ierr);
574   #else
575   ierr = cuModuleUnload(impl->module); CeedChk_Cu(ceed, ierr);
576   #endif
577 
578   ierr = CeedFree(&impl); CeedChkBackend(ierr);
579 
580   return CEED_ERROR_SUCCESS;
581 }
582 
583 #ifdef __cplusplus
584 CEED_INTERN "C"
585 #endif
586 int CeedBasisDestroyNonTensor_Magma(CeedBasis basis) {
587   int ierr;
588   CeedBasisNonTensor_Magma *impl;
589   ierr = CeedBasisGetData(basis, &impl); CeedChkBackend(ierr);
590 
591   ierr = magma_free(impl->dqref); CeedChkBackend(ierr);
592   ierr = magma_free(impl->dinterp); CeedChkBackend(ierr);
593   ierr = magma_free(impl->dgrad); CeedChkBackend(ierr);
594   ierr = magma_free(impl->dqweight); CeedChkBackend(ierr);
595 
596   ierr = CeedFree(&impl); CeedChkBackend(ierr);
597 
598   return CEED_ERROR_SUCCESS;
599 }
600 
601 #ifdef __cplusplus
602 CEED_INTERN "C"
603 #endif
604 int CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P1d, CeedInt Q1d,
605                                   const CeedScalar *interp1d,
606                                   const CeedScalar *grad1d,
607                                   const CeedScalar *qref1d,
608                                   const CeedScalar *qweight1d, CeedBasis basis) {
609   int ierr;
610   CeedBasis_Magma *impl;
611   ierr = CeedCalloc(1,&impl); CeedChkBackend(ierr);
612   Ceed ceed;
613   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
614 
615   // Check for supported parameters
616   CeedInt ncomp = 0;
617   ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChkBackend(ierr);
618   Ceed_Magma *data;
619   ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr);
620 
621   // Compile kernels
622   char *magma_common_path;
623   char *interp_path, *grad_path, *weight_path;
624   char *basis_kernel_source;
625   ierr = CeedGetJitAbsolutePath(ceed,
626                                 "ceed/jit-source/magma/magma_common_device.h",
627                                 &magma_common_path); CeedChkBackend(ierr);
628   CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source -----\n");
629   ierr = CeedLoadSourceToBuffer(ceed, magma_common_path,
630                                 &basis_kernel_source);
631   CeedChkBackend(ierr);
632   char *interp_name_base = "ceed/jit-source/magma/interp";
633   CeedInt interp_name_len = strlen(interp_name_base) + 6;
634   char interp_name[interp_name_len];
635   snprintf(interp_name, interp_name_len, "%s-%" CeedInt_FMT "d.h",
636            interp_name_base, dim);
637   ierr = CeedGetJitAbsolutePath(ceed, interp_name, &interp_path);
638   CeedChkBackend(ierr);
639   ierr = CeedLoadSourceToInitializedBuffer(ceed, interp_path,
640          &basis_kernel_source);
641   CeedChkBackend(ierr);
642   char *grad_name_base = "ceed/jit-source/magma/grad";
643   CeedInt grad_name_len = strlen(grad_name_base) + 6;
644   char grad_name[grad_name_len];
645   snprintf(grad_name, grad_name_len, "%s-%" CeedInt_FMT "d.h", grad_name_base,
646            dim);
647   ierr = CeedGetJitAbsolutePath(ceed, grad_name, &grad_path);
648   CeedChkBackend(ierr);
649   ierr = CeedLoadSourceToInitializedBuffer(ceed, grad_path,
650          &basis_kernel_source);
651   CeedChkBackend(ierr);
652   char *weight_name_base = "ceed/jit-source/magma/weight";
653   CeedInt weight_name_len = strlen(weight_name_base) + 6;
654   char weight_name[weight_name_len];
655   snprintf(weight_name, weight_name_len, "%s-%" CeedInt_FMT "d.h",
656            weight_name_base, dim);
657   ierr = CeedGetJitAbsolutePath(ceed, weight_name, &weight_path);
658   CeedChkBackend(ierr);
659   ierr = CeedLoadSourceToInitializedBuffer(ceed, weight_path,
660          &basis_kernel_source);
661   CeedChkBackend(ierr);
662   CeedDebug256(ceed, 2,
663                "----- Loading Basis Kernel Source Complete! -----\n");
664   // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip
665   // data
666   Ceed delegate;
667   ierr = CeedGetDelegate(ceed, &delegate); CeedChkBackend(ierr);
668   ierr = CeedCompileMagma(delegate, basis_kernel_source, &impl->module, 5,
669                           "DIM", dim,
670                           "NCOMP", ncomp,
671                           "P", P1d,
672                           "Q", Q1d,
673                           "MAXPQ", CeedIntMax(P1d, Q1d));
674   CeedChkBackend(ierr);
675 
676   // Kernel setup
677   switch (dim) {
678   case 1:
679     ierr = CeedGetKernelMagma(ceed, impl->module, "magma_interpn_1d_kernel",
680                               &impl->magma_interp);
681     CeedChkBackend(ierr);
682     ierr = CeedGetKernelMagma(ceed, impl->module, "magma_interpt_1d_kernel",
683                               &impl->magma_interp_tr);
684     CeedChkBackend(ierr);
685     ierr = CeedGetKernelMagma(ceed, impl->module, "magma_gradn_1d_kernel",
686                               &impl->magma_grad);
687     CeedChkBackend(ierr);
688     ierr = CeedGetKernelMagma(ceed, impl->module, "magma_gradt_1d_kernel",
689                               &impl->magma_grad_tr);
690     CeedChkBackend(ierr);
691     ierr = CeedGetKernelMagma(ceed, impl->module, "magma_weight_1d_kernel",
692                               &impl->magma_weight);
693     CeedChkBackend(ierr);
694     break;
695   case 2:
696     ierr = CeedGetKernelMagma(ceed, impl->module, "magma_interpn_2d_kernel",
697                               &impl->magma_interp);
698     CeedChkBackend(ierr);
699     ierr = CeedGetKernelMagma(ceed, impl->module, "magma_interpt_2d_kernel",
700                               &impl->magma_interp_tr);
701     CeedChkBackend(ierr);
702     ierr = CeedGetKernelMagma(ceed, impl->module, "magma_gradn_2d_kernel",
703                               &impl->magma_grad);
704     CeedChkBackend(ierr);
705     ierr = CeedGetKernelMagma(ceed, impl->module, "magma_gradt_2d_kernel",
706                               &impl->magma_grad_tr);
707     CeedChkBackend(ierr);
708     ierr = CeedGetKernelMagma(ceed, impl->module, "magma_weight_2d_kernel",
709                               &impl->magma_weight);
710     CeedChkBackend(ierr);
711     break;
712   case 3:
713     ierr = CeedGetKernelMagma(ceed, impl->module, "magma_interpn_3d_kernel",
714                               &impl->magma_interp);
715     CeedChkBackend(ierr);
716     ierr = CeedGetKernelMagma(ceed, impl->module, "magma_interpt_3d_kernel",
717                               &impl->magma_interp_tr);
718     CeedChkBackend(ierr);
719     ierr = CeedGetKernelMagma(ceed, impl->module, "magma_gradn_3d_kernel",
720                               &impl->magma_grad);
721     CeedChkBackend(ierr);
722     ierr = CeedGetKernelMagma(ceed, impl->module, "magma_gradt_3d_kernel",
723                               &impl->magma_grad_tr);
724     CeedChkBackend(ierr);
725     ierr = CeedGetKernelMagma(ceed, impl->module, "magma_weight_3d_kernel",
726                               &impl->magma_weight);
727     CeedChkBackend(ierr);
728   }
729 
730   ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply",
731                                 CeedBasisApply_Magma); CeedChkBackend(ierr);
732   ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy",
733                                 CeedBasisDestroy_Magma); CeedChkBackend(ierr);
734 
735   // Copy qref1d to the GPU
736   ierr = magma_malloc((void **)&impl->dqref1d, Q1d*sizeof(qref1d[0]));
737   CeedChkBackend(ierr);
738   magma_setvector(Q1d, sizeof(qref1d[0]), qref1d, 1, impl->dqref1d, 1,
739                   data->queue);
740 
741   // Copy interp1d to the GPU
742   ierr = magma_malloc((void **)&impl->dinterp1d, Q1d*P1d*sizeof(interp1d[0]));
743   CeedChkBackend(ierr);
744   magma_setvector(Q1d*P1d, sizeof(interp1d[0]), interp1d, 1, impl->dinterp1d, 1,
745                   data->queue);
746 
747   // Copy grad1d to the GPU
748   ierr = magma_malloc((void **)&impl->dgrad1d, Q1d*P1d*sizeof(grad1d[0]));
749   CeedChkBackend(ierr);
750   magma_setvector(Q1d*P1d, sizeof(grad1d[0]), grad1d, 1, impl->dgrad1d, 1,
751                   data->queue);
752 
753   // Copy qweight1d to the GPU
754   ierr = magma_malloc((void **)&impl->dqweight1d, Q1d*sizeof(qweight1d[0]));
755   CeedChkBackend(ierr);
756   magma_setvector(Q1d, sizeof(qweight1d[0]), qweight1d, 1, impl->dqweight1d, 1,
757                   data->queue);
758 
759   ierr = CeedBasisSetData(basis, impl); CeedChkBackend(ierr);
760 
761   return CEED_ERROR_SUCCESS;
762 }
763 
764 #ifdef __cplusplus
765 CEED_INTERN "C"
766 #endif
767 int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt ndof,
768                             CeedInt nqpts, const CeedScalar *interp,
769                             const CeedScalar *grad, const CeedScalar *qref,
770                             const CeedScalar *qweight, CeedBasis basis) {
771   int ierr;
772   CeedBasisNonTensor_Magma *impl;
773   Ceed ceed;
774   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
775 
776   Ceed_Magma *data;
777   ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr);
778 
779   if (CEED_SCALAR_TYPE == CEED_SCALAR_FP64) {
780     ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply",
781                                   CeedBasisApplyNonTensor_f64_Magma);
782     CeedChkBackend(ierr);
783   } else {
784     ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply",
785                                   CeedBasisApplyNonTensor_f32_Magma);
786     CeedChkBackend(ierr);
787   }
788   ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy",
789                                 CeedBasisDestroyNonTensor_Magma); CeedChkBackend(ierr);
790 
791   ierr = CeedCalloc(1,&impl); CeedChkBackend(ierr);
792   ierr = CeedBasisSetData(basis, impl); CeedChkBackend(ierr);
793 
794   // Copy qref to the GPU
795   ierr = magma_malloc((void **)&impl->dqref, nqpts*sizeof(qref[0]));
796   CeedChkBackend(ierr);
797   magma_setvector(nqpts, sizeof(qref[0]), qref, 1, impl->dqref, 1, data->queue);
798 
799   // Copy interp to the GPU
800   ierr = magma_malloc((void **)&impl->dinterp, nqpts*ndof*sizeof(interp[0]));
801   CeedChkBackend(ierr);
802   magma_setvector(nqpts*ndof, sizeof(interp[0]), interp, 1, impl->dinterp, 1,
803                   data->queue);
804 
805   // Copy grad to the GPU
806   ierr = magma_malloc((void **)&impl->dgrad, nqpts*ndof*dim*sizeof(grad[0]));
807   CeedChkBackend(ierr);
808   magma_setvector(nqpts*ndof*dim, sizeof(grad[0]), grad, 1, impl->dgrad, 1,
809                   data->queue);
810 
811   // Copy qweight to the GPU
812   ierr = magma_malloc((void **)&impl->dqweight, nqpts*sizeof(qweight[0]));
813   CeedChkBackend(ierr);
814   magma_setvector(nqpts, sizeof(qweight[0]), qweight, 1, impl->dqweight, 1,
815                   data->queue);
816 
817   return CEED_ERROR_SUCCESS;
818 }
819