xref: /libCEED/backends/magma/ceed-magma-basis.c (revision 3f21f6b10abeb5d85d3454ea5cd38498737dc88a)
1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
3 // All Rights reserved. See files LICENSE and NOTICE for details.
4 //
5 // This file is part of CEED, a collection of benchmarks, miniapps, software
6 // libraries and APIs for efficient high-order finite element and spectral
7 // element discretizations for exascale applications. For more information and
8 // source code availability see http://github.com/ceed.
9 //
10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11 // a collaborative effort of two U.S. Department of Energy organizations (Office
12 // of Science and the National Nuclear Security Administration) responsible for
13 // the planning and preparation of a capable exascale ecosystem, including
14 // software, applications, hardware, advanced system engineering and early
15 // testbed platforms, in support of the nation's exascale computing imperative.
16 
17 #include <ceed/ceed.h>
18 #include <ceed/backend.h>
19 #include "ceed-magma.h"
20 
21 #ifdef __cplusplus
22 CEED_INTERN "C"
23 #endif
24 int CeedBasisApply_Magma(CeedBasis basis, CeedInt nelem,
25                          CeedTransposeMode tmode, CeedEvalMode emode,
26                          CeedVector U, CeedVector V) {
27   int ierr;
28   Ceed ceed;
29   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
30   CeedInt dim, ncomp, ndof;
31   ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
32   ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChkBackend(ierr);
33   ierr = CeedBasisGetNumNodes(basis, &ndof); CeedChkBackend(ierr);
34 
35   Ceed_Magma *data;
36   ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr);
37 
38   const CeedScalar *u;
39   CeedScalar *v;
40   if (emode != CEED_EVAL_WEIGHT) {
41     ierr = CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &u); CeedChkBackend(ierr);
42   } else if (emode != CEED_EVAL_WEIGHT) {
43     // LCOV_EXCL_START
44     return CeedError(ceed, CEED_ERROR_BACKEND,
45                      "An input vector is required for this CeedEvalMode");
46     // LCOV_EXCL_STOP
47   }
48   ierr = CeedVectorGetArray(V, CEED_MEM_DEVICE, &v); CeedChkBackend(ierr);
49 
50   CeedBasis_Magma *impl;
51   ierr = CeedBasisGetData(basis, &impl); CeedChkBackend(ierr);
52 
53   CeedInt P1d, Q1d;
54   ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
55   ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr);
56 
57   CeedDebug(ceed, "\033[01m[CeedBasisApply_Magma] vsize=%d, comp = %d",
58             ncomp*CeedIntPow(P1d, dim), ncomp);
59 
60   if (tmode == CEED_TRANSPOSE) {
61     CeedInt length;
62     ierr = CeedVectorGetLength(V, &length); CeedChkBackend(ierr);
63     if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
64       magmablas_slaset(MagmaFull, length, 1, 0., 0., (float *) v, length,
65                        data->queue);
66     } else {
67       magmablas_dlaset(MagmaFull, length, 1, 0., 0., (double *) v, length,
68                        data->queue);
69     }
70     ceed_magma_queue_sync( data->queue );
71   }
72   switch (emode) {
73   case CEED_EVAL_INTERP: {
74     CeedInt P = P1d, Q = Q1d;
75     if (tmode == CEED_TRANSPOSE) {
76       P = Q1d; Q = P1d;
77     }
78 
79     // Define element sizes for dofs/quad
80     CeedInt elquadsize = CeedIntPow(Q1d, dim);
81     CeedInt eldofssize = CeedIntPow(P1d, dim);
82 
83     // E-vector ordering -------------- Q-vector ordering
84     //  component                        component
85     //    elem                             elem
86     //       node                            node
87 
88     // ---  Define strides for NOTRANSPOSE mode: ---
89     // Input (u) is E-vector, output (v) is Q-vector
90 
91     // Element strides
92     CeedInt u_elstride = eldofssize;
93     CeedInt v_elstride = elquadsize;
94     // Component strides
95     CeedInt u_compstride = nelem * eldofssize;
96     CeedInt v_compstride = nelem * elquadsize;
97 
98     // ---  Swap strides for TRANSPOSE mode: ---
99     if (tmode == CEED_TRANSPOSE) {
100       // Input (u) is Q-vector, output (v) is E-vector
101       // Element strides
102       v_elstride = eldofssize;
103       u_elstride = elquadsize;
104       // Component strides
105       v_compstride = nelem * eldofssize;
106       u_compstride = nelem * elquadsize;
107     }
108 
109     ierr = magma_interp(P, Q, dim, ncomp,
110                         impl->dinterp1d, tmode,
111                         u, u_elstride, u_compstride,
112                         v, v_elstride, v_compstride,
113                         nelem, data->basis_kernel_mode, data->maxthreads,
114                         data->queue);
115     if (ierr != 0) return CeedError(ceed, CEED_ERROR_BACKEND,
116                                       "MAGMA: launch failure detected for magma_interp");
117   }
118   break;
119   case CEED_EVAL_GRAD: {
120     CeedInt P = P1d, Q = Q1d;
121     // In CEED_NOTRANSPOSE mode:
122     // u is (P^dim x nc), column-major layout (nc = ncomp)
123     // v is (Q^dim x nc x dim), column-major layout (nc = ncomp)
124     // In CEED_TRANSPOSE mode, the sizes of u and v are switched.
125     if (tmode == CEED_TRANSPOSE) {
126       P = Q1d, Q = P1d;
127     }
128 
129     // Define element sizes for dofs/quad
130     CeedInt elquadsize = CeedIntPow(Q1d, dim);
131     CeedInt eldofssize = CeedIntPow(P1d, dim);
132 
133     // E-vector ordering -------------- Q-vector ordering
134     //                                  dim
135     //  component                        component
136     //    elem                              elem
137     //       node                            node
138 
139     // ---  Define strides for NOTRANSPOSE mode: ---
140     // Input (u) is E-vector, output (v) is Q-vector
141 
142     // Element strides
143     CeedInt u_elstride = eldofssize;
144     CeedInt v_elstride = elquadsize;
145     // Component strides
146     CeedInt u_compstride = nelem * eldofssize;
147     CeedInt v_compstride = nelem * elquadsize;
148     // Dimension strides
149     CeedInt u_dimstride = 0;
150     CeedInt v_dimstride = nelem * elquadsize * ncomp;
151 
152     // ---  Swap strides for TRANSPOSE mode: ---
153     if (tmode == CEED_TRANSPOSE) {
154       // Input (u) is Q-vector, output (v) is E-vector
155       // Element strides
156       v_elstride = eldofssize;
157       u_elstride = elquadsize;
158       // Component strides
159       v_compstride = nelem * eldofssize;
160       u_compstride = nelem * elquadsize;
161       // Dimension strides
162       v_dimstride = 0;
163       u_dimstride = nelem * elquadsize * ncomp;
164 
165     }
166 
167     ierr = magma_grad( P, Q, dim, ncomp,
168                        impl->dinterp1d, impl->dgrad1d, tmode,
169                        u, u_elstride, u_compstride, u_dimstride,
170                        v, v_elstride, v_compstride, v_dimstride,
171                        nelem, data->basis_kernel_mode, data->maxthreads,
172                        data->queue);
173     if (ierr != 0) return CeedError(ceed, CEED_ERROR_BACKEND,
174                                       "MAGMA: launch failure detected for magma_grad");
175   }
176   break;
177   case CEED_EVAL_WEIGHT: {
178     if (tmode == CEED_TRANSPOSE)
179       // LCOV_EXCL_START
180       return CeedError(ceed, CEED_ERROR_BACKEND,
181                        "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE");
182     // LCOV_EXCL_STOP
183     CeedInt Q = Q1d;
184     int eldofssize = CeedIntPow(Q, dim);
185     ierr = magma_weight(Q, dim, impl->dqweight1d, v, eldofssize, nelem,
186                         data->basis_kernel_mode, data->maxthreads, data->queue);
187     if (ierr != 0) return CeedError(ceed, CEED_ERROR_BACKEND,
188                                       "MAGMA: launch failure detected for magma_weight");
189   }
190   break;
191   // LCOV_EXCL_START
192   case CEED_EVAL_DIV:
193     return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported");
194   case CEED_EVAL_CURL:
195     return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported");
196   case CEED_EVAL_NONE:
197     return CeedError(ceed, CEED_ERROR_BACKEND,
198                      "CEED_EVAL_NONE does not make sense in this context");
199     // LCOV_EXCL_STOP
200   }
201 
202   // must sync to ensure completeness
203   ceed_magma_queue_sync( data->queue );
204 
205   if (emode!=CEED_EVAL_WEIGHT) {
206     ierr = CeedVectorRestoreArrayRead(U, &u); CeedChkBackend(ierr);
207   }
208   ierr = CeedVectorRestoreArray(V, &v); CeedChkBackend(ierr);
209   return CEED_ERROR_SUCCESS;
210 }
211 
212 #ifdef __cplusplus
213 CEED_INTERN "C"
214 #endif
215 int CeedBasisApplyNonTensor_f64_Magma(CeedBasis basis, CeedInt nelem,
216                                       CeedTransposeMode tmode, CeedEvalMode emode,
217                                       CeedVector U, CeedVector V) {
218   int ierr;
219   Ceed ceed;
220   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
221 
222   Ceed_Magma *data;
223   ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr);
224 
225   CeedInt dim, ncomp, ndof, nqpt;
226   ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
227   ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChkBackend(ierr);
228   ierr = CeedBasisGetNumNodes(basis, &ndof); CeedChkBackend(ierr);
229   ierr = CeedBasisGetNumQuadraturePoints(basis, &nqpt); CeedChkBackend(ierr);
230   const CeedScalar *du;
231   CeedScalar *dv;
232   if (emode != CEED_EVAL_WEIGHT) {
233     ierr = CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &du); CeedChkBackend(ierr);
234   } else if (emode != CEED_EVAL_WEIGHT) {
235     // LCOV_EXCL_START
236     return CeedError(ceed, CEED_ERROR_BACKEND,
237                      "An input vector is required for this CeedEvalMode");
238     // LCOV_EXCL_STOP
239   }
240   ierr = CeedVectorGetArray(V, CEED_MEM_DEVICE, &dv); CeedChkBackend(ierr);
241 
242   CeedBasisNonTensor_Magma *impl;
243   ierr = CeedBasisGetData(basis, &impl); CeedChkBackend(ierr);
244 
245   CeedDebug(ceed, "\033[01m[CeedBasisApplyNonTensor_Magma] vsize=%d, comp = %d",
246             ncomp*ndof, ncomp);
247 
248   if (tmode == CEED_TRANSPOSE) {
249     CeedInt length;
250     ierr = CeedVectorGetLength(V, &length);
251     if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
252       magmablas_slaset(MagmaFull, length, 1, 0., 0., (float *) dv, length,
253                        data->queue);
254     } else {
255       magmablas_dlaset(MagmaFull, length, 1, 0., 0., (double *) dv, length,
256                        data->queue);
257     }
258     ceed_magma_queue_sync( data->queue );
259   }
260 
261   switch (emode) {
262   case CEED_EVAL_INTERP: {
263     CeedInt P = ndof, Q = nqpt;
264     if (tmode == CEED_TRANSPOSE)
265       magma_dgemm_nontensor(MagmaNoTrans, MagmaNoTrans,
266                             P, nelem*ncomp, Q,
267                             1.0, (double *)impl->dinterp, P,
268                             (double *)du, Q,
269                             0.0, (double *)dv, P, data->queue);
270     else
271       magma_dgemm_nontensor(MagmaTrans, MagmaNoTrans,
272                             Q, nelem*ncomp, P,
273                             1.0, (double *)impl->dinterp, P,
274                             (double *)du, P,
275                             0.0, (double *)dv, Q, data->queue);
276   }
277   break;
278 
279   case CEED_EVAL_GRAD: {
280     CeedInt P = ndof, Q = nqpt;
281     if (tmode == CEED_TRANSPOSE) {
282       CeedScalar beta = 0.0;
283       for(int d=0; d<dim; d++) {
284         if (d>0)
285           beta = 1.0;
286         magma_dgemm_nontensor(MagmaNoTrans, MagmaNoTrans,
287                               P, nelem*ncomp, Q,
288                               1.0, (double *)(impl->dgrad + d*P*Q), P,
289                               (double *)(du + d*nelem*ncomp*Q), Q,
290                               beta, (double *)dv, P, data->queue);
291       }
292     } else {
293       for(int d=0; d< dim; d++)
294         magma_dgemm_nontensor(MagmaTrans, MagmaNoTrans,
295                               Q, nelem*ncomp, P,
296                               1.0, (double *)(impl->dgrad + d*P*Q), P,
297                               (double *)du, P,
298                               0.0, (double *)(dv + d*nelem*ncomp*Q), Q, data->queue);
299     }
300   }
301   break;
302 
303   case CEED_EVAL_WEIGHT: {
304     if (tmode == CEED_TRANSPOSE)
305       // LCOV_EXCL_START
306       return CeedError(ceed, CEED_ERROR_BACKEND,
307                        "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE");
308     // LCOV_EXCL_STOP
309 
310     int elemsPerBlock = 1;//basis->Q1d < 7 ? optElems[basis->Q1d] : 1;
311     int grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem)?
312                                        1 : 0 );
313     magma_weight_nontensor(grid, nqpt, nelem, nqpt, impl->dqweight, dv,
314                            data->queue);
315     CeedChkBackend(ierr);
316   }
317   break;
318 
319   // LCOV_EXCL_START
320   case CEED_EVAL_DIV:
321     return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported");
322   case CEED_EVAL_CURL:
323     return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported");
324   case CEED_EVAL_NONE:
325     return CeedError(ceed, CEED_ERROR_BACKEND,
326                      "CEED_EVAL_NONE does not make sense in this context");
327     // LCOV_EXCL_STOP
328   }
329 
330   // must sync to ensure completeness
331   ceed_magma_queue_sync( data->queue );
332 
333   if (emode!=CEED_EVAL_WEIGHT) {
334     ierr = CeedVectorRestoreArrayRead(U, &du); CeedChkBackend(ierr);
335   }
336   ierr = CeedVectorRestoreArray(V, &dv); CeedChkBackend(ierr);
337   return CEED_ERROR_SUCCESS;
338 }
339 
340 int CeedBasisApplyNonTensor_f32_Magma(CeedBasis basis, CeedInt nelem,
341                                       CeedTransposeMode tmode, CeedEvalMode emode,
342                                       CeedVector U, CeedVector V) {
343   int ierr;
344   Ceed ceed;
345   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
346 
347   Ceed_Magma *data;
348   ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr);
349 
350   CeedInt dim, ncomp, ndof, nqpt;
351   ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
352   ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChkBackend(ierr);
353   ierr = CeedBasisGetNumNodes(basis, &ndof); CeedChkBackend(ierr);
354   ierr = CeedBasisGetNumQuadraturePoints(basis, &nqpt); CeedChkBackend(ierr);
355   const CeedScalar *du;
356   CeedScalar *dv;
357   if (emode != CEED_EVAL_WEIGHT) {
358     ierr = CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &du); CeedChkBackend(ierr);
359   } else if (emode != CEED_EVAL_WEIGHT) {
360     // LCOV_EXCL_START
361     return CeedError(ceed, CEED_ERROR_BACKEND,
362                      "An input vector is required for this CeedEvalMode");
363     // LCOV_EXCL_STOP
364   }
365   ierr = CeedVectorGetArray(V, CEED_MEM_DEVICE, &dv); CeedChkBackend(ierr);
366 
367   CeedBasisNonTensor_Magma *impl;
368   ierr = CeedBasisGetData(basis, &impl); CeedChkBackend(ierr);
369 
370   CeedDebug(ceed, "\033[01m[CeedBasisApplyNonTensor_Magma] vsize=%d, comp = %d",
371             ncomp*ndof, ncomp);
372 
373   if (tmode == CEED_TRANSPOSE) {
374     CeedInt length;
375     ierr = CeedVectorGetLength(V, &length);
376     if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
377       magmablas_slaset(MagmaFull, length, 1, 0., 0., (float *) dv, length,
378                        data->queue);
379     } else {
380       magmablas_dlaset(MagmaFull, length, 1, 0., 0., (double *) dv, length,
381                        data->queue);
382     }
383     ceed_magma_queue_sync( data->queue );
384   }
385 
386   switch (emode) {
387   case CEED_EVAL_INTERP: {
388     CeedInt P = ndof, Q = nqpt;
389     if (tmode == CEED_TRANSPOSE)
390       magma_sgemm_nontensor(MagmaNoTrans, MagmaNoTrans,
391                             P, nelem*ncomp, Q,
392                             1.0, (float *)impl->dinterp, P,
393                             (float *)du, Q,
394                             0.0, (float *)dv, P, data->queue);
395     else
396       magma_sgemm_nontensor(MagmaTrans, MagmaNoTrans,
397                             Q, nelem*ncomp, P,
398                             1.0, (float *)impl->dinterp, P,
399                             (float *)du, P,
400                             0.0, (float *)dv, Q, data->queue);
401   }
402   break;
403 
404   case CEED_EVAL_GRAD: {
405     CeedInt P = ndof, Q = nqpt;
406     if (tmode == CEED_TRANSPOSE) {
407       CeedScalar beta = 0.0;
408       for(int d=0; d<dim; d++) {
409         if (d>0)
410           beta = 1.0;
411         magma_sgemm_nontensor(MagmaNoTrans, MagmaNoTrans,
412                               P, nelem*ncomp, Q,
413                               1.0, (float *)(impl->dgrad + d*P*Q), P,
414                               (float *)(du + d*nelem*ncomp*Q), Q,
415                               beta, (float *)dv, P, data->queue);
416       }
417     } else {
418       for(int d=0; d< dim; d++)
419         magma_sgemm_nontensor(MagmaTrans, MagmaNoTrans,
420                               Q, nelem*ncomp, P,
421                               1.0, (float *)(impl->dgrad + d*P*Q), P,
422                               (float *)du, P,
423                               0.0, (float *)(dv + d*nelem*ncomp*Q), Q, data->queue);
424     }
425   }
426   break;
427 
428   case CEED_EVAL_WEIGHT: {
429     if (tmode == CEED_TRANSPOSE)
430       // LCOV_EXCL_START
431       return CeedError(ceed, CEED_ERROR_BACKEND,
432                        "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE");
433     // LCOV_EXCL_STOP
434 
435     int elemsPerBlock = 1;//basis->Q1d < 7 ? optElems[basis->Q1d] : 1;
436     int grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem)?
437                                        1 : 0 );
438     magma_weight_nontensor(grid, nqpt, nelem, nqpt, impl->dqweight, dv,
439                            data->queue);
440     CeedChkBackend(ierr);
441   }
442   break;
443 
444   // LCOV_EXCL_START
445   case CEED_EVAL_DIV:
446     return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported");
447   case CEED_EVAL_CURL:
448     return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported");
449   case CEED_EVAL_NONE:
450     return CeedError(ceed, CEED_ERROR_BACKEND,
451                      "CEED_EVAL_NONE does not make sense in this context");
452     // LCOV_EXCL_STOP
453   }
454 
455   // must sync to ensure completeness
456   ceed_magma_queue_sync( data->queue );
457 
458   if (emode!=CEED_EVAL_WEIGHT) {
459     ierr = CeedVectorRestoreArrayRead(U, &du); CeedChkBackend(ierr);
460   }
461   ierr = CeedVectorRestoreArray(V, &dv); CeedChkBackend(ierr);
462   return CEED_ERROR_SUCCESS;
463 }
464 
465 #ifdef __cplusplus
466 CEED_INTERN "C"
467 #endif
468 int CeedBasisDestroy_Magma(CeedBasis basis) {
469   int ierr;
470   CeedBasis_Magma *impl;
471   ierr = CeedBasisGetData(basis, &impl); CeedChkBackend(ierr);
472 
473   ierr = magma_free(impl->dqref1d); CeedChkBackend(ierr);
474   ierr = magma_free(impl->dinterp1d); CeedChkBackend(ierr);
475   ierr = magma_free(impl->dgrad1d); CeedChkBackend(ierr);
476   ierr = magma_free(impl->dqweight1d); CeedChkBackend(ierr);
477 
478   ierr = CeedFree(&impl); CeedChkBackend(ierr);
479 
480   return CEED_ERROR_SUCCESS;
481 }
482 
483 #ifdef __cplusplus
484 CEED_INTERN "C"
485 #endif
486 int CeedBasisDestroyNonTensor_Magma(CeedBasis basis) {
487   int ierr;
488   CeedBasisNonTensor_Magma *impl;
489   ierr = CeedBasisGetData(basis, &impl); CeedChkBackend(ierr);
490 
491   ierr = magma_free(impl->dqref); CeedChkBackend(ierr);
492   ierr = magma_free(impl->dinterp); CeedChkBackend(ierr);
493   ierr = magma_free(impl->dgrad); CeedChkBackend(ierr);
494   ierr = magma_free(impl->dqweight); CeedChkBackend(ierr);
495 
496   ierr = CeedFree(&impl); CeedChkBackend(ierr);
497 
498   return CEED_ERROR_SUCCESS;
499 }
500 
501 #ifdef __cplusplus
502 CEED_INTERN "C"
503 #endif
504 int CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P1d, CeedInt Q1d,
505                                   const CeedScalar *interp1d,
506                                   const CeedScalar *grad1d,
507                                   const CeedScalar *qref1d,
508                                   const CeedScalar *qweight1d, CeedBasis basis) {
509   int ierr;
510   CeedBasis_Magma *impl;
511   Ceed ceed;
512   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
513 
514   // Check for supported parameters
515   CeedInt ncomp = 0;
516   ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChkBackend(ierr);
517   if (ncomp > 3)
518     // LCOV_EXCL_START
519     return CeedError(ceed, CEED_ERROR_BACKEND,
520                      "Magma backend does not support tensor bases with more than 3 components");
521   // LCOV_EXCL_STOP
522   if (P1d > 10)
523     // LCOV_EXCL_START
524     return CeedError(ceed, CEED_ERROR_BACKEND,
525                      "Magma backend does not support tensor bases with more than 10 nodes in each dimension");
526   // LCOV_EXCL_STOP
527   if (Q1d > 10)
528     // LCOV_EXCL_START
529     return CeedError(ceed, CEED_ERROR_BACKEND,
530                      "Magma backend does not support tensor bases with more than 10 quadrature points in each dimension");
531   // LCOV_EXCL_STOP
532 
533   Ceed_Magma *data;
534   ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr);
535 
536   ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply",
537                                 CeedBasisApply_Magma); CeedChkBackend(ierr);
538   ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy",
539                                 CeedBasisDestroy_Magma); CeedChkBackend(ierr);
540 
541   ierr = CeedCalloc(1,&impl); CeedChkBackend(ierr);
542   ierr = CeedBasisSetData(basis, impl); CeedChkBackend(ierr);
543 
544   // Copy qref1d to the GPU
545   ierr = magma_malloc((void **)&impl->dqref1d, Q1d*sizeof(qref1d[0]));
546   CeedChkBackend(ierr);
547   magma_setvector(Q1d, sizeof(qref1d[0]), qref1d, 1, impl->dqref1d, 1,
548                   data->queue);
549 
550   // Copy interp1d to the GPU
551   ierr = magma_malloc((void **)&impl->dinterp1d, Q1d*P1d*sizeof(interp1d[0]));
552   CeedChkBackend(ierr);
553   magma_setvector(Q1d*P1d, sizeof(interp1d[0]), interp1d, 1, impl->dinterp1d, 1,
554                   data->queue);
555 
556   // Copy grad1d to the GPU
557   ierr = magma_malloc((void **)&impl->dgrad1d, Q1d*P1d*sizeof(grad1d[0]));
558   CeedChkBackend(ierr);
559   magma_setvector(Q1d*P1d, sizeof(grad1d[0]), grad1d, 1, impl->dgrad1d, 1,
560                   data->queue);
561 
562   // Copy qweight1d to the GPU
563   ierr = magma_malloc((void **)&impl->dqweight1d, Q1d*sizeof(qweight1d[0]));
564   CeedChkBackend(ierr);
565   magma_setvector(Q1d, sizeof(qweight1d[0]), qweight1d, 1, impl->dqweight1d, 1,
566                   data->queue);
567 
568   return CEED_ERROR_SUCCESS;
569 }
570 
571 #ifdef __cplusplus
572 CEED_INTERN "C"
573 #endif
574 int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt ndof,
575                             CeedInt nqpts, const CeedScalar *interp,
576                             const CeedScalar *grad, const CeedScalar *qref,
577                             const CeedScalar *qweight, CeedBasis basis) {
578   int ierr;
579   CeedBasisNonTensor_Magma *impl;
580   Ceed ceed;
581   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
582 
583   Ceed_Magma *data;
584   ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr);
585 
586   if (CEED_SCALAR_TYPE == CEED_SCALAR_FP64) {
587     ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply",
588                                   CeedBasisApplyNonTensor_f64_Magma);
589     CeedChkBackend(ierr);
590   } else {
591     ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply",
592                                   CeedBasisApplyNonTensor_f32_Magma);
593     CeedChkBackend(ierr);
594   }
595   ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy",
596                                 CeedBasisDestroyNonTensor_Magma); CeedChkBackend(ierr);
597 
598   ierr = CeedCalloc(1,&impl); CeedChkBackend(ierr);
599   ierr = CeedBasisSetData(basis, impl); CeedChkBackend(ierr);
600 
601   // Copy qref to the GPU
602   ierr = magma_malloc((void **)&impl->dqref, nqpts*sizeof(qref[0]));
603   CeedChkBackend(ierr);
604   magma_setvector(nqpts, sizeof(qref[0]), qref, 1, impl->dqref, 1, data->queue);
605 
606   // Copy interp to the GPU
607   ierr = magma_malloc((void **)&impl->dinterp, nqpts*ndof*sizeof(interp[0]));
608   CeedChkBackend(ierr);
609   magma_setvector(nqpts*ndof, sizeof(interp[0]), interp, 1, impl->dinterp, 1,
610                   data->queue);
611 
612   // Copy grad to the GPU
613   ierr = magma_malloc((void **)&impl->dgrad, nqpts*ndof*dim*sizeof(grad[0]));
614   CeedChkBackend(ierr);
615   magma_setvector(nqpts*ndof*dim, sizeof(grad[0]), grad, 1, impl->dgrad, 1,
616                   data->queue);
617 
618   // Copy qweight to the GPU
619   ierr = magma_malloc((void **)&impl->dqweight, nqpts*sizeof(qweight[0]));
620   CeedChkBackend(ierr);
621   magma_setvector(nqpts, sizeof(qweight[0]), qweight, 1, impl->dqweight, 1,
622                   data->queue);
623 
624   return CEED_ERROR_SUCCESS;
625 }
626