xref: /libCEED/backends/magma/ceed-magma-basis.c (revision 3d8e882215d238700cdceb37404f76ca7fa24eaa)
1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #include <ceed/ceed.h>
9 #include <ceed/backend.h>
10 #include "ceed-magma.h"
11 
12 #ifdef __cplusplus
13 CEED_INTERN "C"
14 #endif
15 int CeedBasisApply_Magma(CeedBasis basis, CeedInt nelem,
16                          CeedTransposeMode tmode, CeedEvalMode emode,
17                          CeedVector U, CeedVector V) {
18   int ierr;
19   Ceed ceed;
20   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
21   CeedInt dim, ncomp, ndof;
22   ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
23   ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChkBackend(ierr);
24   ierr = CeedBasisGetNumNodes(basis, &ndof); CeedChkBackend(ierr);
25 
26   Ceed_Magma *data;
27   ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr);
28 
29   const CeedScalar *u;
30   CeedScalar *v;
31   if (emode != CEED_EVAL_WEIGHT) {
32     ierr = CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &u); CeedChkBackend(ierr);
33   } else if (emode != CEED_EVAL_WEIGHT) {
34     // LCOV_EXCL_START
35     return CeedError(ceed, CEED_ERROR_BACKEND,
36                      "An input vector is required for this CeedEvalMode");
37     // LCOV_EXCL_STOP
38   }
39   ierr = CeedVectorGetArrayWrite(V, CEED_MEM_DEVICE, &v); CeedChkBackend(ierr);
40 
41   CeedBasis_Magma *impl;
42   ierr = CeedBasisGetData(basis, &impl); CeedChkBackend(ierr);
43 
44   CeedInt P1d, Q1d;
45   ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
46   ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr);
47 
48   CeedDebug(ceed, "\033[01m[CeedBasisApply_Magma] vsize=%d, comp = %d",
49             ncomp*CeedIntPow(P1d, dim), ncomp);
50 
51   if (tmode == CEED_TRANSPOSE) {
52     CeedSize length;
53     ierr = CeedVectorGetLength(V, &length); CeedChkBackend(ierr);
54     if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
55       magmablas_slaset(MagmaFull, length, 1, 0., 0., (float *) v, length,
56                        data->queue);
57     } else {
58       magmablas_dlaset(MagmaFull, length, 1, 0., 0., (double *) v, length,
59                        data->queue);
60     }
61     ceed_magma_queue_sync( data->queue );
62   }
63   switch (emode) {
64   case CEED_EVAL_INTERP: {
65     CeedInt P = P1d, Q = Q1d;
66     if (tmode == CEED_TRANSPOSE) {
67       P = Q1d; Q = P1d;
68     }
69 
70     // Define element sizes for dofs/quad
71     CeedInt elquadsize = CeedIntPow(Q1d, dim);
72     CeedInt eldofssize = CeedIntPow(P1d, dim);
73 
74     // E-vector ordering -------------- Q-vector ordering
75     //  component                        component
76     //    elem                             elem
77     //       node                            node
78 
79     // ---  Define strides for NOTRANSPOSE mode: ---
80     // Input (u) is E-vector, output (v) is Q-vector
81 
82     // Element strides
83     CeedInt u_elstride = eldofssize;
84     CeedInt v_elstride = elquadsize;
85     // Component strides
86     CeedInt u_compstride = nelem * eldofssize;
87     CeedInt v_compstride = nelem * elquadsize;
88 
89     // ---  Swap strides for TRANSPOSE mode: ---
90     if (tmode == CEED_TRANSPOSE) {
91       // Input (u) is Q-vector, output (v) is E-vector
92       // Element strides
93       v_elstride = eldofssize;
94       u_elstride = elquadsize;
95       // Component strides
96       v_compstride = nelem * eldofssize;
97       u_compstride = nelem * elquadsize;
98     }
99 
100     ierr = magma_interp(P, Q, dim, ncomp,
101                         impl->dinterp1d, tmode,
102                         u, u_elstride, u_compstride,
103                         v, v_elstride, v_compstride,
104                         nelem, data->basis_kernel_mode,
105                         data->queue);
106     if (ierr != 0) return CeedError(ceed, CEED_ERROR_BACKEND,
107                                       "MAGMA: launch failure detected for magma_interp");
108   }
109   break;
110   case CEED_EVAL_GRAD: {
111     CeedInt P = P1d, Q = Q1d;
112     // In CEED_NOTRANSPOSE mode:
113     // u is (P^dim x nc), column-major layout (nc = ncomp)
114     // v is (Q^dim x nc x dim), column-major layout (nc = ncomp)
115     // In CEED_TRANSPOSE mode, the sizes of u and v are switched.
116     if (tmode == CEED_TRANSPOSE) {
117       P = Q1d, Q = P1d;
118     }
119 
120     // Define element sizes for dofs/quad
121     CeedInt elquadsize = CeedIntPow(Q1d, dim);
122     CeedInt eldofssize = CeedIntPow(P1d, dim);
123 
124     // E-vector ordering -------------- Q-vector ordering
125     //                                  dim
126     //  component                        component
127     //    elem                              elem
128     //       node                            node
129 
130     // ---  Define strides for NOTRANSPOSE mode: ---
131     // Input (u) is E-vector, output (v) is Q-vector
132 
133     // Element strides
134     CeedInt u_elstride = eldofssize;
135     CeedInt v_elstride = elquadsize;
136     // Component strides
137     CeedInt u_compstride = nelem * eldofssize;
138     CeedInt v_compstride = nelem * elquadsize;
139     // Dimension strides
140     CeedInt u_dimstride = 0;
141     CeedInt v_dimstride = nelem * elquadsize * ncomp;
142 
143     // ---  Swap strides for TRANSPOSE mode: ---
144     if (tmode == CEED_TRANSPOSE) {
145       // Input (u) is Q-vector, output (v) is E-vector
146       // Element strides
147       v_elstride = eldofssize;
148       u_elstride = elquadsize;
149       // Component strides
150       v_compstride = nelem * eldofssize;
151       u_compstride = nelem * elquadsize;
152       // Dimension strides
153       v_dimstride = 0;
154       u_dimstride = nelem * elquadsize * ncomp;
155 
156     }
157 
158     ierr = magma_grad( P, Q, dim, ncomp,
159                        impl->dinterp1d, impl->dgrad1d, tmode,
160                        u, u_elstride, u_compstride, u_dimstride,
161                        v, v_elstride, v_compstride, v_dimstride,
162                        nelem, data->basis_kernel_mode,
163                        data->queue);
164     if (ierr != 0) return CeedError(ceed, CEED_ERROR_BACKEND,
165                                       "MAGMA: launch failure detected for magma_grad");
166   }
167   break;
168   case CEED_EVAL_WEIGHT: {
169     if (tmode == CEED_TRANSPOSE)
170       // LCOV_EXCL_START
171       return CeedError(ceed, CEED_ERROR_BACKEND,
172                        "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE");
173     // LCOV_EXCL_STOP
174     CeedInt Q = Q1d;
175     int eldofssize = CeedIntPow(Q, dim);
176     ierr = magma_weight(Q, dim, impl->dqweight1d, v, eldofssize, nelem,
177                         data->basis_kernel_mode, data->queue);
178     if (ierr != 0) return CeedError(ceed, CEED_ERROR_BACKEND,
179                                       "MAGMA: launch failure detected for magma_weight");
180   }
181   break;
182   // LCOV_EXCL_START
183   case CEED_EVAL_DIV:
184     return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported");
185   case CEED_EVAL_CURL:
186     return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported");
187   case CEED_EVAL_NONE:
188     return CeedError(ceed, CEED_ERROR_BACKEND,
189                      "CEED_EVAL_NONE does not make sense in this context");
190     // LCOV_EXCL_STOP
191   }
192 
193   // must sync to ensure completeness
194   ceed_magma_queue_sync( data->queue );
195 
196   if (emode!=CEED_EVAL_WEIGHT) {
197     ierr = CeedVectorRestoreArrayRead(U, &u); CeedChkBackend(ierr);
198   }
199   ierr = CeedVectorRestoreArray(V, &v); CeedChkBackend(ierr);
200   return CEED_ERROR_SUCCESS;
201 }
202 
203 #ifdef __cplusplus
204 CEED_INTERN "C"
205 #endif
206 int CeedBasisApplyNonTensor_f64_Magma(CeedBasis basis, CeedInt nelem,
207                                       CeedTransposeMode tmode, CeedEvalMode emode,
208                                       CeedVector U, CeedVector V) {
209   int ierr;
210   Ceed ceed;
211   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
212 
213   Ceed_Magma *data;
214   ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr);
215 
216   CeedInt dim, ncomp, ndof, nqpt;
217   ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
218   ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChkBackend(ierr);
219   ierr = CeedBasisGetNumNodes(basis, &ndof); CeedChkBackend(ierr);
220   ierr = CeedBasisGetNumQuadraturePoints(basis, &nqpt); CeedChkBackend(ierr);
221   const CeedScalar *du;
222   CeedScalar *dv;
223   if (emode != CEED_EVAL_WEIGHT) {
224     ierr = CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &du); CeedChkBackend(ierr);
225   } else if (emode != CEED_EVAL_WEIGHT) {
226     // LCOV_EXCL_START
227     return CeedError(ceed, CEED_ERROR_BACKEND,
228                      "An input vector is required for this CeedEvalMode");
229     // LCOV_EXCL_STOP
230   }
231   ierr = CeedVectorGetArrayWrite(V, CEED_MEM_DEVICE, &dv); CeedChkBackend(ierr);
232 
233   CeedBasisNonTensor_Magma *impl;
234   ierr = CeedBasisGetData(basis, &impl); CeedChkBackend(ierr);
235 
236   CeedDebug(ceed, "\033[01m[CeedBasisApplyNonTensor_Magma] vsize=%d, comp = %d",
237             ncomp*ndof, ncomp);
238 
239   if (tmode == CEED_TRANSPOSE) {
240     CeedSize length;
241     ierr = CeedVectorGetLength(V, &length);
242     if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
243       magmablas_slaset(MagmaFull, length, 1, 0., 0., (float *) dv, length,
244                        data->queue);
245     } else {
246       magmablas_dlaset(MagmaFull, length, 1, 0., 0., (double *) dv, length,
247                        data->queue);
248     }
249     ceed_magma_queue_sync( data->queue );
250   }
251 
252   switch (emode) {
253   case CEED_EVAL_INTERP: {
254     CeedInt P = ndof, Q = nqpt;
255     if (tmode == CEED_TRANSPOSE)
256       magma_dgemm_nontensor(MagmaNoTrans, MagmaNoTrans,
257                             P, nelem*ncomp, Q,
258                             1.0, (double *)impl->dinterp, P,
259                             (double *)du, Q,
260                             0.0, (double *)dv, P, data->queue);
261     else
262       magma_dgemm_nontensor(MagmaTrans, MagmaNoTrans,
263                             Q, nelem*ncomp, P,
264                             1.0, (double *)impl->dinterp, P,
265                             (double *)du, P,
266                             0.0, (double *)dv, Q, data->queue);
267   }
268   break;
269 
270   case CEED_EVAL_GRAD: {
271     CeedInt P = ndof, Q = nqpt;
272     if (tmode == CEED_TRANSPOSE) {
273       CeedScalar beta = 0.0;
274       for(int d=0; d<dim; d++) {
275         if (d>0)
276           beta = 1.0;
277         magma_dgemm_nontensor(MagmaNoTrans, MagmaNoTrans,
278                               P, nelem*ncomp, Q,
279                               1.0, (double *)(impl->dgrad + d*P*Q), P,
280                               (double *)(du + d*nelem*ncomp*Q), Q,
281                               beta, (double *)dv, P, data->queue);
282       }
283     } else {
284       for(int d=0; d< dim; d++)
285         magma_dgemm_nontensor(MagmaTrans, MagmaNoTrans,
286                               Q, nelem*ncomp, P,
287                               1.0, (double *)(impl->dgrad + d*P*Q), P,
288                               (double *)du, P,
289                               0.0, (double *)(dv + d*nelem*ncomp*Q), Q, data->queue);
290     }
291   }
292   break;
293 
294   case CEED_EVAL_WEIGHT: {
295     if (tmode == CEED_TRANSPOSE)
296       // LCOV_EXCL_START
297       return CeedError(ceed, CEED_ERROR_BACKEND,
298                        "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE");
299     // LCOV_EXCL_STOP
300 
301     int elemsPerBlock = 1;//basis->Q1d < 7 ? optElems[basis->Q1d] : 1;
302     int grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem)?
303                                        1 : 0 );
304     magma_weight_nontensor(grid, nqpt, nelem, nqpt, impl->dqweight, dv,
305                            data->queue);
306     CeedChkBackend(ierr);
307   }
308   break;
309 
310   // LCOV_EXCL_START
311   case CEED_EVAL_DIV:
312     return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported");
313   case CEED_EVAL_CURL:
314     return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported");
315   case CEED_EVAL_NONE:
316     return CeedError(ceed, CEED_ERROR_BACKEND,
317                      "CEED_EVAL_NONE does not make sense in this context");
318     // LCOV_EXCL_STOP
319   }
320 
321   // must sync to ensure completeness
322   ceed_magma_queue_sync( data->queue );
323 
324   if (emode!=CEED_EVAL_WEIGHT) {
325     ierr = CeedVectorRestoreArrayRead(U, &du); CeedChkBackend(ierr);
326   }
327   ierr = CeedVectorRestoreArray(V, &dv); CeedChkBackend(ierr);
328   return CEED_ERROR_SUCCESS;
329 }
330 
331 int CeedBasisApplyNonTensor_f32_Magma(CeedBasis basis, CeedInt nelem,
332                                       CeedTransposeMode tmode, CeedEvalMode emode,
333                                       CeedVector U, CeedVector V) {
334   int ierr;
335   Ceed ceed;
336   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
337 
338   Ceed_Magma *data;
339   ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr);
340 
341   CeedInt dim, ncomp, ndof, nqpt;
342   ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
343   ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChkBackend(ierr);
344   ierr = CeedBasisGetNumNodes(basis, &ndof); CeedChkBackend(ierr);
345   ierr = CeedBasisGetNumQuadraturePoints(basis, &nqpt); CeedChkBackend(ierr);
346   const CeedScalar *du;
347   CeedScalar *dv;
348   if (emode != CEED_EVAL_WEIGHT) {
349     ierr = CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &du); CeedChkBackend(ierr);
350   } else if (emode != CEED_EVAL_WEIGHT) {
351     // LCOV_EXCL_START
352     return CeedError(ceed, CEED_ERROR_BACKEND,
353                      "An input vector is required for this CeedEvalMode");
354     // LCOV_EXCL_STOP
355   }
356   ierr = CeedVectorGetArrayWrite(V, CEED_MEM_DEVICE, &dv); CeedChkBackend(ierr);
357 
358   CeedBasisNonTensor_Magma *impl;
359   ierr = CeedBasisGetData(basis, &impl); CeedChkBackend(ierr);
360 
361   CeedDebug(ceed, "\033[01m[CeedBasisApplyNonTensor_Magma] vsize=%d, comp = %d",
362             ncomp*ndof, ncomp);
363 
364   if (tmode == CEED_TRANSPOSE) {
365     CeedSize length;
366     ierr = CeedVectorGetLength(V, &length);
367     if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
368       magmablas_slaset(MagmaFull, length, 1, 0., 0., (float *) dv, length,
369                        data->queue);
370     } else {
371       magmablas_dlaset(MagmaFull, length, 1, 0., 0., (double *) dv, length,
372                        data->queue);
373     }
374     ceed_magma_queue_sync( data->queue );
375   }
376 
377   switch (emode) {
378   case CEED_EVAL_INTERP: {
379     CeedInt P = ndof, Q = nqpt;
380     if (tmode == CEED_TRANSPOSE)
381       magma_sgemm_nontensor(MagmaNoTrans, MagmaNoTrans,
382                             P, nelem*ncomp, Q,
383                             1.0, (float *)impl->dinterp, P,
384                             (float *)du, Q,
385                             0.0, (float *)dv, P, data->queue);
386     else
387       magma_sgemm_nontensor(MagmaTrans, MagmaNoTrans,
388                             Q, nelem*ncomp, P,
389                             1.0, (float *)impl->dinterp, P,
390                             (float *)du, P,
391                             0.0, (float *)dv, Q, data->queue);
392   }
393   break;
394 
395   case CEED_EVAL_GRAD: {
396     CeedInt P = ndof, Q = nqpt;
397     if (tmode == CEED_TRANSPOSE) {
398       CeedScalar beta = 0.0;
399       for(int d=0; d<dim; d++) {
400         if (d>0)
401           beta = 1.0;
402         magma_sgemm_nontensor(MagmaNoTrans, MagmaNoTrans,
403                               P, nelem*ncomp, Q,
404                               1.0, (float *)(impl->dgrad + d*P*Q), P,
405                               (float *)(du + d*nelem*ncomp*Q), Q,
406                               beta, (float *)dv, P, data->queue);
407       }
408     } else {
409       for(int d=0; d< dim; d++)
410         magma_sgemm_nontensor(MagmaTrans, MagmaNoTrans,
411                               Q, nelem*ncomp, P,
412                               1.0, (float *)(impl->dgrad + d*P*Q), P,
413                               (float *)du, P,
414                               0.0, (float *)(dv + d*nelem*ncomp*Q), Q, data->queue);
415     }
416   }
417   break;
418 
419   case CEED_EVAL_WEIGHT: {
420     if (tmode == CEED_TRANSPOSE)
421       // LCOV_EXCL_START
422       return CeedError(ceed, CEED_ERROR_BACKEND,
423                        "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE");
424     // LCOV_EXCL_STOP
425 
426     int elemsPerBlock = 1;//basis->Q1d < 7 ? optElems[basis->Q1d] : 1;
427     int grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem)?
428                                        1 : 0 );
429     magma_weight_nontensor(grid, nqpt, nelem, nqpt, impl->dqweight, dv,
430                            data->queue);
431     CeedChkBackend(ierr);
432   }
433   break;
434 
435   // LCOV_EXCL_START
436   case CEED_EVAL_DIV:
437     return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported");
438   case CEED_EVAL_CURL:
439     return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported");
440   case CEED_EVAL_NONE:
441     return CeedError(ceed, CEED_ERROR_BACKEND,
442                      "CEED_EVAL_NONE does not make sense in this context");
443     // LCOV_EXCL_STOP
444   }
445 
446   // must sync to ensure completeness
447   ceed_magma_queue_sync( data->queue );
448 
449   if (emode!=CEED_EVAL_WEIGHT) {
450     ierr = CeedVectorRestoreArrayRead(U, &du); CeedChkBackend(ierr);
451   }
452   ierr = CeedVectorRestoreArray(V, &dv); CeedChkBackend(ierr);
453   return CEED_ERROR_SUCCESS;
454 }
455 
456 #ifdef __cplusplus
457 CEED_INTERN "C"
458 #endif
459 int CeedBasisDestroy_Magma(CeedBasis basis) {
460   int ierr;
461   CeedBasis_Magma *impl;
462   ierr = CeedBasisGetData(basis, &impl); CeedChkBackend(ierr);
463 
464   ierr = magma_free(impl->dqref1d); CeedChkBackend(ierr);
465   ierr = magma_free(impl->dinterp1d); CeedChkBackend(ierr);
466   ierr = magma_free(impl->dgrad1d); CeedChkBackend(ierr);
467   ierr = magma_free(impl->dqweight1d); CeedChkBackend(ierr);
468 
469   ierr = CeedFree(&impl); CeedChkBackend(ierr);
470 
471   return CEED_ERROR_SUCCESS;
472 }
473 
474 #ifdef __cplusplus
475 CEED_INTERN "C"
476 #endif
477 int CeedBasisDestroyNonTensor_Magma(CeedBasis basis) {
478   int ierr;
479   CeedBasisNonTensor_Magma *impl;
480   ierr = CeedBasisGetData(basis, &impl); CeedChkBackend(ierr);
481 
482   ierr = magma_free(impl->dqref); CeedChkBackend(ierr);
483   ierr = magma_free(impl->dinterp); CeedChkBackend(ierr);
484   ierr = magma_free(impl->dgrad); CeedChkBackend(ierr);
485   ierr = magma_free(impl->dqweight); CeedChkBackend(ierr);
486 
487   ierr = CeedFree(&impl); CeedChkBackend(ierr);
488 
489   return CEED_ERROR_SUCCESS;
490 }
491 
492 #ifdef __cplusplus
493 CEED_INTERN "C"
494 #endif
495 int CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P1d, CeedInt Q1d,
496                                   const CeedScalar *interp1d,
497                                   const CeedScalar *grad1d,
498                                   const CeedScalar *qref1d,
499                                   const CeedScalar *qweight1d, CeedBasis basis) {
500   int ierr;
501   CeedBasis_Magma *impl;
502   Ceed ceed;
503   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
504 
505   // Check for supported parameters
506   CeedInt ncomp = 0;
507   ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChkBackend(ierr);
508   if (ncomp > 3)
509     // LCOV_EXCL_START
510     return CeedError(ceed, CEED_ERROR_BACKEND,
511                      "Magma backend does not support tensor bases with more than 3 components");
512   // LCOV_EXCL_STOP
513   if (P1d > 10)
514     // LCOV_EXCL_START
515     return CeedError(ceed, CEED_ERROR_BACKEND,
516                      "Magma backend does not support tensor bases with more than 10 nodes in each dimension");
517   // LCOV_EXCL_STOP
518   if (Q1d > 10)
519     // LCOV_EXCL_START
520     return CeedError(ceed, CEED_ERROR_BACKEND,
521                      "Magma backend does not support tensor bases with more than 10 quadrature points in each dimension");
522   // LCOV_EXCL_STOP
523 
524   Ceed_Magma *data;
525   ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr);
526 
527   ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply",
528                                 CeedBasisApply_Magma); CeedChkBackend(ierr);
529   ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy",
530                                 CeedBasisDestroy_Magma); CeedChkBackend(ierr);
531 
532   ierr = CeedCalloc(1,&impl); CeedChkBackend(ierr);
533   ierr = CeedBasisSetData(basis, impl); CeedChkBackend(ierr);
534 
535   // Copy qref1d to the GPU
536   ierr = magma_malloc((void **)&impl->dqref1d, Q1d*sizeof(qref1d[0]));
537   CeedChkBackend(ierr);
538   magma_setvector(Q1d, sizeof(qref1d[0]), qref1d, 1, impl->dqref1d, 1,
539                   data->queue);
540 
541   // Copy interp1d to the GPU
542   ierr = magma_malloc((void **)&impl->dinterp1d, Q1d*P1d*sizeof(interp1d[0]));
543   CeedChkBackend(ierr);
544   magma_setvector(Q1d*P1d, sizeof(interp1d[0]), interp1d, 1, impl->dinterp1d, 1,
545                   data->queue);
546 
547   // Copy grad1d to the GPU
548   ierr = magma_malloc((void **)&impl->dgrad1d, Q1d*P1d*sizeof(grad1d[0]));
549   CeedChkBackend(ierr);
550   magma_setvector(Q1d*P1d, sizeof(grad1d[0]), grad1d, 1, impl->dgrad1d, 1,
551                   data->queue);
552 
553   // Copy qweight1d to the GPU
554   ierr = magma_malloc((void **)&impl->dqweight1d, Q1d*sizeof(qweight1d[0]));
555   CeedChkBackend(ierr);
556   magma_setvector(Q1d, sizeof(qweight1d[0]), qweight1d, 1, impl->dqweight1d, 1,
557                   data->queue);
558 
559   return CEED_ERROR_SUCCESS;
560 }
561 
562 #ifdef __cplusplus
563 CEED_INTERN "C"
564 #endif
565 int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt ndof,
566                             CeedInt nqpts, const CeedScalar *interp,
567                             const CeedScalar *grad, const CeedScalar *qref,
568                             const CeedScalar *qweight, CeedBasis basis) {
569   int ierr;
570   CeedBasisNonTensor_Magma *impl;
571   Ceed ceed;
572   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
573 
574   Ceed_Magma *data;
575   ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr);
576 
577   if (CEED_SCALAR_TYPE == CEED_SCALAR_FP64) {
578     ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply",
579                                   CeedBasisApplyNonTensor_f64_Magma);
580     CeedChkBackend(ierr);
581   } else {
582     ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply",
583                                   CeedBasisApplyNonTensor_f32_Magma);
584     CeedChkBackend(ierr);
585   }
586   ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy",
587                                 CeedBasisDestroyNonTensor_Magma); CeedChkBackend(ierr);
588 
589   ierr = CeedCalloc(1,&impl); CeedChkBackend(ierr);
590   ierr = CeedBasisSetData(basis, impl); CeedChkBackend(ierr);
591 
592   // Copy qref to the GPU
593   ierr = magma_malloc((void **)&impl->dqref, nqpts*sizeof(qref[0]));
594   CeedChkBackend(ierr);
595   magma_setvector(nqpts, sizeof(qref[0]), qref, 1, impl->dqref, 1, data->queue);
596 
597   // Copy interp to the GPU
598   ierr = magma_malloc((void **)&impl->dinterp, nqpts*ndof*sizeof(interp[0]));
599   CeedChkBackend(ierr);
600   magma_setvector(nqpts*ndof, sizeof(interp[0]), interp, 1, impl->dinterp, 1,
601                   data->queue);
602 
603   // Copy grad to the GPU
604   ierr = magma_malloc((void **)&impl->dgrad, nqpts*ndof*dim*sizeof(grad[0]));
605   CeedChkBackend(ierr);
606   magma_setvector(nqpts*ndof*dim, sizeof(grad[0]), grad, 1, impl->dgrad, 1,
607                   data->queue);
608 
609   // Copy qweight to the GPU
610   ierr = magma_malloc((void **)&impl->dqweight, nqpts*sizeof(qweight[0]));
611   CeedChkBackend(ierr);
612   magma_setvector(nqpts, sizeof(qweight[0]), qweight, 1, impl->dqweight, 1,
613                   data->queue);
614 
615   return CEED_ERROR_SUCCESS;
616 }
617