xref: /libCEED/rust/libceed-sys/c-src/backends/magma/ceed-magma.c (revision dc1dbf0744997d47eb98656f475bfd131b09dcf8)
1 // Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at
2 // the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights
3 // reserved. See files LICENSE and NOTICE for details.
4 //
5 // This file is part of CEED, a collection of benchmarks, miniapps, software
6 // libraries and APIs for efficient high-order finite element and spectral
7 // element discretizations for exascale applications. For more information and
8 // source code availability see http://github.com/ceed.
9 //
10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11 // a collaborative effort of two U.S. Department of Energy organizations (Office
12 // of Science and the National Nuclear Security Administration) responsible for
13 // the planning and preparation of a capable exascale ecosystem, including
14 // software, applications, hardware, advanced system engineering and early
15 // testbed platforms, in support of the nation's exascale computing imperative.
16 
17 #include "ceed-magma.h"
18 #include <string.h>
19 
// Backend data for a CeedVector: a host (CPU) array and a device (GPU) array
typedef struct {
  CeedScalar *array;    // host copy; allocated pinned when own_ is set
  CeedScalar *darray;   // device copy
  int  own_;            // nonzero: we own both host and device memory
  int down_;            // nonzero: we own only the device memory (host is user's)
} CeedVector_Magma;
26 
// Backend data for a CeedElemRestriction: host and device index arrays
typedef struct {
  CeedInt *indices;     // host indices; allocated pinned when own_ is set
  CeedInt *dindices;    // device indices
  int  own_;            // nonzero: we own both host and device memory
  int down_;            // cover a case where we own Device memory only
} CeedElemRestriction_Magma;
33 
// Backend data for a CeedQFunction: pointer arrays handed to the user callback
typedef struct {
  const CeedScalar **inputs;   // per-field input pointers (filled in Apply)
  CeedScalar **outputs;        // per-field output pointers (filled in Apply)
} CeedQFunction_Magma;
38 
// Backend data for a CeedOperator
typedef struct {
  CeedVector *Evecs;    /// E-vectors needed to apply operator (in followed by out)
  CeedScalar **Edata;   /// Raw data pointers for the E-vectors
  CeedVector *evecsin;  /// Input E-vectors needed to apply operator
  CeedVector *evecsout; /// Output E-vectors needed to apply operator
  CeedVector *qvecsin;  /// Input Q-vectors needed to apply operator
  CeedVector *qvecsout; /// Output Q-vectors needed to apply operator
  CeedInt    numein;    /// Number of input fields with E-vectors
  CeedInt    numeout;   /// Number of output fields with E-vectors
} CeedOperator_Magma;
49 
50 // *****************************************************************************
51 // * Initialize vector vec (after free mem) with values from array based on cmode
52 // *   CEED_COPY_VALUES: memory is allocated in vec->array_allocated, made equal
53 // *                     to array, and data is copied (not store passed pointer)
54 // *   CEED_OWN_POINTER: vec->data->array_allocated and vec->data->array = array
55 // *   CEED_USE_POINTER: vec->data->array = array (can modify; no ownership)
56 // * mtype: CEED_MEM_HOST or CEED_MEM_DEVICE
57 // *****************************************************************************
static int CeedVectorSetArray_Magma(CeedVector vec, CeedMemType mtype,
                                    CeedCopyMode cmode, CeedScalar *array) {
  CeedVector_Magma *impl = vec->data;
  int ierr;

  // If own data, free the "old" data, e.g., as it may be of different size
  // NOTE(review): when only down_ is set (we own just the device array), the
  // old darray is not released here -- looks like a potential leak; confirm.
  if (impl->own_) {
    magma_free( impl->darray );
    magma_free_pinned( impl->array );
    impl->darray = NULL;
    impl->array  = NULL;
    impl->own_ = 0;
    impl->down_= 0;
  }

  if (mtype == CEED_MEM_HOST) {
    // memory is on the host; own_ = 0
    switch (cmode) {
    case CEED_COPY_VALUES:
      // Allocate a device array and a pinned host mirror, then copy the
      // user's host values up to the device (if any were given)
      ierr = magma_malloc( (void**)&impl->darray,
                           vec->length * sizeof(CeedScalar)); CeedChk(ierr);
      ierr = magma_malloc_pinned( (void**)&impl->array,
                                  vec->length * sizeof(CeedScalar)); CeedChk(ierr);
      impl->own_ = 1;

      if (array != NULL)
        magma_setvector(vec->length, sizeof(array[0]),
                        array, 1, impl->darray, 1);
      break;
    case CEED_OWN_POINTER:
      // Take ownership of the user's host pointer and mirror it on device
      ierr = magma_malloc( (void**)&impl->darray,
                           vec->length * sizeof(CeedScalar)); CeedChk(ierr);
      // TODO: possible problem here is if we are passed non-pinned memory;
      //       (as we own it, later in destroy, we use free for pinned memory).
      impl->array = array;
      impl->own_ = 1;

      if (array != NULL)
        magma_setvector(vec->length, sizeof(array[0]),
                        array, 1, impl->darray, 1);
      break;
    case CEED_USE_POINTER:
      // Borrow the user's host pointer; we own only the device copy (down_)
      ierr = magma_malloc( (void**)&impl->darray,
                           vec->length * sizeof(CeedScalar)); CeedChk(ierr);
      magma_setvector(vec->length, sizeof(array[0]),
                      array, 1, impl->darray, 1);

      impl->down_  = 1;
      impl->array  = array;
    }
  } else if (mtype == CEED_MEM_DEVICE) {
    // memory is on the device; own = 0
    switch (cmode) {
    case CEED_COPY_VALUES:
      // Allocate device array and pinned host mirror; copy device-to-device
      ierr = magma_malloc( (void**)&impl->darray,
                           vec->length * sizeof(CeedScalar)); CeedChk(ierr);
      ierr = magma_malloc_pinned( (void**)&impl->array,
                                  vec->length * sizeof(CeedScalar)); CeedChk(ierr);
      impl->own_ = 1;

      if (array)
        magma_copyvector(vec->length, sizeof(array[0]),
                         array, 1, impl->darray, 1);
      else
        // t30 assumes allocation initializes with 0s
        // NOTE(review): impl->array is never explicitly zeroed, so this relies
        // on the allocator returning zeroed memory -- TODO confirm
        magma_setvector(vec->length, sizeof(array[0]),
                        impl->array, 1, impl->darray, 1);
      break;
    case CEED_OWN_POINTER:
      // Take ownership of the user's device pointer; allocate a host mirror
      impl->darray = array;
      ierr = magma_malloc_pinned( (void**)&impl->array,
                                  vec->length * sizeof(CeedScalar)); CeedChk(ierr);
      impl->own_ = 1;

      break;
    case CEED_USE_POINTER:
      // Borrow the user's device pointer; no host mirror is kept
      impl->darray = array;
      impl->array  = NULL;
    }

  } else
    return CeedError(vec->ceed, 1, "Only MemType = HOST or DEVICE supported");

  return 0;
}
143 
144 // *****************************************************************************
145 // * Give data pointer from vector vec to array (on HOST or DEVICE)
146 // *****************************************************************************
static int CeedVectorGetArray_Magma(CeedVector vec, CeedMemType mtype,
                                    CeedScalar **array) {
  CeedVector_Magma *impl = vec->data;
  int ierr;

  if (mtype == CEED_MEM_HOST) {
    if (impl->own_) {
      // data is owned so GPU had the most up-to-date version; copy it
      // TTT - apparently it doesn't have most up to date data
      magma_getvector(vec->length, sizeof(*array[0]),
                      impl->darray, 1, impl->array, 1);
      CeedDebug("\033[31m[CeedVectorGetArray_Magma]");
    } else if (impl->array == NULL) {
      // Vector doesn't own the data and was set on GPU
      if (impl->darray == NULL) {
        // call was made just to allocate memory
        ierr = CeedVectorSetArray(vec, mtype, CEED_COPY_VALUES, NULL);
        CeedChk(ierr);
      } else
        return CeedError(vec->ceed, 1, "Can not access DEVICE vector on HOST");
    }
    *array = impl->array;
  } else if (mtype == CEED_MEM_DEVICE) {
    if (impl->darray == NULL) {
      // Vector doesn't own the data and was set on the CPU
      if (impl->array == NULL) {
        // call was made just to allocate memory
        ierr = CeedVectorSetArray(vec, mtype, CEED_COPY_VALUES, NULL);
        CeedChk(ierr);
      } else
        return CeedError(vec->ceed, 1, "Can not access HOST vector on DEVICE");
    }
    *array = impl->darray;
  } else
    return CeedError(vec->ceed, 1, "Can only provide to HOST or DEVICE memory");

  return 0;
}
186 
187 // *****************************************************************************
188 // * Give data pointer from vector vec to array (on HOST or DEVICE) to read it
189 // *****************************************************************************
// NOTE(review): this mirrors CeedVectorGetArray_Magma (minus the debug trace);
// consider factoring the shared logic into one helper.
static int CeedVectorGetArrayRead_Magma(CeedVector vec, CeedMemType mtype,
                                        const CeedScalar **array) {
  CeedVector_Magma *impl = vec->data;
  int ierr;

  if (mtype == CEED_MEM_HOST) {
    if (impl->own_) {
      // data is owned so GPU had the most up-to-date version; copy it
      magma_getvector(vec->length, sizeof(*array[0]),
                      impl->darray, 1, impl->array, 1);
    } else if (impl->array == NULL) {
      // Vector doesn't own the data and was set on GPU
      if (impl->darray == NULL) {
        // call was made just to allocate memory
        ierr = CeedVectorSetArray(vec, mtype, CEED_COPY_VALUES, NULL);
        CeedChk(ierr);
      } else
        return CeedError(vec->ceed, 1, "Can not access DEVICE vector on HOST");
    }
    *array = impl->array;
  } else if (mtype == CEED_MEM_DEVICE) {
    if (impl->darray == NULL) {
      // Vector doesn't own the data and was set on the CPU
      if (impl->array == NULL) {
        // call was made just to allocate memory
        ierr = CeedVectorSetArray(vec, mtype, CEED_COPY_VALUES, NULL);
        CeedChk(ierr);
      } else
        return CeedError(vec->ceed, 1, "Can not access HOST vector on DEVICE");
    }
    *array = impl->darray;
  } else
    return CeedError(vec->ceed, 1, "Can only provide to HOST or DEVICE memory");

  return 0;
}
226 
227 // *****************************************************************************
228 // * There is no mtype here for array so it is not clear if we restore from HOST
229 // * memory or from DEVICE memory. We assume that it is CPU memory because if
230 // * it was GPU memory we would not call this routine at all.
231 // * Restore vector vec with values from array, where array received its values
232 // * from vec and possibly modified them.
233 // *****************************************************************************
234 static int CeedVectorRestoreArray_Magma(CeedVector vec, CeedScalar **array) {
235   CeedVector_Magma *impl = vec->data;
236 
237   // Check if the array is a CPU pointer
238   if (*array == impl->array) {
239     // Update device, if the device pointer is not NULL
240     if (impl->darray != NULL) {
241       magma_setvector(vec->length, sizeof(*array[0]),
242                       *array, 1, impl->darray, 1);
243     } else {
244       // nothing to do (case of CPU use pointer)
245     }
246 
247   } else if (impl->down_) {
248     // nothing to do if array is on GPU, except if down_=1(case CPU use pointer)
249     magma_getvector(vec->length, sizeof(*array[0]),
250                     impl->darray, 1, impl->array, 1);
251   }
252 
253   *array = NULL;
254   return 0;
255 }
256 
257 // *****************************************************************************
258 // * There is no mtype here for array so it is not clear if we restore from HOST
259 // * memory or from DEVICE memory. We assume that it is CPU memory because if
260 // * it was GPU memory we would not call this routine at all.
261 // * Restore vector vec with values from array, where array received its values
262 // * from vec to only read them; in this case vec may have been modified meanwhile
263 // * and needs to be restored here.
264 // *****************************************************************************
265 static int CeedVectorRestoreArrayRead_Magma(CeedVector vec,
266     const CeedScalar **array) {
267   CeedVector_Magma *impl = vec->data;
268 
269   // Check if the array is a CPU pointer
270   if (*array == impl->array) {
271     // Update device, if the device pointer is not NULL
272     if (impl->darray != NULL) {
273       magma_setvector(vec->length, sizeof(*array[0]),
274                       *array, 1, impl->darray, 1);
275     } else {
276       // nothing to do (case of CPU use pointer)
277     }
278 
279   } else if (impl->down_) {
280     // nothing to do if array is on GPU, except if down_=1(case CPU use pointer)
281     magma_getvector(vec->length, sizeof(*array[0]),
282                     impl->darray, 1, impl->array, 1);
283   }
284 
285   *array = NULL;
286   return 0;
287 }
288 
289 static int CeedVectorDestroy_Magma(CeedVector vec) {
290   CeedVector_Magma *impl = vec->data;
291   int ierr;
292 
293   // Free if we own the data
294   if (impl->own_) {
295     ierr = magma_free_pinned(impl->array); CeedChk(ierr);
296     ierr = magma_free(impl->darray);       CeedChk(ierr);
297   } else if (impl->down_) {
298     ierr = magma_free(impl->darray);       CeedChk(ierr);
299   }
300   ierr = CeedFree(&vec->data); CeedChk(ierr);
301   return 0;
302 }
303 
304 // *****************************************************************************
305 // * Create vector vec of size n
306 // *****************************************************************************
307 static int CeedVectorCreate_Magma(CeedInt n, CeedVector vec) {
308   CeedVector_Magma *impl;
309   int ierr;
310 
311   vec->SetArray = CeedVectorSetArray_Magma;
312   vec->GetArray = CeedVectorGetArray_Magma;
313   vec->GetArrayRead = CeedVectorGetArrayRead_Magma;
314   vec->RestoreArray = CeedVectorRestoreArray_Magma;
315   vec->RestoreArrayRead = CeedVectorRestoreArrayRead_Magma;
316   vec->Destroy = CeedVectorDestroy_Magma;
317   ierr = CeedCalloc(1,&impl); CeedChk(ierr);
318   impl->darray = NULL;
319   impl->array  = NULL;
320   impl->own_ = 0;
321   impl->down_= 0;
322   vec->data = impl;
323   return 0;
324 }
325 
326 
327 // *****************************************************************************
328 // * Apply restriction operator r to u: v = r(rmode) u
329 // *****************************************************************************
330 static int CeedElemRestrictionApply_Magma(CeedElemRestriction r,
331     CeedTransposeMode tmode,
332     CeedTransposeMode lmode, CeedVector u,
333     CeedVector v, CeedRequest *request) {
334   CeedElemRestriction_Magma *impl = r->data;
335   int ierr;
336   const CeedScalar *uu;
337   CeedScalar *vv;
338   CeedInt nelem = r->nelem, elemsize = r->elemsize, ndof = r->ndof,
339           ncomp=r->ncomp;
340   CeedInt esize = nelem * elemsize;
341 
342 #ifdef USE_MAGMA_BATCH2
343   CeedInt *dindices = impl->dindices;
344   // Get pointers on the device
345   ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &uu); CeedChk(ierr);
346   ierr = CeedVectorGetArray(v, CEED_MEM_DEVICE, &vv); CeedChk(ierr);
347 #else
348   CeedInt *indices = impl->indices;
349   ierr = CeedVectorGetArrayRead(u, CEED_MEM_HOST, &uu); CeedChk(ierr);
350   ierr = CeedVectorGetArray(v, CEED_MEM_HOST, &vv); CeedChk(ierr);
351 #endif
352 
353   if (tmode == CEED_NOTRANSPOSE) {
354     // Perform: v = r * u
355     if (!impl->indices) {
356       for (CeedInt i=0; i<esize*ncomp; i++) vv[i] = uu[i];
357     } else if (ncomp == 1) {
358 #ifdef USE_MAGMA_BATCH2
359 magma_template<<i=0:esize>>
360       (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices) {
361         vv[i] = uu[dindices[i]];
362       }
363 #else
364       for (CeedInt i=0; i<esize; i++) vv[i] = uu[indices[i]];
365 #endif
366     } else {
367       // vv is (elemsize x ncomp x nelem), column-major
368       if (lmode == CEED_NOTRANSPOSE) { // u is (ndof x ncomp), column-major
369 #ifdef USE_MAGMA_BATCH2
370 magma_template<<e=0:nelem, d=0:ncomp, i=0:elemsize>>
371         (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices, int ndof) {
372           vv[i + iend*(d+dend*e)] = uu[dindices[i+iend*e]+ndof*d];
373         }
374 #else
375         for (CeedInt e = 0; e < nelem; e++)
376           for (CeedInt d = 0; d < ncomp; d++)
377             for (CeedInt i=0; i < elemsize; i++) {
378               vv[i + elemsize*(d+ncomp*e)] =
379                 uu[indices[i+elemsize*e]+ndof*d];
380             }
381 #endif
382       } else { // u is (ncomp x ndof), column-major
383 #ifdef USE_MAGMA_BATCH2
384 magma_template<<e=0:nelem, d=0:ncomp, i=0:elemsize>>
385         (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices) {
386           vv[i + iend*(d+dend*e)] = uu[d+dend*dindices[i + iend*e]];
387         }
388 #else
389         for (CeedInt e = 0; e < nelem; e++)
390           for (CeedInt d = 0; d < ncomp; d++)
391             for (CeedInt i=0; i< elemsize; i++) {
392               vv[i + elemsize*(d+ncomp*e)] =
393                 uu[d+ncomp*indices[i+elemsize*e]];
394             }
395 #endif
396       }
397     }
398   } else {
399     // Note: in transpose mode, we perform: v += r^t * u
400     if (!impl->indices) {
401       for (CeedInt i=0; i<esize; i++) vv[i] += uu[i];
402     } else if (ncomp == 1) {
403       // fprintf(stderr,"3 ---------\n");
404 #ifdef USE_MAGMA_BATCH2
405 magma_template<<i=0:esize>>
406       (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices) {
407         magmablas_datomic_add( &vv[dindices[i]], uu[i]);
408       }
409 #else
410       for (CeedInt i=0; i<esize; i++) vv[indices[i]] += uu[i];
411 #endif
412     } else { // u is (elemsize x ncomp x nelem)
413       fprintf(stderr,"2 ---------\n");
414 
415       if (lmode == CEED_NOTRANSPOSE) { // vv is (ndof x ncomp), column-major
416 #ifdef USE_MAGMA_BATCH2
417 magma_template<<e=0:nelem, d=0:ncomp, i=0:elemsize>>
418         (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices, CeedInt ndof) {
419           magmablas_datomic_add( &vv[dindices[i+iend*e]+ndof*d],
420                                  uu[i+iend*(d+e*dend)]);
421         }
422 #else
423         for (CeedInt e = 0; e < nelem; e++)
424           for (CeedInt d = 0; d < ncomp; d++)
425             for (CeedInt i=0; i < elemsize; i++) {
426               vv[indices[i + elemsize*e]+ndof*d] +=
427                 uu[i + elemsize*(d+e*ncomp)];
428             }
429 #endif
430       } else { // vv is (ncomp x ndof), column-major
431 #ifdef USE_MAGMA_BATCH2
432 magma_template<<e=0:nelem, d=0:ncomp, i=0:elemsize>>
433         (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices) {
434           magmablas_datomic_add( &vv[d+dend*dindices[i + iend*e]],
435                                  uu[i+iend*(d+e*dend)]);
436         }
437 #else
438         for (CeedInt e = 0; e < nelem; e++)
439           for (CeedInt d = 0; d < ncomp; d++)
440             for (CeedInt i=0; i < elemsize; i++) {
441               vv[d+ncomp*indices[i + elemsize*e]] +=
442                 uu[i + elemsize*(d+e*ncomp)];
443             }
444 #endif
445       }
446     }
447   }
448 
449   ierr = CeedVectorRestoreArrayRead(u, &uu); CeedChk(ierr);
450   ierr = CeedVectorRestoreArray(v, &vv); CeedChk(ierr);
451 
452   if (request != CEED_REQUEST_IMMEDIATE && request != CEED_REQUEST_ORDERED)
453     *request = NULL;
454   return 0;
455 }
456 
457 static int CeedElemRestrictionDestroy_Magma(CeedElemRestriction r) {
458   CeedElemRestriction_Magma *impl = r->data;
459   int ierr;
460 
461   // Free if we own the data
462   if (impl->own_) {
463     ierr = magma_free_pinned(impl->indices); CeedChk(ierr);
464     ierr = magma_free(impl->dindices);       CeedChk(ierr);
465   } else if (impl->down_) {
466     ierr = magma_free(impl->dindices);       CeedChk(ierr);
467   }
468   ierr = CeedFree(&r->data); CeedChk(ierr);
469   return 0;
470 }
471 
static int CeedElemRestrictionCreate_Magma(CeedMemType mtype,
    CeedCopyMode cmode,
    const CeedInt *indices, CeedElemRestriction r) {
  int ierr, size = r->nelem*r->elemsize;
  CeedElemRestriction_Magma *impl;

  // Allocate memory for the MAGMA Restriction and initialize pointers to NULL
  ierr = CeedCalloc(1,&impl); CeedChk(ierr);
  impl->dindices = NULL;
  impl->indices  = NULL;
  impl->own_ = 0;
  impl->down_= 0;

  if (mtype == CEED_MEM_HOST) {
    // memory is on the host; own_ = 0
    switch (cmode) {
    case CEED_COPY_VALUES:
      // Allocate device indices and a pinned host copy; duplicate user values
      ierr = magma_malloc( (void**)&impl->dindices,
                           size * sizeof(CeedInt)); CeedChk(ierr);
      ierr = magma_malloc_pinned( (void**)&impl->indices,
                                  size * sizeof(CeedInt)); CeedChk(ierr);
      impl->own_ = 1;

      if (indices != NULL) {
        memcpy(impl->indices, indices, size * sizeof(indices[0]));
        magma_setvector(size, sizeof(CeedInt),
                        impl->indices, 1, impl->dindices, 1);
      }
      break;
    case CEED_OWN_POINTER:
      // Take ownership of the user's host indices; mirror them on the device
      ierr = magma_malloc( (void**)&impl->dindices,
                           size * sizeof(CeedInt)); CeedChk(ierr);
      // TODO: possible problem here is if we are passed non-pinned memory;
      //       (as we own it, later in destroy, we use free for pinned memory).
      impl->indices = (CeedInt *)indices;
      impl->own_ = 1;

      if (indices != NULL)
        magma_setvector(size, sizeof(CeedInt),
                        indices, 1, impl->dindices, 1);
      break;
    case CEED_USE_POINTER:
      // Borrow the user's host indices; we own only the device copy (down_)
      ierr = magma_malloc( (void**)&impl->dindices,
                           size * sizeof(CeedInt)); CeedChk(ierr);
      magma_setvector(size, sizeof(CeedInt),
                      indices, 1, impl->dindices, 1);
      impl->down_ = 1;
      impl->indices  = (CeedInt *)indices;
    }
  } else if (mtype == CEED_MEM_DEVICE) {
    // memory is on the device; own = 0
    switch (cmode) {
    case CEED_COPY_VALUES:
      // Allocate device indices and a pinned host copy; copy device-to-device
      ierr = magma_malloc( (void**)&impl->dindices,
                           size * sizeof(CeedInt)); CeedChk(ierr);
      ierr = magma_malloc_pinned( (void**)&impl->indices,
                                  size * sizeof(CeedInt)); CeedChk(ierr);
      impl->own_ = 1;

      if (indices)
        magma_copyvector(size, sizeof(CeedInt),
                         indices, 1, impl->dindices, 1);
      break;
    case CEED_OWN_POINTER:
      // Take ownership of the user's device indices; allocate a host mirror
      impl->dindices = (CeedInt *)indices;
      ierr = magma_malloc_pinned( (void**)&impl->indices,
                                  size * sizeof(CeedInt)); CeedChk(ierr);
      impl->own_ = 1;

      break;
    case CEED_USE_POINTER:
      // Borrow the user's device indices; no host copy is kept
      impl->dindices = (CeedInt *)indices;
      impl->indices  = NULL;
    }

  } else
    return CeedError(r->ceed, 1, "Only MemType = HOST or DEVICE supported");

  r->data    = impl;
  r->Apply   = CeedElemRestrictionApply_Magma;
  r->Destroy = CeedElemRestrictionDestroy_Magma;

  return 0;
}
556 
// Blocked element restrictions are not implemented by the MAGMA backend
static int CeedElemRestrictionCreateBlocked_Magma(CeedMemType mtype,
    CeedCopyMode cmode,
    const CeedInt *indices, CeedElemRestriction r) {
  return CeedError(r->ceed, 1, "Backend does not implement blocked restrictions");
}
562 
563 // Contracts on the middle index
564 // NOTRANSPOSE: V_ajc = T_jb U_abc
565 // TRANSPOSE:   V_ajc = T_bj U_abc
566 // If Add != 0, "=" is replaced by "+="
567 static int CeedTensorContract_Magma(Ceed ceed,
568                                     CeedInt A, CeedInt B, CeedInt C, CeedInt J,
569                                     const CeedScalar *t, CeedTransposeMode tmode,
570                                     const CeedInt Add,
571                                     const CeedScalar *u, CeedScalar *v) {
572 #ifdef USE_MAGMA_BATCH
573   magma_dtensor_contract(ceed, A, B, C, J, t, tmode, Add, u, v);
574 #else
575   CeedInt tstride0 = B, tstride1 = 1;
576   if (tmode == CEED_TRANSPOSE) {
577     tstride0 = 1; tstride1 = J;
578   }
579   CeedDebug("\033[31m[CeedTensorContract] A=%d, J=%d, C=%d, B=%d: %d %d %d",
580             A,J,C,B,A*J*B*C, C*J*A, C*B*A);
581   for (CeedInt a=0; a<A; a++) {
582     for (CeedInt j=0; j<J; j++) {
583       if (!Add) {
584         for (CeedInt c=0; c<C; c++)
585           v[(a*J+j)*C+c] = 0;
586       }
587       for (CeedInt b=0; b<B; b++) {
588         for (CeedInt c=0; c<C; c++) {
589           v[(a*J+j)*C+c] += t[j*tstride0 + b*tstride1] * u[(a*B+b)*C+c];
590         }
591       }
592     }
593   }
594 #endif
595   return 0;
596 }
597 
598 static int CeedBasisApply_Magma(CeedBasis basis, CeedInt nelem,
599                                 CeedTransposeMode tmode, CeedEvalMode emode,
600                                 CeedVector U, CeedVector V) {
601   int ierr;
602   const CeedInt dim = basis->dim;
603   const CeedInt ncomp = basis->ncomp;
604   const CeedInt nqpt = ncomp*CeedPowInt(basis->Q1d, dim);
605   const CeedInt add = (tmode == CEED_TRANSPOSE);
606   const CeedScalar *u;
607   CeedScalar *v;
608   if (U) {
609     ierr = CeedVectorGetArrayRead(U, CEED_MEM_HOST, &u); CeedChk(ierr);
610   } else if (emode != CEED_EVAL_WEIGHT) {
611       return CeedError(ceed, 1,
612                        "An input vector is required for this CeedEvalMode");
613   }
614   ierr = CeedVectorGetArray(V, CEED_MEM_HOST, &v); CeedChk(ierr);
615 
616   if (nelem != 1)
617     return CeedError(basis->ceed, 1,
618                      "This backend does not support BasisApply for multiple elements");
619 
620   CeedDebug("\033[01m[CeedBasisApply_Magma] vsize=%d",
621             ncomp*CeedPowInt(basis->P1d, dim));
622 
623   if (tmode == CEED_TRANSPOSE) {
624     const CeedInt vsize = ncomp*CeedPowInt(basis->P1d, dim);
625     for (CeedInt i = 0; i < vsize; i++)
626       v[i] = (CeedScalar) 0;
627   }
628   if (emode & CEED_EVAL_INTERP) {
629     CeedInt P = basis->P1d, Q = basis->Q1d;
630     if (tmode == CEED_TRANSPOSE) {
631       P = basis->Q1d; Q = basis->P1d;
632     }
633     CeedInt pre = ncomp*CeedPowInt(P, dim-1), post = 1;
634     CeedScalar tmp[2][ncomp*Q*CeedPowInt(P>Q?P:Q, dim-1)];
635     CeedDebug("\033[01m[CeedBasisApply_Magma] tmpsize = %d",
636               ncomp*Q*CeedPowInt(P>Q?P:Q, dim-1));
637     for (CeedInt d=0; d<dim; d++) {
638       ierr = CeedTensorContract_Magma(basis->ceed, pre, P, post, Q, basis->interp1d,
639                                       tmode, add&&(d==dim-1),
640                                       d==0?u:tmp[d%2], d==dim-1?v:tmp[(d+1)%2]);
641       CeedChk(ierr);
642       pre /= P;
643       post *= Q;
644     }
645     if (tmode == CEED_NOTRANSPOSE) {
646       v += nqpt;
647     } else {
648       u += nqpt;
649     }
650   }
651   if (emode & CEED_EVAL_GRAD) {
652     CeedInt P = basis->P1d, Q = basis->Q1d;
653     // In CEED_NOTRANSPOSE mode:
654     // u is (P^dim x nc), column-major layout (nc = ncomp)
655     // v is (Q^dim x nc x dim), column-major layout (nc = ncomp)
656     // In CEED_TRANSPOSE mode, the sizes of u and v are switched.
657     if (tmode == CEED_TRANSPOSE) {
658       P = basis->Q1d, Q = basis->P1d;
659     }
660     CeedScalar tmp[2][ncomp*Q*CeedPowInt(P>Q?P:Q, dim-1)];
661     CeedDebug("\033[01m[CeedBasisApply_Magma] tmpsize = %d",
662               ncomp*Q*CeedPowInt(P>Q?P:Q, dim-1));
663     for (CeedInt p = 0; p < dim; p++) {
664       CeedInt pre = ncomp*CeedPowInt(P, dim-1), post = 1;
665       for (CeedInt d=0; d<dim; d++) {
666         ierr = CeedTensorContract_Magma(basis->ceed, pre, P, post, Q,
667                                         (p==d)?basis->grad1d:basis->interp1d,
668                                         tmode, add&&(d==dim-1),
669                                         d==0?u:tmp[d%2], d==dim-1?v:tmp[(d+1)%2]);
670         CeedChk(ierr);
671         pre /= P;
672         post *= Q;
673       }
674       if (tmode == CEED_NOTRANSPOSE) {
675         v += nqpt;
676       } else {
677         u += nqpt;
678       }
679     }
680   }
681   if (emode & CEED_EVAL_WEIGHT) {
682     if (tmode == CEED_TRANSPOSE)
683       return CeedError(basis->ceed, 1,
684                        "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE");
685     CeedInt Q = basis->Q1d;
686     for (CeedInt d=0; d<dim; d++) {
687       CeedInt pre = CeedPowInt(Q, dim-d-1), post = CeedPowInt(Q, d);
688       for (CeedInt i=0; i<pre; i++) {
689         for (CeedInt j=0; j<Q; j++) {
690           for (CeedInt k=0; k<post; k++) {
691             v[(i*Q + j)*post + k] = basis->qweight1d[j]
692                                     * (d == 0 ? 1 : v[(i*Q + j)*post + k]);
693           }
694         }
695       }
696     }
697   }
698   if (U) {
699     ierr = CeedVectorRestoreArrayRead(U, &u); CeedChk(ierr);
700   }
701   ierr = CeedVectorRestoreArray(V, &v); CeedChk(ierr);
702   return 0;
703 }
704 
// Nothing to free: this backend attaches no per-basis data
static int CeedBasisDestroy_Magma(CeedBasis basis) {
  return 0;
}
708 
// Create a tensor-product H1 basis: only hooks up the backend function
// pointers; the tabulated 1D data arguments are not used here
static int CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P1d,
    CeedInt Q1d, const CeedScalar *interp1d,
    const CeedScalar *grad1d,
    const CeedScalar *qref1d,
    const CeedScalar *qweight1d,
    CeedBasis basis) {
  basis->Apply = CeedBasisApply_Magma;
  basis->Destroy = CeedBasisDestroy_Magma;
  return 0;
}
719 
// Non-tensor H1 bases are not implemented by the MAGMA backend
static int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim,
                          CeedInt ndof, CeedInt nqpts,
                          const CeedScalar *interp,
                          const CeedScalar *grad,
                          const CeedScalar *qref,
                          const CeedScalar *qweight,
                          CeedBasis basis) {
  return CeedError(basis->ceed, 1, "Backend does not implement non-tensor bases");
}
729 
730 static int CeedQFunctionApply_Magma(CeedQFunction qf, CeedInt Q,
731                                     CeedVector *U, CeedVector *V) {
732   int ierr;
733   CeedQFunction_Ref *impl;
734   ierr = CeedQFunctionGetData(qf, (void*)&impl); CeedChk(ierr);
735 
736   void *ctx;
737   ierr = CeedQFunctionGetContext(qf, &ctx); CeedChk(ierr);
738 
739   int (*f)() = NULL;
740   ierr = CeedQFunctionGetUserFunction(qf, (int (**)())&f); CeedChk(ierr);
741 
742   CeedInt nIn, nOut;
743   ierr = CeedQFunctionGetNumArgs(qf, &nIn, &nOut); CeedChk(ierr);
744 
745   for (int i = 0; i<nIn; i++) {
746     if (U[i]) {
747       ierr = CeedVectorGetArrayRead(U[i], CEED_MEM_HOST, &impl->inputs[i]);
748       CeedChk(ierr);
749     }
750   }
751   for (int i = 0; i<nOut; i++) {
752     if (U[i]) {
753       ierr = CeedVectorGetArray(V[i], CEED_MEM_HOST, &impl->outputs[i]);
754       CeedChk(ierr);
755     }
756   }
757 
758   ierr = f(ctx, Q, impl->inputs, impl->outputs); CeedChk(ierr);
759 
760   for (int i = 0; i<nIn; i++) {
761     if (U[i]) {
762       ierr = CeedVectorRestoreArrayRead(U[i], &impl->inputs[i]); CeedChk(ierr);
763     }
764   }
765   for (int i = 0; i<nOut; i++) {
766     if (U[i]) {
767       ierr = CeedVectorRestoreArray(V[i], &impl->outputs[i]); CeedChk(ierr);
768     }
769   }
770   return 0;
771 }
772 
773 static int CeedQFunctionDestroy_Magma(CeedQFunction qf) {
774   int ierr;
775   CeedQFunction_Magma *impl;
776   ierr = CeedQFunctionGetData(qf, (void*)&impl); CeedChk(ierr);
777 
778   ierr = CeedFree(&impl->inputs); CeedChk(ierr);
779   ierr = CeedFree(&impl->outputs); CeedChk(ierr);
780   ierr = CeedFree(&impl); CeedChk(ierr);
781 
782   return 0;
783 }
784 
785 static int CeedQFunctionCreate_Magma(CeedQFunction qf) {
786   int ierr;
787   Ceed ceed;
788   ierr = CeedQFunctionGetCeed(qf, &ceed); CeedChk(ierr);
789 
790   CeedQFunction_Magma *impl;
791   ierr = CeedCalloc(1, &impl); CeedChk(ierr);
792   ierr = CeedCalloc(16, &impl->inputs); CeedChk(ierr);
793   ierr = CeedCalloc(16, &impl->outputs); CeedChk(ierr);
794   ierr = CeedQFunctionSetData(qf, (void*)&impl); CeedChk(ierr);
795 
796   qf->Apply = CeedQFunctionApply_Magma;
797   qf->Destroy = CeedQFunctionDestroy_Magma;
798   return 0;
799 }
800 
801 static int CeedOperatorDestroy_Magma(CeedOperator op) {
802   int ierr;
803   CeedOperator_Magma *impl;
804   ierr = CeedOperatorGetData(op, (void*)&impl); CeedChk(ierr);
805 
806   for (CeedInt i=0; i<impl->numein+impl->numeout; i++) {
807     if (impl->Evecs[i]) {
808       ierr = CeedVectorDestroy(&impl->Evecs[i]); CeedChk(ierr);
809     }
810   }
811   ierr = CeedFree(&impl->Evecs); CeedChk(ierr);
812   ierr = CeedFree(&impl->Edata); CeedChk(ierr);
813 
814   for (CeedInt i=0; i<impl->numein; i++) {
815     ierr = CeedVectorDestroy(&impl->evecsin[i]); CeedChk(ierr);
816     ierr = CeedVectorDestroy(&impl->qvecsin[i]); CeedChk(ierr);
817   }
818   ierr = CeedFree(&impl->evecsin); CeedChk(ierr);
819   ierr = CeedFree(&impl->qvecsin); CeedChk(ierr);
820 
821   for (CeedInt i=0; i<impl->numeout; i++) {
822     ierr = CeedVectorDestroy(&impl->evecsout[i]); CeedChk(ierr);
823     ierr = CeedVectorDestroy(&impl->qvecsout[i]); CeedChk(ierr);
824   }
825   ierr = CeedFree(&impl->evecsout); CeedChk(ierr);
826   ierr = CeedFree(&impl->qvecsout); CeedChk(ierr);
827 
828 
829   ierr = CeedFree(&impl); CeedChk(ierr);
830   return 0;
831 }
832 
833 
834 /*
835   Setup infields or outfields
836  */
837 static int CeedOperatorSetupFields_Magma(CeedQFunction qf, CeedOperator op,
838                                        bool inOrOut,
839                                        CeedVector *fullevecs, CeedVector *evecs,
840                                        CeedVector *qvecs, CeedInt starte,
841                                        CeedInt numfields, CeedInt Q) {
842   CeedInt dim = 1, ierr, ncomp;
843   Ceed ceed;
844   ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
845   CeedQFunction_Magma *qf_data;
846   ierr = CeedQFunctionGetData(qf, (void*)&qf_data); CeedChk(ierr);
847   CeedBasis basis;
848   CeedElemRestriction Erestrict;
849   CeedOperatorField *opfields;
850   CeedQFunctionField *qffields;
851   if (inOrOut) {
852     ierr = CeedOperatorGetFields(op, NULL, &opfields);
853     CeedChk(ierr);
854     ierr = CeedQFunctionGetFields(qf, NULL, &qffields);
855     CeedChk(ierr);
856   } else {
857     ierr = CeedOperatorGetFields(op, &opfields, NULL);
858     CeedChk(ierr);
859     ierr = CeedQFunctionGetFields(qf, &qffields, NULL);
860     CeedChk(ierr);
861   }
862 
863   // Loop over fields
864   for (CeedInt i=0; i<numfields; i++) {
865     CeedEvalMode emode;
866     ierr = CeedQFunctionFieldGetEvalMode(qffields[i], &emode); CeedChk(ierr);
867     if (emode != CEED_EVAL_WEIGHT) {
868       ierr = CeedOperatorFieldGetElemRestriction(opfields[i], &Erestrict);
869       CeedChk(ierr);
870       ierr = CeedElemRestrictionCreateVector(Erestrict, NULL, &fullevecs[i+starte]);
871       CeedChk(ierr);
872     } else {
873       }
874     switch(emode) {
875     case CEED_EVAL_NONE:
876       ierr = CeedQFunctionFieldGetNumComponents(qffields[i], &ncomp);
877       CeedChk(ierr);
878       ierr = CeedVectorCreate(ceed, Q*ncomp, &evecs[i]); CeedChk(ierr);
879       ierr = CeedVectorCreate(ceed, Q*ncomp, &qvecs[i]); CeedChk(ierr);
880       break; // No action
881     case CEED_EVAL_INTERP:
882       ierr = CeedOperatorFieldGetBasis(opfields[i], &basis); CeedChk(ierr);
883       ierr = CeedQFunctionFieldGetNumComponents(qffields[i], &ncomp);
884       CeedChk(ierr);
885       ierr = CeedVectorCreate(ceed, Q*ncomp, &evecs[i]); CeedChk(ierr);
886       ierr = CeedVectorCreate(ceed, Q*ncomp, &qvecs[i]); CeedChk(ierr);
887       break;
888     case CEED_EVAL_GRAD:
889       ierr = CeedOperatorFieldGetBasis(opfields[i], &basis); CeedChk(ierr);
890       ierr = CeedQFunctionFieldGetNumComponents(qffields[i], &ncomp);
891       ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr);
892       ierr = CeedQFunctionFieldGetNumComponents(qffields[i], &ncomp);
893       CeedChk(ierr);
894       ierr = CeedVectorCreate(ceed, Q*ncomp, &evecs[i]); CeedChk(ierr);
895       ierr = CeedVectorCreate(ceed, Q*ncomp*dim, &qvecs[i]); CeedChk(ierr);
896       break;
897     case CEED_EVAL_WEIGHT: // Only on input fields
898       ierr = CeedOperatorFieldGetBasis(opfields[i], &basis); CeedChk(ierr);
899       ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr);
900       ierr = CeedVectorCreate(ceed, Q, &qvecs[i]); CeedChk(ierr);
901       ierr = CeedBasisApply(basis, 1, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT,
902                             NULL, qvecs[i]); CeedChk(ierr);
903       assert(starte==0);
904       break;
905     case CEED_EVAL_DIV: break; // Not implemented
906     case CEED_EVAL_CURL: break; // Not implemented
907     }
908   }
909   return 0;
910 }
911 
912 /*
913   CeedOperator needs to connect all the named fields (be they active or passive)
914   to the named inputs and outputs of its CeedQFunction.
915  */
916 static int CeedOperatorSetup_Magma(CeedOperator op) {
917   int ierr;
918   bool setupdone;
919   ierr = CeedOperatorGetSetupStatus(op, &setupdone); CeedChk(ierr);
920   if (setupdone) return 0;
921   Ceed ceed;
922   ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
923   CeedOperator_Magma *data;
924   ierr = CeedOperatorGetData(op, (void*)&data); CeedChk(ierr);
925   CeedQFunction qf;
926   ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr);
927   CeedInt Q, numinputfields, numoutputfields;
928   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChk(ierr);
929   ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields);
930   CeedOperatorField *opinputfields, *opoutputfields;
931   ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields);
932   CeedChk(ierr);
933   CeedQFunctionField *qfinputfields, *qfoutputfields;
934   ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields);
935   CeedChk(ierr);
936 
937   data->numein = numinputfields;
938   data->numeout = numoutputfields;
939 
940   // Allocate
941   const CeedInt numIO = numinputfields + numoutputfields;
942 
943   ierr = CeedCalloc(numinputfields + numoutputfields, &data->Evecs);
944   CeedChk(ierr);
945   ierr = CeedCalloc(numinputfields + numoutputfields, &data->Edata);
946   CeedChk(ierr);
947 
948   ierr = CeedCalloc(16, &data->evecsin); CeedChk(ierr);
949   ierr = CeedCalloc(16, &data->evecsout); CeedChk(ierr);
950   ierr = CeedCalloc(16, &data->qvecsin); CeedChk(ierr);
951   ierr = CeedCalloc(16, &data->qvecsout); CeedChk(ierr);
952 
953   // Set up infield and outfield pointer arrays
954   // Infields
955   ierr = CeedOperatorSetupFields_Magma(qf, op, 0, data->Evecs,
956                                      data->evecsin, data->qvecsin, 0,
957                                      numinputfields, Q);
958   CeedChk(ierr);
959   // Outfields
960   ierr = CeedOperatorSetupFields_Magma(qf, op, 1, data->Evecs,
961                                      data->evecsout, data->qvecsout,
962                                      numinputfields, numoutputfields, Q);
963   CeedChk(ierr);
964   ierr = CeedOperatorSetSetupDone(op); CeedChk(ierr);
965   return 0;
966 }
967 
968 static int CeedOperatorApply_Magma(CeedOperator op, CeedVector invec,
969                                  CeedVector outvec, CeedRequest *request) {
970   int ierr;
971   Ceed ceed;
972   ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
973   CeedOperator_Magma *data;
974   ierr = CeedOperatorGetData(op, (void*)&data); CeedChk(ierr);
975   //CeedVector *E = data->Evecs, *D = data->D, outvec;
976   CeedInt Q, elemsize, numelements, numinputfields, numoutputfields, ncomp;
977   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChk(ierr);
978   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChk(ierr);
979   CeedQFunction qf;
980   ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr);
981   CeedTransposeMode lmode;
982   CeedOperatorField *opinputfields, *opoutputfields;
983   ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields);
984   CeedChk(ierr);
985   CeedQFunctionField *qfinputfields, *qfoutputfields;
986   ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields);
987   CeedChk(ierr);
988   CeedEvalMode emode;
989   CeedVector vec;
990   CeedBasis basis;
991   CeedElemRestriction Erestrict;
992 
993   ierr = CeedOperatorSetup_Magma(op); CeedChk(ierr);
994   ierr= CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields); CeedChk(ierr);
995 
996   // Input Evecs and Restriction
997   for (CeedInt i=0; i<numinputfields; i++) {
998     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
999     CeedChk(ierr);
1000     if (emode & CEED_EVAL_WEIGHT) {
1001     } else { // Restriction
1002       // Get input vector
1003       ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChk(ierr);
1004       if (vec == CEED_VECTOR_ACTIVE)
1005         vec = invec;
1006       // Restrict
1007       ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
1008       CeedChk(ierr);
1009       ierr = CeedOperatorFieldGetLMode(opinputfields[i], &lmode); CeedChk(ierr);
1010       ierr = CeedElemRestrictionApply(Erestrict, CEED_NOTRANSPOSE,
1011                                       lmode, vec, data->Evecs[i],
1012                                       request); CeedChk(ierr);
1013       // Get evec
1014       ierr = CeedVectorGetArrayRead(data->Evecs[i], CEED_MEM_HOST,
1015                                     (const CeedScalar **) &data->Edata[i]);
1016       CeedChk(ierr);
1017     }
1018   }
1019 
1020   // Output Evecs
1021   for (CeedInt i=0; i<numoutputfields; i++) {
1022     ierr = CeedVectorGetArray(data->Evecs[i+data->numein], CEED_MEM_HOST,
1023                               &data->Edata[i + numinputfields]); CeedChk(ierr);
1024   }
1025 
1026   // Loop through elements
1027   for (CeedInt e=0; e<numelements; e++) {
1028     // Input basis apply if needed
1029     for (CeedInt i=0; i<numinputfields; i++) {
1030       // Get elemsize, emode, ncomp
1031       ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
1032       CeedChk(ierr);
1033       ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1034       CeedChk(ierr);
1035       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1036       CeedChk(ierr);
1037       ierr = CeedQFunctionFieldGetNumComponents(qfinputfields[i], &ncomp);
1038       CeedChk(ierr);
1039       // Basis action
1040       switch(emode) {
1041       case CEED_EVAL_NONE:
1042         ierr = CeedVectorSetArray(data->qvecsin[i], CEED_MEM_HOST,
1043                                   CEED_USE_POINTER,
1044                                   &data->Edata[i][e*Q*ncomp]); CeedChk(ierr);
1045         break;
1046       case CEED_EVAL_INTERP:
1047         ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
1048         ierr = CeedVectorSetArray(data->evecsin[i], CEED_MEM_HOST,
1049                                   CEED_USE_POINTER,
1050                                   &data->Edata[i][e*elemsize*ncomp]);
1051         CeedChk(ierr);
1052         ierr = CeedBasisApply(basis, 1, CEED_NOTRANSPOSE,
1053                               CEED_EVAL_INTERP, data->evecsin[i],
1054                               data->qvecsin[i]); CeedChk(ierr);
1055         break;
1056       case CEED_EVAL_GRAD:
1057         ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
1058         ierr = CeedVectorSetArray(data->evecsin[i], CEED_MEM_HOST,
1059                                   CEED_USE_POINTER,
1060                                   &data->Edata[i][e*elemsize*ncomp]);
1061         CeedChk(ierr);
1062         ierr = CeedBasisApply(basis, 1, CEED_NOTRANSPOSE,
1063                               CEED_EVAL_GRAD, data->evecsin[i],
1064                               data->qvecsin[i]); CeedChk(ierr);
1065         break;
1066       case CEED_EVAL_WEIGHT:
1067         break;  // No action
1068       case CEED_EVAL_DIV:
1069         break; // Not implemented
1070       case CEED_EVAL_CURL:
1071         break; // Not implemented
1072       }
1073     }
1074     // Output pointers
1075     for (CeedInt i=0; i<numoutputfields; i++) {
1076       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1077       CeedChk(ierr);
1078       if (emode == CEED_EVAL_NONE) {
1079         ierr = CeedQFunctionFieldGetNumComponents(qfoutputfields[i], &ncomp);
1080         CeedChk(ierr);
1081         ierr = CeedVectorSetArray(data->qvecsout[i], CEED_MEM_HOST,
1082                                   CEED_USE_POINTER,
1083                                   &data->Edata[i + numinputfields][e*Q*ncomp]);
1084         CeedChk(ierr);
1085       }
1086       if (emode == CEED_EVAL_INTERP) {
1087       }
1088       if (emode == CEED_EVAL_GRAD) {
1089       }
1090       if (emode == CEED_EVAL_WEIGHT) {
1091       }
1092     }
1093 
1094     // Q function
1095     ierr = CeedQFunctionApply(qf, Q, data->qvecsin, data->qvecsout); CeedChk(ierr);
1096 
1097     // Output basis apply if needed
1098     for (CeedInt i=0; i<numoutputfields; i++) {
1099       // Get elemsize, emode, ncomp
1100       ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
1101       CeedChk(ierr);
1102       ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1103       CeedChk(ierr);
1104       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1105       CeedChk(ierr);
1106       ierr = CeedQFunctionFieldGetNumComponents(qfoutputfields[i], &ncomp);
1107       CeedChk(ierr);
1108       // Basis action
1109       switch(emode) {
1110       case CEED_EVAL_NONE:
1111         break; // No action
1112       case CEED_EVAL_INTERP:
1113         ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis);
1114         CeedChk(ierr);
1115         ierr = CeedVectorSetArray(data->evecsout[i], CEED_MEM_HOST,
1116                                   CEED_USE_POINTER,
1117                                   &data->Edata[i + numinputfields][e*elemsize*ncomp]);
1118         ierr = CeedBasisApply(basis, 1, CEED_TRANSPOSE,
1119                               CEED_EVAL_INTERP, data->qvecsout[i],
1120                               data->evecsout[i]); CeedChk(ierr);
1121         break;
1122       case CEED_EVAL_GRAD:
1123         ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis);
1124         CeedChk(ierr);
1125         ierr = CeedVectorSetArray(data->evecsout[i], CEED_MEM_HOST,
1126                                   CEED_USE_POINTER,
1127                                   &data->Edata[i + numinputfields][e*elemsize*ncomp]);
1128         ierr = CeedBasisApply(basis, 1, CEED_TRANSPOSE,
1129                               CEED_EVAL_GRAD, data->qvecsout[i],
1130                               data->evecsout[i]); CeedChk(ierr);
1131         break;
1132       case CEED_EVAL_WEIGHT: break; // Should not occur
1133       case CEED_EVAL_DIV: break; // Not implemented
1134       case CEED_EVAL_CURL: break; // Not implemented
1135       }
1136     }
1137   } // numelements
1138 
1139   // Zero lvecs
1140   for (CeedInt i=0; i<numoutputfields; i++) {
1141     ierr = CeedOperatorFieldGetVector(opoutputfields[i], &vec); CeedChk(ierr);
1142     if (vec == CEED_VECTOR_ACTIVE)
1143       vec = outvec;
1144     ierr = CeedVectorSetValue(vec, 0.0); CeedChk(ierr);
1145     }
1146 
1147   // Output restriction
1148   for (CeedInt i=0; i<numoutputfields; i++) {
1149     // Restore evec
1150     ierr = CeedVectorRestoreArray(data->Evecs[i+data->numein],
1151                                   &data->Edata[i + numinputfields]);
1152     CeedChk(ierr);
1153     // Get output vector
1154     ierr = CeedOperatorFieldGetVector(opoutputfields[i], &vec); CeedChk(ierr);
1155     // Active
1156     if (vec == CEED_VECTOR_ACTIVE)
1157       vec = outvec;
1158     // Restrict
1159     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
1160     CeedChk(ierr);
1161     ierr = CeedOperatorFieldGetLMode(opoutputfields[i], &lmode); CeedChk(ierr);
1162     ierr = CeedElemRestrictionApply(Erestrict, CEED_TRANSPOSE,
1163                                     lmode, data->Evecs[i+data->numein], vec,
1164                                     request); CeedChk(ierr);
1165   }
1166 
1167   // Restore input arrays
1168   for (CeedInt i=0; i<numinputfields; i++) {
1169     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1170     CeedChk(ierr);
1171     if (emode & CEED_EVAL_WEIGHT) {
1172     } else {
1173       // Restriction
1174       ierr = CeedVectorRestoreArrayRead(data->Evecs[i],
1175                                         (const CeedScalar **) &data->Edata[i]);
1176       CeedChk(ierr);
1177     }
1178   }
1179   return 0;
1180 }
1181 
1182 static int CeedOperatorCreate_Magma(CeedOperator op) {
1183   int ierr;
1184   CeedOperator_Magma *impl;
1185   Ceed ceed;
1186   ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
1187 
1188   ierr = CeedCalloc(1, &impl); CeedChk(ierr);
1189   ierr = CeedOperatorSetData(op, (void*)&impl);
1190 
1191   ierr = CeedSetBackendFunction(ceed, "Operator", op, "Apply",
1192                                 CeedOperatorApply_Magma); CeedChk(ierr);
1193   ierr = CeedSetBackendFunction(ceed, "Operator", op, "Destroy",
1194                                 CeedOperatorDestroy_Magma); CeedChk(ierr);
1195   return 0;
1196 }
1197 
1198 // *****************************************************************************
1199 // * INIT
1200 // *****************************************************************************
1201 static int CeedInit_Magma(const char *resource, Ceed ceed) {
1202   int ierr;
1203   if (strcmp(resource, "/gpu/magma"))
1204     return CeedError(ceed, 1, "MAGMA backend cannot use resource: %s", resource);
1205 
1206   ierr = magma_init();
1207   if (ierr) return CeedError(ceed, 1, "error in magma_init(): %d\n", ierr);
1208   //magma_print_environment();
1209 
1210   ceed->VecCreate = CeedVectorCreate_Magma;
1211   ceed->BasisCreateTensorH1 = CeedBasisCreateTensorH1_Magma;
1212   ceed->BasisCreateH1 = CeedBasisCreateH1_Magma;
1213   ceed->ElemRestrictionCreate = CeedElemRestrictionCreate_Magma;
1214   ceed->ElemRestrictionCreateBlocked = CeedElemRestrictionCreateBlocked_Magma;
1215   ceed->QFunctionCreate = CeedQFunctionCreate_Magma;
1216   ceed->OperatorCreate = CeedOperatorCreate_Magma;
1217   return 0;
1218 }
1219 
1220 // *****************************************************************************
1221 // * REGISTER
1222 // *****************************************************************************
// Runs automatically at library load (GCC/Clang constructor attribute) and
// registers this backend under "/gpu/magma"; the trailing 20 is presumably
// the registration priority -- confirm against CeedRegister's declaration.
__attribute__((constructor))
static void Register(void) {
  CeedRegister("/gpu/magma", CeedInit_Magma, 20);
}
1227