xref: /libCEED/rust/libceed-sys/c-src/backends/magma/ceed-magma.c (revision f8902d9e98717dcaf6fb3f91664d427ddd6be1e8)
1 // Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at
2 // the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights
3 // reserved. See files LICENSE and NOTICE for details.
4 //
5 // This file is part of CEED, a collection of benchmarks, miniapps, software
6 // libraries and APIs for efficient high-order finite element and spectral
7 // element discretizations for exascale applications. For more information and
8 // source code availability see http://github.com/ceed.
9 //
10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11 // a collaborative effort of two U.S. Department of Energy organizations (Office
12 // of Science and the National Nuclear Security Administration) responsible for
13 // the planning and preparation of a capable exascale ecosystem, including
14 // software, applications, hardware, advanced system engineering and early
15 // testbed platforms, in support of the nation's exascale computing imperative.
16 
17 #include "ceed-magma.h"
18 #include <string.h>
19 
// Backend data for a CeedVector: a host copy and a device copy of the values.
typedef struct {
  CeedScalar *array;   // host copy (pinned when backend-allocated; may be user memory)
  CeedScalar *darray;  // device (GPU) copy
  int  own_;           // 1 if the backend must free both array and darray on destroy
  int down_;           // 1 if the backend must free only darray (host USE_POINTER case)
} CeedVector_Magma;
26 
// Backend data for a CeedElemRestriction: host and device copies of the index list.
typedef struct {
  CeedInt *indices;     // host copy of the element indices (pinned when backend-allocated)
  CeedInt *dindices;    // device (GPU) copy of the element indices
  int  own_;            // 1 if the backend must free both indices and dindices
  int down_;            // cover a case where we own Device memory only (host USE_POINTER)
} CeedElemRestriction_Magma;
33 
// Backend data for a CeedQFunction: scratch arrays of per-argument data pointers
// handed to the user-provided Q-function callback.
typedef struct {
  const CeedScalar **inputs;   // pointers to the input field data
  CeedScalar **outputs;        // pointers to the output field data
} CeedQFunction_Magma;
38 
// Backend data for a CeedOperator: the element- and quadrature-space work
// vectors needed to apply the operator.
typedef struct {
  CeedVector *Evecs; /// E-vectors needed to apply operator (in followed by out)
  CeedScalar **Edata;    /// Raw data pointers for the E-vectors
  CeedVector *evecsin;   /// Input E-vectors needed to apply operator
  CeedVector *evecsout;  /// Output E-vectors needed to apply operator
  CeedVector *qvecsin;   /// Input Q-vectors needed to apply operator
  CeedVector *qvecsout;  /// Output Q-vectors needed to apply operator
  CeedInt    numein;     /// Number of input E-vectors
  CeedInt    numeout;    /// Number of output E-vectors
} CeedOperator_Magma;
49 
// *****************************************************************************
// * Initialize vector vec (after freeing any previously owned memory) with
// * values from array, according to cmode:
// *   CEED_COPY_VALUES: backend allocates a device buffer and a pinned host
// *                     buffer, and copies the data (passed pointer not stored)
// *   CEED_OWN_POINTER: backend stores array and takes ownership of it
// *   CEED_USE_POINTER: backend stores array without taking ownership
// * mtype: CEED_MEM_HOST or CEED_MEM_DEVICE — the location of array
// *****************************************************************************
static int CeedVectorSetArray_Magma(CeedVector vec, CeedMemType mtype,
                                    CeedCopyMode cmode, CeedScalar *array) {
  CeedVector_Magma *impl = vec->data;
  int ierr;

  // If own data, free the "old" data, e.g., as it may be of different size
  if (impl->own_) {
    magma_free( impl->darray );
    magma_free_pinned( impl->array );
    impl->darray = NULL;
    impl->array  = NULL;
    impl->own_ = 0;
    impl->down_= 0;
  }

  if (mtype == CEED_MEM_HOST) {
    // memory is on the host
    switch (cmode) {
    case CEED_COPY_VALUES:
      // Backend-owned device buffer plus pinned host mirror
      ierr = magma_malloc( (void **)&impl->darray,
                           vec->length * sizeof(CeedScalar)); CeedChk(ierr);
      ierr = magma_malloc_pinned( (void **)&impl->array,
                                  vec->length * sizeof(CeedScalar)); CeedChk(ierr);
      impl->own_ = 1;

      // Upload the user's host data to the device copy
      if (array != NULL)
        magma_setvector(vec->length, sizeof(array[0]),
                        array, 1, impl->darray, 1);
      break;
    case CEED_OWN_POINTER:
      ierr = magma_malloc( (void **)&impl->darray,
                           vec->length * sizeof(CeedScalar)); CeedChk(ierr);
      // TODO: possible problem here is if we are passed non-pinned memory;
      //       (as we own it, later in destroy, we use free for pinned memory).
      impl->array = array;
      impl->own_ = 1;

      if (array != NULL)
        magma_setvector(vec->length, sizeof(array[0]),
                        array, 1, impl->darray, 1);
      break;
    case CEED_USE_POINTER:
      // User keeps ownership of the host data; only the device mirror is
      // backend-owned, which down_ = 1 records for destroy.
      ierr = magma_malloc( (void **)&impl->darray,
                           vec->length * sizeof(CeedScalar)); CeedChk(ierr);
      magma_setvector(vec->length, sizeof(array[0]),
                      array, 1, impl->darray, 1);

      impl->down_  = 1;
      impl->array  = array;
    }
  } else if (mtype == CEED_MEM_DEVICE) {
    // memory is on the device
    switch (cmode) {
    case CEED_COPY_VALUES:
      ierr = magma_malloc( (void **)&impl->darray,
                           vec->length * sizeof(CeedScalar)); CeedChk(ierr);
      ierr = magma_malloc_pinned( (void **)&impl->array,
                                  vec->length * sizeof(CeedScalar)); CeedChk(ierr);
      impl->own_ = 1;

      if (array)
        // Device-to-device copy of the user's data
        magma_copyvector(vec->length, sizeof(array[0]),
                         array, 1, impl->darray, 1);
      else
        // t30 assumes allocation initializes with 0s
        // NOTE(review): impl->array was just pinned-allocated and is not
        // explicitly zeroed here — confirm the allocation zero-fills.
        magma_setvector(vec->length, sizeof(array[0]),
                        impl->array, 1, impl->darray, 1);
      break;
    case CEED_OWN_POINTER:
      // Take ownership of the user's device buffer; add a pinned host mirror
      impl->darray = array;
      ierr = magma_malloc_pinned( (void **)&impl->array,
                                  vec->length * sizeof(CeedScalar)); CeedChk(ierr);
      impl->own_ = 1;

      break;
    case CEED_USE_POINTER:
      // Borrow the user's device buffer; no host copy exists
      impl->darray = array;
      impl->array  = NULL;
    }

  } else
    return CeedError(vec->ceed, 1, "Only MemType = HOST or DEVICE supported");

  return 0;
}
143 
// *****************************************************************************
// * Give a writable data pointer from vector vec in array (on HOST or DEVICE).
// * If the vector has no storage yet, this allocates it (NULL-array SetArray).
// *****************************************************************************
static int CeedVectorGetArray_Magma(CeedVector vec, CeedMemType mtype,
                                    CeedScalar **array) {
  CeedVector_Magma *impl = vec->data;
  int ierr;

  if (mtype == CEED_MEM_HOST) {
    if (impl->own_) {
      // data is owned so GPU had the most up-to-date version; copy it down
      // TTT - apparently it doesn't have most up to date data
      magma_getvector(vec->length, sizeof(*array[0]),
                      impl->darray, 1, impl->array, 1);
      CeedDebug("\033[31m[CeedVectorGetArray_Magma]");
    } else if (impl->array == NULL) {
      // Vector doesn't own the data and was set on GPU
      if (impl->darray == NULL) {
        // call was made just to allocate memory
        ierr = CeedVectorSetArray(vec, mtype, CEED_COPY_VALUES, NULL);
        CeedChk(ierr);
      } else
        return CeedError(vec->ceed, 1, "Can not access DEVICE vector on HOST");
    }
    *array = impl->array;
  } else if (mtype == CEED_MEM_DEVICE) {
    if (impl->darray == NULL) {
      // Vector doesn't own the data and was set on the CPU
      if (impl->array == NULL) {
        // call was made just to allocate memory
        ierr = CeedVectorSetArray(vec, mtype, CEED_COPY_VALUES, NULL);
        CeedChk(ierr);
      } else
        return CeedError(vec->ceed, 1, "Can not access HOST vector on DEVICE");
    }
    *array = impl->darray;
  } else
    return CeedError(vec->ceed, 1, "Can only provide to HOST or DEVICE memory");

  return 0;
}
186 
// *****************************************************************************
// * Give a read-only data pointer from vector vec in array (on HOST or DEVICE).
// * Mirrors CeedVectorGetArray_Magma but hands out a const pointer.
// *****************************************************************************
static int CeedVectorGetArrayRead_Magma(CeedVector vec, CeedMemType mtype,
                                        const CeedScalar **array) {
  CeedVector_Magma *impl = vec->data;
  int ierr;

  if (mtype == CEED_MEM_HOST) {
    if (impl->own_) {
      // data is owned so GPU had the most up-to-date version; copy it down
      magma_getvector(vec->length, sizeof(*array[0]),
                      impl->darray, 1, impl->array, 1);
    } else if (impl->array == NULL) {
      // Vector doesn't own the data and was set on GPU
      if (impl->darray == NULL) {
        // call was made just to allocate memory
        ierr = CeedVectorSetArray(vec, mtype, CEED_COPY_VALUES, NULL);
        CeedChk(ierr);
      } else
        return CeedError(vec->ceed, 1, "Can not access DEVICE vector on HOST");
    }
    *array = impl->array;
  } else if (mtype == CEED_MEM_DEVICE) {
    if (impl->darray == NULL) {
      // Vector doesn't own the data and was set on the CPU
      if (impl->array == NULL) {
        // call was made just to allocate memory
        ierr = CeedVectorSetArray(vec, mtype, CEED_COPY_VALUES, NULL);
        CeedChk(ierr);
      } else
        return CeedError(vec->ceed, 1, "Can not access HOST vector on DEVICE");
    }
    *array = impl->darray;
  } else
    return CeedError(vec->ceed, 1, "Can only provide to HOST or DEVICE memory");

  return 0;
}
226 
227 // *****************************************************************************
228 // * There is no mtype here for array so it is not clear if we restore from HOST
229 // * memory or from DEVICE memory. We assume that it is CPU memory because if
230 // * it was GPU memory we would not call this routine at all.
231 // * Restore vector vec with values from array, where array received its values
232 // * from vec and possibly modified them.
233 // *****************************************************************************
234 static int CeedVectorRestoreArray_Magma(CeedVector vec) {
235   CeedVector_Magma *impl = vec->data;
236 
237   if (impl->down_) {
238     // nothing to do if array is on GPU, except if down_=1(case CPU use pointer)
239     magma_getvector(vec->length, sizeof(*array[0]),
240                     impl->darray, 1, impl->array, 1);
241   }
242   return 0;
243 }
244 
245 // *****************************************************************************
246 // * There is no mtype here for array so it is not clear if we restore from HOST
247 // * memory or from DEVICE memory. We assume that it is CPU memory because if
248 // * it was GPU memory we would not call this routine at all.
249 // * Restore vector vec with values from array, where array received its values
250 // * from vec to only read them; in this case vec may have been modified meanwhile
251 // * and needs to be restored here.
252 // *****************************************************************************
253 static int CeedVectorRestoreArrayRead_Magma(CeedVector vec) {
254   CeedVector_Magma *impl = vec->data;
255 
256   if (impl->down_) {
257     // nothing to do if array is on GPU, except if down_=1(case CPU use pointer)
258     magma_getvector(vec->length, sizeof(*array[0]),
259                     impl->darray, 1, impl->array, 1);
260   }
261   return 0;
262 }
263 
264 static int CeedVectorDestroy_Magma(CeedVector vec) {
265   CeedVector_Magma *impl = vec->data;
266   int ierr;
267 
268   // Free if we own the data
269   if (impl->own_) {
270     ierr = magma_free_pinned(impl->array); CeedChk(ierr);
271     ierr = magma_free(impl->darray);       CeedChk(ierr);
272   } else if (impl->down_) {
273     ierr = magma_free(impl->darray);       CeedChk(ierr);
274   }
275   ierr = CeedFree(&vec->data); CeedChk(ierr);
276   return 0;
277 }
278 
279 // *****************************************************************************
280 // * Create vector vec of size n
281 // *****************************************************************************
282 static int CeedVectorCreate_Magma(CeedInt n, CeedVector vec) {
283   CeedVector_Magma *impl;
284   int ierr;
285 
286   vec->SetArray = CeedVectorSetArray_Magma;
287   vec->GetArray = CeedVectorGetArray_Magma;
288   vec->GetArrayRead = CeedVectorGetArrayRead_Magma;
289   vec->RestoreArray = CeedVectorRestoreArray_Magma;
290   vec->RestoreArrayRead = CeedVectorRestoreArrayRead_Magma;
291   vec->Destroy = CeedVectorDestroy_Magma;
292   ierr = CeedCalloc(1,&impl); CeedChk(ierr);
293   impl->darray = NULL;
294   impl->array  = NULL;
295   impl->own_ = 0;
296   impl->down_= 0;
297   vec->data = impl;
298   return 0;
299 }
300 
301 
302 // *****************************************************************************
303 // * Apply restriction operator r to u: v = r(rmode) u
304 // *****************************************************************************
305 static int CeedElemRestrictionApply_Magma(CeedElemRestriction r,
306     CeedTransposeMode tmode,
307     CeedTransposeMode lmode, CeedVector u,
308     CeedVector v, CeedRequest *request) {
309   CeedElemRestriction_Magma *impl = r->data;
310   int ierr;
311   const CeedScalar *uu;
312   CeedScalar *vv;
313   CeedInt nelem = r->nelem, elemsize = r->elemsize, ndof = r->ndof,
314           ncomp=r->ncomp;
315   CeedInt esize = nelem * elemsize;
316 
317   #ifdef USE_MAGMA_BATCH2
318   CeedInt *dindices = impl->dindices;
319   // Get pointers on the device
320   ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &uu); CeedChk(ierr);
321   ierr = CeedVectorGetArray(v, CEED_MEM_DEVICE, &vv); CeedChk(ierr);
322   #else
323   CeedInt *indices = impl->indices;
324   ierr = CeedVectorGetArrayRead(u, CEED_MEM_HOST, &uu); CeedChk(ierr);
325   ierr = CeedVectorGetArray(v, CEED_MEM_HOST, &vv); CeedChk(ierr);
326   #endif
327 
328   if (tmode == CEED_NOTRANSPOSE) {
329     // Perform: v = r * u
330     if (!impl->indices) {
331       for (CeedInt i=0; i<esize*ncomp; i++) vv[i] = uu[i];
332     } else if (ncomp == 1) {
333       #ifdef USE_MAGMA_BATCH2
334 magma_template<<i=0:esize>>
335       (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices) {
336         vv[i] = uu[dindices[i]];
337       }
338       #else
339       for (CeedInt i=0; i<esize; i++) vv[i] = uu[indices[i]];
340       #endif
341     } else {
342       // vv is (elemsize x ncomp x nelem), column-major
343       if (lmode == CEED_NOTRANSPOSE) { // u is (ndof x ncomp), column-major
344         #ifdef USE_MAGMA_BATCH2
345 magma_template<<e=0:nelem, d=0:ncomp, i=0:elemsize>>
346         (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices, int ndof) {
347           vv[i + iend*(d+dend*e)] = uu[dindices[i+iend*e]+ndof*d];
348         }
349         #else
350         for (CeedInt e = 0; e < nelem; e++)
351           for (CeedInt d = 0; d < ncomp; d++)
352             for (CeedInt i=0; i < elemsize; i++) {
353               vv[i + elemsize*(d+ncomp*e)] =
354                 uu[indices[i+elemsize*e]+ndof*d];
355             }
356         #endif
357       } else { // u is (ncomp x ndof), column-major
358         #ifdef USE_MAGMA_BATCH2
359 magma_template<<e=0:nelem, d=0:ncomp, i=0:elemsize>>
360         (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices) {
361           vv[i + iend*(d+dend*e)] = uu[d+dend*dindices[i + iend*e]];
362         }
363         #else
364         for (CeedInt e = 0; e < nelem; e++)
365           for (CeedInt d = 0; d < ncomp; d++)
366             for (CeedInt i=0; i< elemsize; i++) {
367               vv[i + elemsize*(d+ncomp*e)] =
368                 uu[d+ncomp*indices[i+elemsize*e]];
369             }
370         #endif
371       }
372     }
373   } else {
374     // Note: in transpose mode, we perform: v += r^t * u
375     if (!impl->indices) {
376       for (CeedInt i=0; i<esize; i++) vv[i] += uu[i];
377     } else if (ncomp == 1) {
378       // fprintf(stderr,"3 ---------\n");
379       #ifdef USE_MAGMA_BATCH2
380 magma_template<<i=0:esize>>
381       (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices) {
382         magmablas_datomic_add( &vv[dindices[i]], uu[i]);
383       }
384       #else
385       for (CeedInt i=0; i<esize; i++) vv[indices[i]] += uu[i];
386       #endif
387     } else { // u is (elemsize x ncomp x nelem)
388       fprintf(stderr,"2 ---------\n");
389 
390       if (lmode == CEED_NOTRANSPOSE) { // vv is (ndof x ncomp), column-major
391         #ifdef USE_MAGMA_BATCH2
392 magma_template<<e=0:nelem, d=0:ncomp, i=0:elemsize>>
393         (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices, CeedInt ndof) {
394           magmablas_datomic_add( &vv[dindices[i+iend*e]+ndof*d],
395                                  uu[i+iend*(d+e*dend)]);
396         }
397         #else
398         for (CeedInt e = 0; e < nelem; e++)
399           for (CeedInt d = 0; d < ncomp; d++)
400             for (CeedInt i=0; i < elemsize; i++) {
401               vv[indices[i + elemsize*e]+ndof*d] +=
402                 uu[i + elemsize*(d+e*ncomp)];
403             }
404         #endif
405       } else { // vv is (ncomp x ndof), column-major
406         #ifdef USE_MAGMA_BATCH2
407 magma_template<<e=0:nelem, d=0:ncomp, i=0:elemsize>>
408         (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices) {
409           magmablas_datomic_add( &vv[d+dend*dindices[i + iend*e]],
410                                  uu[i+iend*(d+e*dend)]);
411         }
412         #else
413         for (CeedInt e = 0; e < nelem; e++)
414           for (CeedInt d = 0; d < ncomp; d++)
415             for (CeedInt i=0; i < elemsize; i++) {
416               vv[d+ncomp*indices[i + elemsize*e]] +=
417                 uu[i + elemsize*(d+e*ncomp)];
418             }
419         #endif
420       }
421     }
422   }
423 
424   ierr = CeedVectorRestoreArrayRead(u, &uu); CeedChk(ierr);
425   ierr = CeedVectorRestoreArray(v, &vv); CeedChk(ierr);
426 
427   if (request != CEED_REQUEST_IMMEDIATE && request != CEED_REQUEST_ORDERED)
428     *request = NULL;
429   return 0;
430 }
431 
432 int CeedElemRestrictionApplyBlock_Magma(CeedElemRestriction r,
433     CeedInt block, CeedTransposeMode tmode, CeedTransposeMode lmode,
434     CeedVector u, CeedVector v, CeedRequest *request) {
435   int ierr;
436   Ceed ceed;
437   ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChk(ierr);
438   return CeedError(ceed, 1, "Backend does not implement blocked restrictions");
439 }
440 
441 static int CeedElemRestrictionDestroy_Magma(CeedElemRestriction r) {
442   CeedElemRestriction_Magma *impl = r->data;
443   int ierr;
444 
445   // Free if we own the data
446   if (impl->own_) {
447     ierr = magma_free_pinned(impl->indices); CeedChk(ierr);
448     ierr = magma_free(impl->dindices);       CeedChk(ierr);
449   } else if (impl->down_) {
450     ierr = magma_free(impl->dindices);       CeedChk(ierr);
451   }
452   ierr = CeedFree(&r->data); CeedChk(ierr);
453   return 0;
454 }
455 
// *****************************************************************************
// * Create restriction r from the given indices, mirroring the copy-mode
// * handling of CeedVectorSetArray_Magma (host/device, copy/own/use pointer).
// *****************************************************************************
static int CeedElemRestrictionCreate_Magma(CeedMemType mtype,
    CeedCopyMode cmode,
    const CeedInt *indices, CeedElemRestriction r) {
  int ierr, size = r->nelem*r->elemsize;
  CeedElemRestriction_Magma *impl;

  // Allocate memory for the MAGMA Restriction and initialize pointers to NULL
  ierr = CeedCalloc(1,&impl); CeedChk(ierr);
  impl->dindices = NULL;
  impl->indices  = NULL;
  impl->own_ = 0;
  impl->down_= 0;

  if (mtype == CEED_MEM_HOST) {
    // memory is on the host
    switch (cmode) {
    case CEED_COPY_VALUES:
      // Backend-owned device buffer plus pinned host mirror
      ierr = magma_malloc( (void **)&impl->dindices,
                           size * sizeof(CeedInt)); CeedChk(ierr);
      ierr = magma_malloc_pinned( (void **)&impl->indices,
                                  size * sizeof(CeedInt)); CeedChk(ierr);
      impl->own_ = 1;

      // Copy host-side, then upload to the device copy
      if (indices != NULL) {
        memcpy(impl->indices, indices, size * sizeof(indices[0]));
        magma_setvector(size, sizeof(CeedInt),
                        impl->indices, 1, impl->dindices, 1);
      }
      break;
    case CEED_OWN_POINTER:
      ierr = magma_malloc( (void **)&impl->dindices,
                           size * sizeof(CeedInt)); CeedChk(ierr);
      // TODO: possible problem here is if we are passed non-pinned memory;
      //       (as we own it, later in destroy, we use free for pinned memory).
      impl->indices = (CeedInt *)indices;
      impl->own_ = 1;

      if (indices != NULL)
        magma_setvector(size, sizeof(CeedInt),
                        indices, 1, impl->dindices, 1);
      break;
    case CEED_USE_POINTER:
      // User keeps ownership of the host indices; only the device mirror
      // is backend-owned (down_ = 1)
      ierr = magma_malloc( (void **)&impl->dindices,
                           size * sizeof(CeedInt)); CeedChk(ierr);
      magma_setvector(size, sizeof(CeedInt),
                      indices, 1, impl->dindices, 1);
      impl->down_ = 1;
      impl->indices  = (CeedInt *)indices;
    }
  } else if (mtype == CEED_MEM_DEVICE) {
    // memory is on the device
    switch (cmode) {
    case CEED_COPY_VALUES:
      ierr = magma_malloc( (void **)&impl->dindices,
                           size * sizeof(CeedInt)); CeedChk(ierr);
      ierr = magma_malloc_pinned( (void **)&impl->indices,
                                  size * sizeof(CeedInt)); CeedChk(ierr);
      impl->own_ = 1;

      if (indices)
        // Device-to-device copy of the user's indices
        magma_copyvector(size, sizeof(CeedInt),
                         indices, 1, impl->dindices, 1);
      break;
    case CEED_OWN_POINTER:
      // Take ownership of the user's device buffer; add a pinned host mirror
      impl->dindices = (CeedInt *)indices;
      ierr = magma_malloc_pinned( (void **)&impl->indices,
                                  size * sizeof(CeedInt)); CeedChk(ierr);
      impl->own_ = 1;

      break;
    case CEED_USE_POINTER:
      // Borrow the user's device buffer; no host copy exists
      impl->dindices = (CeedInt *)indices;
      impl->indices  = NULL;
    }

  } else
    return CeedError(r->ceed, 1, "Only MemType = HOST or DEVICE supported");

  r->data    = impl;
  r->Apply   = CeedElemRestrictionApply_Magma;
  r->ApplyBlock = CeedElemRestrictionApplyBlock_Magma;
  r->Destroy = CeedElemRestrictionDestroy_Magma;

  return 0;
}
541 
542 static int CeedElemRestrictionCreateBlocked_Magma(CeedMemType mtype,
543     CeedCopyMode cmode,
544     const CeedInt *indices, CeedElemRestriction r) {
545   return CeedError(r->ceed, 1, "Backend does not implement blocked restrictions");
546 }
547 
548 // Contracts on the middle index
549 // NOTRANSPOSE: V_ajc = T_jb U_abc
550 // TRANSPOSE:   V_ajc = T_bj U_abc
551 // If Add != 0, "=" is replaced by "+="
552 static int CeedTensorContract_Magma(Ceed ceed,
553                                     CeedInt A, CeedInt B, CeedInt C, CeedInt J,
554                                     const CeedScalar *t, CeedTransposeMode tmode,
555                                     const CeedInt Add,
556                                     const CeedScalar *u, CeedScalar *v) {
557   #ifdef USE_MAGMA_BATCH
558   magma_dtensor_contract(ceed, A, B, C, J, t, tmode, Add, u, v);
559   #else
560   CeedInt tstride0 = B, tstride1 = 1;
561   if (tmode == CEED_TRANSPOSE) {
562     tstride0 = 1; tstride1 = J;
563   }
564   CeedDebug("\033[31m[CeedTensorContract] A=%d, J=%d, C=%d, B=%d: %d %d %d",
565             A,J,C,B,A*J*B*C, C*J*A, C*B*A);
566   for (CeedInt a=0; a<A; a++) {
567     for (CeedInt j=0; j<J; j++) {
568       if (!Add) {
569         for (CeedInt c=0; c<C; c++)
570           v[(a*J+j)*C+c] = 0;
571       }
572       for (CeedInt b=0; b<B; b++) {
573         for (CeedInt c=0; c<C; c++) {
574           v[(a*J+j)*C+c] += t[j*tstride0 + b*tstride1] * u[(a*B+b)*C+c];
575         }
576       }
577     }
578   }
579   #endif
580   return 0;
581 }
582 
583 static int CeedBasisApply_Magma(CeedBasis basis, CeedInt nelem,
584                                 CeedTransposeMode tmode, CeedEvalMode emode,
585                                 CeedVector U, CeedVector V) {
586   int ierr;
587   const CeedInt dim = basis->dim;
588   const CeedInt ncomp = basis->ncomp;
589   const CeedInt nqpt = ncomp*CeedPowInt(basis->Q1d, dim);
590   const CeedInt add = (tmode == CEED_TRANSPOSE);
591   const CeedScalar *u;
592   CeedScalar *v;
593   if (U) {
594     ierr = CeedVectorGetArrayRead(U, CEED_MEM_HOST, &u); CeedChk(ierr);
595   } else if (emode != CEED_EVAL_WEIGHT) {
596     return CeedError(ceed, 1,
597                      "An input vector is required for this CeedEvalMode");
598   }
599   ierr = CeedVectorGetArray(V, CEED_MEM_HOST, &v); CeedChk(ierr);
600 
601   if (nelem != 1)
602     return CeedError(basis->ceed, 1,
603                      "This backend does not support BasisApply for multiple elements");
604 
605   CeedDebug("\033[01m[CeedBasisApply_Magma] vsize=%d",
606             ncomp*CeedPowInt(basis->P1d, dim));
607 
608   if (tmode == CEED_TRANSPOSE) {
609     const CeedInt vsize = ncomp*CeedPowInt(basis->P1d, dim);
610     for (CeedInt i = 0; i < vsize; i++)
611       v[i] = (CeedScalar) 0;
612   }
613   if (emode & CEED_EVAL_INTERP) {
614     CeedInt P = basis->P1d, Q = basis->Q1d;
615     if (tmode == CEED_TRANSPOSE) {
616       P = basis->Q1d; Q = basis->P1d;
617     }
618     CeedInt pre = ncomp*CeedPowInt(P, dim-1), post = 1;
619     CeedScalar tmp[2][ncomp*Q*CeedPowInt(P>Q?P:Q, dim-1)];
620     CeedDebug("\033[01m[CeedBasisApply_Magma] tmpsize = %d",
621               ncomp*Q*CeedPowInt(P>Q?P:Q, dim-1));
622     for (CeedInt d=0; d<dim; d++) {
623       ierr = CeedTensorContract_Magma(basis->ceed, pre, P, post, Q, basis->interp1d,
624                                       tmode, add&&(d==dim-1),
625                                       d==0?u:tmp[d%2], d==dim-1?v:tmp[(d+1)%2]);
626       CeedChk(ierr);
627       pre /= P;
628       post *= Q;
629     }
630     if (tmode == CEED_NOTRANSPOSE) {
631       v += nqpt;
632     } else {
633       u += nqpt;
634     }
635   }
636   if (emode & CEED_EVAL_GRAD) {
637     CeedInt P = basis->P1d, Q = basis->Q1d;
638     // In CEED_NOTRANSPOSE mode:
639     // u is (P^dim x nc), column-major layout (nc = ncomp)
640     // v is (Q^dim x nc x dim), column-major layout (nc = ncomp)
641     // In CEED_TRANSPOSE mode, the sizes of u and v are switched.
642     if (tmode == CEED_TRANSPOSE) {
643       P = basis->Q1d, Q = basis->P1d;
644     }
645     CeedScalar tmp[2][ncomp*Q*CeedPowInt(P>Q?P:Q, dim-1)];
646     CeedDebug("\033[01m[CeedBasisApply_Magma] tmpsize = %d",
647               ncomp*Q*CeedPowInt(P>Q?P:Q, dim-1));
648     for (CeedInt p = 0; p < dim; p++) {
649       CeedInt pre = ncomp*CeedPowInt(P, dim-1), post = 1;
650       for (CeedInt d=0; d<dim; d++) {
651         ierr = CeedTensorContract_Magma(basis->ceed, pre, P, post, Q,
652                                         (p==d)?basis->grad1d:basis->interp1d,
653                                         tmode, add&&(d==dim-1),
654                                         d==0?u:tmp[d%2], d==dim-1?v:tmp[(d+1)%2]);
655         CeedChk(ierr);
656         pre /= P;
657         post *= Q;
658       }
659       if (tmode == CEED_NOTRANSPOSE) {
660         v += nqpt;
661       } else {
662         u += nqpt;
663       }
664     }
665   }
666   if (emode & CEED_EVAL_WEIGHT) {
667     if (tmode == CEED_TRANSPOSE)
668       return CeedError(basis->ceed, 1,
669                        "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE");
670     CeedInt Q = basis->Q1d;
671     for (CeedInt d=0; d<dim; d++) {
672       CeedInt pre = CeedPowInt(Q, dim-d-1), post = CeedPowInt(Q, d);
673       for (CeedInt i=0; i<pre; i++) {
674         for (CeedInt j=0; j<Q; j++) {
675           for (CeedInt k=0; k<post; k++) {
676             v[(i*Q + j)*post + k] = basis->qweight1d[j]
677                                     * (d == 0 ? 1 : v[(i*Q + j)*post + k]);
678           }
679         }
680       }
681     }
682   }
683   if (U) {
684     ierr = CeedVectorRestoreArrayRead(U, &u); CeedChk(ierr);
685   }
686   ierr = CeedVectorRestoreArray(V, &v); CeedChk(ierr);
687   return 0;
688 }
689 
// Destroy basis: nothing to release — this backend stores no basis data.
static int CeedBasisDestroy_Magma(CeedBasis basis) {
  return 0;
}
693 
694 static int CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P1d,
695     CeedInt Q1d, const CeedScalar *interp1d,
696     const CeedScalar *grad1d,
697     const CeedScalar *qref1d,
698     const CeedScalar *qweight1d,
699     CeedBasis basis) {
700   basis->Apply = CeedBasisApply_Magma;
701   basis->Destroy = CeedBasisDestroy_Magma;
702   return 0;
703 }
704 
705 static int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim,
706                                    CeedInt ndof, CeedInt nqpts,
707                                    const CeedScalar *interp,
708                                    const CeedScalar *grad,
709                                    const CeedScalar *qref,
710                                    const CeedScalar *qweight,
711                                    CeedBasis basis) {
712   return CeedError(basis->ceed, 1, "Backend does not implement non-tensor bases");
713 }
714 
715 static int CeedQFunctionApply_Magma(CeedQFunction qf, CeedInt Q,
716                                     CeedVector *U, CeedVector *V) {
717   int ierr;
718   CeedQFunction_Ref *impl;
719   ierr = CeedQFunctionGetData(qf, (void *)&impl); CeedChk(ierr);
720 
721   void *ctx;
722   ierr = CeedQFunctionGetContext(qf, &ctx); CeedChk(ierr);
723 
724   int (*f)() = NULL;
725   ierr = CeedQFunctionGetUserFunction(qf, (int (* *)())&f); CeedChk(ierr);
726 
727   CeedInt nIn, nOut;
728   ierr = CeedQFunctionGetNumArgs(qf, &nIn, &nOut); CeedChk(ierr);
729 
730   for (int i = 0; i<nIn; i++) {
731     if (U[i]) {
732       ierr = CeedVectorGetArrayRead(U[i], CEED_MEM_HOST, &impl->inputs[i]);
733       CeedChk(ierr);
734     }
735   }
736   for (int i = 0; i<nOut; i++) {
737     if (U[i]) {
738       ierr = CeedVectorGetArray(V[i], CEED_MEM_HOST, &impl->outputs[i]);
739       CeedChk(ierr);
740     }
741   }
742 
743   ierr = f(ctx, Q, impl->inputs, impl->outputs); CeedChk(ierr);
744 
745   for (int i = 0; i<nIn; i++) {
746     if (U[i]) {
747       ierr = CeedVectorRestoreArrayRead(U[i], &impl->inputs[i]); CeedChk(ierr);
748     }
749   }
750   for (int i = 0; i<nOut; i++) {
751     if (U[i]) {
752       ierr = CeedVectorRestoreArray(V[i], &impl->outputs[i]); CeedChk(ierr);
753     }
754   }
755   return 0;
756 }
757 
758 static int CeedQFunctionDestroy_Magma(CeedQFunction qf) {
759   int ierr;
760   CeedQFunction_Magma *impl;
761   ierr = CeedQFunctionGetData(qf, (void *)&impl); CeedChk(ierr);
762 
763   ierr = CeedFree(&impl->inputs); CeedChk(ierr);
764   ierr = CeedFree(&impl->outputs); CeedChk(ierr);
765   ierr = CeedFree(&impl); CeedChk(ierr);
766 
767   return 0;
768 }
769 
770 static int CeedQFunctionCreate_Magma(CeedQFunction qf) {
771   int ierr;
772   Ceed ceed;
773   ierr = CeedQFunctionGetCeed(qf, &ceed); CeedChk(ierr);
774 
775   CeedQFunction_Magma *impl;
776   ierr = CeedCalloc(1, &impl); CeedChk(ierr);
777   ierr = CeedCalloc(16, &impl->inputs); CeedChk(ierr);
778   ierr = CeedCalloc(16, &impl->outputs); CeedChk(ierr);
779   ierr = CeedQFunctionSetData(qf, (void *)&impl); CeedChk(ierr);
780 
781   qf->Apply = CeedQFunctionApply_Magma;
782   qf->Destroy = CeedQFunctionDestroy_Magma;
783   return 0;
784 }
785 
786 static int CeedOperatorDestroy_Magma(CeedOperator op) {
787   int ierr;
788   CeedOperator_Magma *impl;
789   ierr = CeedOperatorGetData(op, (void *)&impl); CeedChk(ierr);
790 
791   for (CeedInt i=0; i<impl->numein+impl->numeout; i++) {
792     if (impl->Evecs[i]) {
793       ierr = CeedVectorDestroy(&impl->Evecs[i]); CeedChk(ierr);
794     }
795   }
796   ierr = CeedFree(&impl->Evecs); CeedChk(ierr);
797   ierr = CeedFree(&impl->Edata); CeedChk(ierr);
798 
799   for (CeedInt i=0; i<impl->numein; i++) {
800     ierr = CeedVectorDestroy(&impl->evecsin[i]); CeedChk(ierr);
801     ierr = CeedVectorDestroy(&impl->qvecsin[i]); CeedChk(ierr);
802   }
803   ierr = CeedFree(&impl->evecsin); CeedChk(ierr);
804   ierr = CeedFree(&impl->qvecsin); CeedChk(ierr);
805 
806   for (CeedInt i=0; i<impl->numeout; i++) {
807     ierr = CeedVectorDestroy(&impl->evecsout[i]); CeedChk(ierr);
808     ierr = CeedVectorDestroy(&impl->qvecsout[i]); CeedChk(ierr);
809   }
810   ierr = CeedFree(&impl->evecsout); CeedChk(ierr);
811   ierr = CeedFree(&impl->qvecsout); CeedChk(ierr);
812 
813 
814   ierr = CeedFree(&impl); CeedChk(ierr);
815   return 0;
816 }
817 
818 
819 /*
820   Setup infields or outfields
821  */
822 static int CeedOperatorSetupFields_Magma(CeedQFunction qf, CeedOperator op,
823     bool inOrOut,
824     CeedVector *fullevecs, CeedVector *evecs,
825     CeedVector *qvecs, CeedInt starte,
826     CeedInt numfields, CeedInt Q) {
827   CeedInt dim = 1, ierr, ncomp;
828   Ceed ceed;
829   ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
830   CeedQFunction_Magma *qf_data;
831   ierr = CeedQFunctionGetData(qf, (void *)&qf_data); CeedChk(ierr);
832   CeedBasis basis;
833   CeedElemRestriction Erestrict;
834   CeedOperatorField *opfields;
835   CeedQFunctionField *qffields;
836   if (inOrOut) {
837     ierr = CeedOperatorGetFields(op, NULL, &opfields);
838     CeedChk(ierr);
839     ierr = CeedQFunctionGetFields(qf, NULL, &qffields);
840     CeedChk(ierr);
841   } else {
842     ierr = CeedOperatorGetFields(op, &opfields, NULL);
843     CeedChk(ierr);
844     ierr = CeedQFunctionGetFields(qf, &qffields, NULL);
845     CeedChk(ierr);
846   }
847 
848   // Loop over fields
849   for (CeedInt i=0; i<numfields; i++) {
850     CeedEvalMode emode;
851     ierr = CeedQFunctionFieldGetEvalMode(qffields[i], &emode); CeedChk(ierr);
852     if (emode != CEED_EVAL_WEIGHT) {
853       ierr = CeedOperatorFieldGetElemRestriction(opfields[i], &Erestrict);
854       CeedChk(ierr);
855       ierr = CeedElemRestrictionCreateVector(Erestrict, NULL, &fullevecs[i+starte]);
856       CeedChk(ierr);
857     } else {
858     }
859     switch(emode) {
860     case CEED_EVAL_NONE:
861       ierr = CeedQFunctionFieldGetNumComponents(qffields[i], &ncomp);
862       CeedChk(ierr);
863       ierr = CeedVectorCreate(ceed, Q*ncomp, &evecs[i]); CeedChk(ierr);
864       ierr = CeedVectorCreate(ceed, Q*ncomp, &qvecs[i]); CeedChk(ierr);
865       break; // No action
866     case CEED_EVAL_INTERP:
867       ierr = CeedOperatorFieldGetBasis(opfields[i], &basis); CeedChk(ierr);
868       ierr = CeedQFunctionFieldGetNumComponents(qffields[i], &ncomp);
869       CeedChk(ierr);
870       ierr = CeedVectorCreate(ceed, Q*ncomp, &evecs[i]); CeedChk(ierr);
871       ierr = CeedVectorCreate(ceed, Q*ncomp, &qvecs[i]); CeedChk(ierr);
872       break;
873     case CEED_EVAL_GRAD:
874       ierr = CeedOperatorFieldGetBasis(opfields[i], &basis); CeedChk(ierr);
875       ierr = CeedQFunctionFieldGetNumComponents(qffields[i], &ncomp);
876       ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr);
877       ierr = CeedQFunctionFieldGetNumComponents(qffields[i], &ncomp);
878       CeedChk(ierr);
879       ierr = CeedVectorCreate(ceed, Q*ncomp, &evecs[i]); CeedChk(ierr);
880       ierr = CeedVectorCreate(ceed, Q*ncomp*dim, &qvecs[i]); CeedChk(ierr);
881       break;
882     case CEED_EVAL_WEIGHT: // Only on input fields
883       ierr = CeedOperatorFieldGetBasis(opfields[i], &basis); CeedChk(ierr);
884       ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr);
885       ierr = CeedVectorCreate(ceed, Q, &qvecs[i]); CeedChk(ierr);
886       ierr = CeedBasisApply(basis, 1, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT,
887                             NULL, qvecs[i]); CeedChk(ierr);
888       assert(starte==0);
889       break;
890     case CEED_EVAL_DIV: break; // Not implemented
891     case CEED_EVAL_CURL: break; // Not implemented
892     }
893   }
894   return 0;
895 }
896 
897 /*
898   CeedOperator needs to connect all the named fields (be they active or passive)
899   to the named inputs and outputs of its CeedQFunction.
900  */
901 static int CeedOperatorSetup_Magma(CeedOperator op) {
902   int ierr;
903   bool setupdone;
904   ierr = CeedOperatorGetSetupStatus(op, &setupdone); CeedChk(ierr);
905   if (setupdone) return 0;
906   Ceed ceed;
907   ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
908   CeedOperator_Magma *data;
909   ierr = CeedOperatorGetData(op, (void *)&data); CeedChk(ierr);
910   CeedQFunction qf;
911   ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr);
912   CeedInt Q, numinputfields, numoutputfields;
913   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChk(ierr);
914   ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields);
915   CeedOperatorField *opinputfields, *opoutputfields;
916   ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields);
917   CeedChk(ierr);
918   CeedQFunctionField *qfinputfields, *qfoutputfields;
919   ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields);
920   CeedChk(ierr);
921 
922   data->numein = numinputfields;
923   data->numeout = numoutputfields;
924 
925   // Allocate
926   const CeedInt numIO = numinputfields + numoutputfields;
927 
928   ierr = CeedCalloc(numinputfields + numoutputfields, &data->Evecs);
929   CeedChk(ierr);
930   ierr = CeedCalloc(numinputfields + numoutputfields, &data->Edata);
931   CeedChk(ierr);
932 
933   ierr = CeedCalloc(16, &data->evecsin); CeedChk(ierr);
934   ierr = CeedCalloc(16, &data->evecsout); CeedChk(ierr);
935   ierr = CeedCalloc(16, &data->qvecsin); CeedChk(ierr);
936   ierr = CeedCalloc(16, &data->qvecsout); CeedChk(ierr);
937 
938   // Set up infield and outfield pointer arrays
939   // Infields
940   ierr = CeedOperatorSetupFields_Magma(qf, op, 0, data->Evecs,
941                                        data->evecsin, data->qvecsin, 0,
942                                        numinputfields, Q);
943   CeedChk(ierr);
944   // Outfields
945   ierr = CeedOperatorSetupFields_Magma(qf, op, 1, data->Evecs,
946                                        data->evecsout, data->qvecsout,
947                                        numinputfields, numoutputfields, Q);
948   CeedChk(ierr);
949   ierr = CeedOperatorSetSetupDone(op); CeedChk(ierr);
950   return 0;
951 }
952 
953 static int CeedOperatorApply_Magma(CeedOperator op, CeedVector invec,
954                                    CeedVector outvec, CeedRequest *request) {
955   int ierr;
956   Ceed ceed;
957   ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
958   CeedOperator_Magma *data;
959   ierr = CeedOperatorGetData(op, (void *)&data); CeedChk(ierr);
960   //CeedVector *E = data->Evecs, *D = data->D, outvec;
961   CeedInt Q, elemsize, numelements, numinputfields, numoutputfields, ncomp;
962   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChk(ierr);
963   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChk(ierr);
964   CeedQFunction qf;
965   ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr);
966   CeedTransposeMode lmode;
967   CeedOperatorField *opinputfields, *opoutputfields;
968   ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields);
969   CeedChk(ierr);
970   CeedQFunctionField *qfinputfields, *qfoutputfields;
971   ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields);
972   CeedChk(ierr);
973   CeedEvalMode emode;
974   CeedVector vec;
975   CeedBasis basis;
976   CeedElemRestriction Erestrict;
977 
978   ierr = CeedOperatorSetup_Magma(op); CeedChk(ierr);
979   ierr= CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields);
980   CeedChk(ierr);
981 
982   // Input Evecs and Restriction
983   for (CeedInt i=0; i<numinputfields; i++) {
984     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
985     CeedChk(ierr);
986     if (emode & CEED_EVAL_WEIGHT) {
987     } else { // Restriction
988       // Get input vector
989       ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChk(ierr);
990       if (vec == CEED_VECTOR_ACTIVE)
991         vec = invec;
992       // Restrict
993       ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
994       CeedChk(ierr);
995       ierr = CeedOperatorFieldGetLMode(opinputfields[i], &lmode); CeedChk(ierr);
996       ierr = CeedElemRestrictionApply(Erestrict, CEED_NOTRANSPOSE,
997                                       lmode, vec, data->Evecs[i],
998                                       request); CeedChk(ierr);
999       // Get evec
1000       ierr = CeedVectorGetArrayRead(data->Evecs[i], CEED_MEM_HOST,
1001                                     (const CeedScalar **) &data->Edata[i]);
1002       CeedChk(ierr);
1003     }
1004   }
1005 
1006   // Output Evecs
1007   for (CeedInt i=0; i<numoutputfields; i++) {
1008     ierr = CeedVectorGetArray(data->Evecs[i+data->numein], CEED_MEM_HOST,
1009                               &data->Edata[i + numinputfields]); CeedChk(ierr);
1010   }
1011 
1012   // Loop through elements
1013   for (CeedInt e=0; e<numelements; e++) {
1014     // Input basis apply if needed
1015     for (CeedInt i=0; i<numinputfields; i++) {
1016       // Get elemsize, emode, ncomp
1017       ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
1018       CeedChk(ierr);
1019       ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1020       CeedChk(ierr);
1021       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1022       CeedChk(ierr);
1023       ierr = CeedQFunctionFieldGetNumComponents(qfinputfields[i], &ncomp);
1024       CeedChk(ierr);
1025       // Basis action
1026       switch(emode) {
1027       case CEED_EVAL_NONE:
1028         ierr = CeedVectorSetArray(data->qvecsin[i], CEED_MEM_HOST,
1029                                   CEED_USE_POINTER,
1030                                   &data->Edata[i][e*Q*ncomp]); CeedChk(ierr);
1031         break;
1032       case CEED_EVAL_INTERP:
1033         ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
1034         ierr = CeedVectorSetArray(data->evecsin[i], CEED_MEM_HOST,
1035                                   CEED_USE_POINTER,
1036                                   &data->Edata[i][e*elemsize*ncomp]);
1037         CeedChk(ierr);
1038         ierr = CeedBasisApply(basis, 1, CEED_NOTRANSPOSE,
1039                               CEED_EVAL_INTERP, data->evecsin[i],
1040                               data->qvecsin[i]); CeedChk(ierr);
1041         break;
1042       case CEED_EVAL_GRAD:
1043         ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
1044         ierr = CeedVectorSetArray(data->evecsin[i], CEED_MEM_HOST,
1045                                   CEED_USE_POINTER,
1046                                   &data->Edata[i][e*elemsize*ncomp]);
1047         CeedChk(ierr);
1048         ierr = CeedBasisApply(basis, 1, CEED_NOTRANSPOSE,
1049                               CEED_EVAL_GRAD, data->evecsin[i],
1050                               data->qvecsin[i]); CeedChk(ierr);
1051         break;
1052       case CEED_EVAL_WEIGHT:
1053         break;  // No action
1054       case CEED_EVAL_DIV:
1055         break; // Not implemented
1056       case CEED_EVAL_CURL:
1057         break; // Not implemented
1058       }
1059     }
1060     // Output pointers
1061     for (CeedInt i=0; i<numoutputfields; i++) {
1062       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1063       CeedChk(ierr);
1064       if (emode == CEED_EVAL_NONE) {
1065         ierr = CeedQFunctionFieldGetNumComponents(qfoutputfields[i], &ncomp);
1066         CeedChk(ierr);
1067         ierr = CeedVectorSetArray(data->qvecsout[i], CEED_MEM_HOST,
1068                                   CEED_USE_POINTER,
1069                                   &data->Edata[i + numinputfields][e*Q*ncomp]);
1070         CeedChk(ierr);
1071       }
1072       if (emode == CEED_EVAL_INTERP) {
1073       }
1074       if (emode == CEED_EVAL_GRAD) {
1075       }
1076       if (emode == CEED_EVAL_WEIGHT) {
1077       }
1078     }
1079 
1080     // Q function
1081     ierr = CeedQFunctionApply(qf, Q, data->qvecsin, data->qvecsout); CeedChk(ierr);
1082 
1083     // Output basis apply if needed
1084     for (CeedInt i=0; i<numoutputfields; i++) {
1085       // Get elemsize, emode, ncomp
1086       ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
1087       CeedChk(ierr);
1088       ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1089       CeedChk(ierr);
1090       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1091       CeedChk(ierr);
1092       ierr = CeedQFunctionFieldGetNumComponents(qfoutputfields[i], &ncomp);
1093       CeedChk(ierr);
1094       // Basis action
1095       switch(emode) {
1096       case CEED_EVAL_NONE:
1097         break; // No action
1098       case CEED_EVAL_INTERP:
1099         ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis);
1100         CeedChk(ierr);
1101         ierr = CeedVectorSetArray(data->evecsout[i], CEED_MEM_HOST,
1102                                   CEED_USE_POINTER,
1103                                   &data->Edata[i + numinputfields][e*elemsize*ncomp]);
1104         ierr = CeedBasisApply(basis, 1, CEED_TRANSPOSE,
1105                               CEED_EVAL_INTERP, data->qvecsout[i],
1106                               data->evecsout[i]); CeedChk(ierr);
1107         break;
1108       case CEED_EVAL_GRAD:
1109         ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis);
1110         CeedChk(ierr);
1111         ierr = CeedVectorSetArray(data->evecsout[i], CEED_MEM_HOST,
1112                                   CEED_USE_POINTER,
1113                                   &data->Edata[i + numinputfields][e*elemsize*ncomp]);
1114         ierr = CeedBasisApply(basis, 1, CEED_TRANSPOSE,
1115                               CEED_EVAL_GRAD, data->qvecsout[i],
1116                               data->evecsout[i]); CeedChk(ierr);
1117         break;
1118       case CEED_EVAL_WEIGHT: break; // Should not occur
1119       case CEED_EVAL_DIV: break; // Not implemented
1120       case CEED_EVAL_CURL: break; // Not implemented
1121       }
1122     }
1123   } // numelements
1124 
1125   // Zero lvecs
1126   for (CeedInt i=0; i<numoutputfields; i++) {
1127     ierr = CeedOperatorFieldGetVector(opoutputfields[i], &vec); CeedChk(ierr);
1128     if (vec == CEED_VECTOR_ACTIVE)
1129       vec = outvec;
1130     ierr = CeedVectorSetValue(vec, 0.0); CeedChk(ierr);
1131   }
1132 
1133   // Output restriction
1134   for (CeedInt i=0; i<numoutputfields; i++) {
1135     // Restore evec
1136     ierr = CeedVectorRestoreArray(data->Evecs[i+data->numein],
1137                                   &data->Edata[i + numinputfields]);
1138     CeedChk(ierr);
1139     // Get output vector
1140     ierr = CeedOperatorFieldGetVector(opoutputfields[i], &vec); CeedChk(ierr);
1141     // Active
1142     if (vec == CEED_VECTOR_ACTIVE)
1143       vec = outvec;
1144     // Restrict
1145     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
1146     CeedChk(ierr);
1147     ierr = CeedOperatorFieldGetLMode(opoutputfields[i], &lmode); CeedChk(ierr);
1148     ierr = CeedElemRestrictionApply(Erestrict, CEED_TRANSPOSE,
1149                                     lmode, data->Evecs[i+data->numein], vec,
1150                                     request); CeedChk(ierr);
1151   }
1152 
1153   // Restore input arrays
1154   for (CeedInt i=0; i<numinputfields; i++) {
1155     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1156     CeedChk(ierr);
1157     if (emode & CEED_EVAL_WEIGHT) {
1158     } else {
1159       // Restriction
1160       ierr = CeedVectorRestoreArrayRead(data->Evecs[i],
1161                                         (const CeedScalar **) &data->Edata[i]);
1162       CeedChk(ierr);
1163     }
1164   }
1165   return 0;
1166 }
1167 
1168 static int CeedOperatorCreate_Magma(CeedOperator op) {
1169   int ierr;
1170   CeedOperator_Magma *impl;
1171   Ceed ceed;
1172   ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
1173 
1174   ierr = CeedCalloc(1, &impl); CeedChk(ierr);
1175   ierr = CeedOperatorSetData(op, (void *)&impl);
1176 
1177   ierr = CeedSetBackendFunction(ceed, "Operator", op, "Apply",
1178                                 CeedOperatorApply_Magma); CeedChk(ierr);
1179   ierr = CeedSetBackendFunction(ceed, "Operator", op, "Destroy",
1180                                 CeedOperatorDestroy_Magma); CeedChk(ierr);
1181   return 0;
1182 }
1183 
1184 int CeedCompositeOperatorCreate_Magma(CeedOperator op) {
1185   int ierr;
1186   Ceed ceed;
1187   ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
1188   return CeedError(ceed, 1, "Backend does not support composite operators");
1189 }
1190 
1191 // *****************************************************************************
1192 // * INIT
1193 // *****************************************************************************
1194 static int CeedInit_Magma(const char *resource, Ceed ceed) {
1195   int ierr;
1196   if (strcmp(resource, "/gpu/magma"))
1197     return CeedError(ceed, 1, "MAGMA backend cannot use resource: %s", resource);
1198 
1199   ierr = magma_init();
1200   if (ierr) return CeedError(ceed, 1, "error in magma_init(): %d\n", ierr);
1201   //magma_print_environment();
1202 
1203   ceed->VectorCreate = CeedVectorCreate_Magma;
1204   ceed->BasisCreateTensorH1 = CeedBasisCreateTensorH1_Magma;
1205   ceed->BasisCreateH1 = CeedBasisCreateH1_Magma;
1206   ceed->ElemRestrictionCreate = CeedElemRestrictionCreate_Magma;
1207   ceed->ElemRestrictionCreateBlocked = CeedElemRestrictionCreateBlocked_Magma;
1208   ceed->QFunctionCreate = CeedQFunctionCreate_Magma;
1209   ceed->OperatorCreate = CeedOperatorCreate_Magma;
1210   ceed->CompositeOperatorCreate = CeedCompositeOperatorCreate_Magma;
1211   return 0;
1212 }
1213 
1214 // *****************************************************************************
1215 // * REGISTER
1216 // *****************************************************************************
1217 __attribute__((constructor))
1218 static void Register(void) {
1219   CeedRegister("/gpu/magma", CeedInit_Magma, 20);
1220 }
1221