xref: /libCEED/backends/magma/ceed-magma.c (revision c71e1dcdcfce633cdb19fa81aa6735b006eb809d)
1 // Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at
2 // the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights
3 // reserved. See files LICENSE and NOTICE for details.
4 //
5 // This file is part of CEED, a collection of benchmarks, miniapps, software
6 // libraries and APIs for efficient high-order finite element and spectral
7 // element discretizations for exascale applications. For more information and
8 // source code availability see http://github.com/ceed.
9 //
10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11 // a collaborative effort of two U.S. Department of Energy organizations (Office
12 // of Science and the National Nuclear Security Administration) responsible for
13 // the planning and preparation of a capable exascale ecosystem, including
14 // software, applications, hardware, advanced system engineering and early
15 // testbed platforms, in support of the nation's exascale computing imperative.
16 
17 #include "ceed-magma.h"
18 #include <string.h>
19 
// Backend data for a CeedVector: a host/device pair of buffers.
typedef struct {
  CeedScalar *array;    // host copy of the data (pinned when this struct owns it)
  CeedScalar *darray;   // device (GPU) copy of the data
  int  own_;            // 1 if both host and device buffers are owned here
  int down_;            // 1 if only the device buffer is owned (host pointer borrowed)
} CeedVector_Magma;
26 
// Backend data for a CeedElemRestriction: host/device copies of the index list.
typedef struct {
  CeedInt *indices;     // host copy of the element indices (pinned when owned)
  CeedInt *dindices;    // device (GPU) copy of the element indices
  int  own_;            // 1 if both host and device buffers are owned here
  int down_;            // cover a case where we own Device memory only
} CeedElemRestriction_Magma;
33 
// Backend data for a CeedQFunction: tables of per-field data pointers that are
// filled from the U/V vectors and handed to the user's Q-function callback.
typedef struct {
  const CeedScalar **inputs;   // read-only pointers to the input field data
  CeedScalar **outputs;        // writable pointers to the output field data
} CeedQFunction_Magma;
38 
// Backend data for a CeedOperator: work vectors used when applying it.
typedef struct {
  CeedVector *Evecs; /// E-vectors needed to apply operator (in followed by out)
  CeedScalar **Edata; /// NOTE(review): presumably host pointers into E-vector data — confirm in setup
  CeedVector *evecsin;   /// Input E-vectors needed to apply operator
  CeedVector *evecsout;   /// Output E-vectors needed to apply operator
  CeedVector *qvecsin;   /// Input Q-vectors needed to apply operator
  CeedVector *qvecsout;   /// Output Q-vectors needed to apply operator
  CeedInt    numein;     /// Number of input fields
  CeedInt    numeout;    /// Number of output fields
} CeedOperator_Magma;
49 
50 // *****************************************************************************
51 // * Initialize vector vec (after free mem) with values from array based on cmode
52 // *   CEED_COPY_VALUES: memory is allocated in vec->array_allocated, made equal
53 // *                     to array, and data is copied (not store passed pointer)
54 // *   CEED_OWN_POINTER: vec->data->array_allocated and vec->data->array = array
55 // *   CEED_USE_POINTER: vec->data->array = array (can modify; no ownership)
56 // * mtype: CEED_MEM_HOST or CEED_MEM_DEVICE
57 // *****************************************************************************
58 static int CeedVectorSetArray_Magma(CeedVector vec, CeedMemType mtype,
59                                     CeedCopyMode cmode, CeedScalar *array) {
60   CeedVector_Magma *impl = vec->data;
61   int ierr;
62 
63   // If own data, free the "old" data, e.g., as it may be of different size
64   if (impl->own_) {
65     magma_free( impl->darray );
66     magma_free_pinned( impl->array );
67     impl->darray = NULL;
68     impl->array  = NULL;
69     impl->own_ = 0;
70     impl->down_= 0;
71   }
72 
73   if (mtype == CEED_MEM_HOST) {
74     // memory is on the host; own_ = 0
75     switch (cmode) {
76     case CEED_COPY_VALUES:
77       ierr = magma_malloc( (void **)&impl->darray,
78                            vec->length * sizeof(CeedScalar)); CeedChk(ierr);
79       ierr = magma_malloc_pinned( (void **)&impl->array,
80                                   vec->length * sizeof(CeedScalar)); CeedChk(ierr);
81       impl->own_ = 1;
82 
83       if (array != NULL)
84         magma_setvector(vec->length, sizeof(array[0]),
85                         array, 1, impl->darray, 1);
86       break;
87     case CEED_OWN_POINTER:
88       ierr = magma_malloc( (void **)&impl->darray,
89                            vec->length * sizeof(CeedScalar)); CeedChk(ierr);
90       // TODO: possible problem here is if we are passed non-pinned memory;
91       //       (as we own it, lter in destroy, we use free for pinned memory).
92       impl->array = array;
93       impl->own_ = 1;
94 
95       if (array != NULL)
96         magma_setvector(vec->length, sizeof(array[0]),
97                         array, 1, impl->darray, 1);
98       break;
99     case CEED_USE_POINTER:
100       ierr = magma_malloc( (void **)&impl->darray,
101                            vec->length * sizeof(CeedScalar)); CeedChk(ierr);
102       magma_setvector(vec->length, sizeof(array[0]),
103                       array, 1, impl->darray, 1);
104 
105       impl->down_  = 1;
106       impl->array  = array;
107     }
108   } else if (mtype == CEED_MEM_DEVICE) {
109     // memory is on the device; own = 0
110     switch (cmode) {
111     case CEED_COPY_VALUES:
112       ierr = magma_malloc( (void **)&impl->darray,
113                            vec->length * sizeof(CeedScalar)); CeedChk(ierr);
114       ierr = magma_malloc_pinned( (void **)&impl->array,
115                                   vec->length * sizeof(CeedScalar)); CeedChk(ierr);
116       impl->own_ = 1;
117 
118       if (array)
119         magma_copyvector(vec->length, sizeof(array[0]),
120                          array, 1, impl->darray, 1);
121       else
122         // t30 assumes allocation initializes with 0s
123         magma_setvector(vec->length, sizeof(array[0]),
124                         impl->array, 1, impl->darray, 1);
125       break;
126     case CEED_OWN_POINTER:
127       impl->darray = array;
128       ierr = magma_malloc_pinned( (void **)&impl->array,
129                                   vec->length * sizeof(CeedScalar)); CeedChk(ierr);
130       impl->own_ = 1;
131 
132       break;
133     case CEED_USE_POINTER:
134       impl->darray = array;
135       impl->array  = NULL;
136     }
137 
138   } else
139     return CeedError(vec->ceed, 1, "Only MemType = HOST or DEVICE supported");
140 
141   return 0;
142 }
143 
144 // *****************************************************************************
145 // * Give data pointer from vector vec to array (on HOST or DEVICE)
146 // *****************************************************************************
147 static int CeedVectorGetArray_Magma(CeedVector vec, CeedMemType mtype,
148                                     CeedScalar **array) {
149   CeedVector_Magma *impl = vec->data;
150   int ierr;
151 
152   if (mtype == CEED_MEM_HOST) {
153     if (impl->own_) {
154       // data is owned so GPU had the most up-to-date version; copy it
155       // TTT - apparantly it doesn't have most up to date data
156       magma_getvector(vec->length, sizeof(*array[0]),
157                       impl->darray, 1, impl->array, 1);
158       CeedDebug("\033[31m[CeedVectorGetArray_Magma]");
159       //fprintf(stderr,"rrrrrrrrrrrrrrr\n");
160     } else if (impl->array == NULL) {
161       // Vector doesn't own the data and was set on GPU
162       if (impl->darray == NULL) {
163         // call was made just to allocate memory
164         ierr = CeedVectorSetArray(vec, mtype, CEED_COPY_VALUES, NULL);
165         CeedChk(ierr);
166       } else
167         return CeedError(vec->ceed, 1, "Can not access DEVICE vector on HOST");
168     }
169     *array = impl->array;
170   } else if (mtype == CEED_MEM_DEVICE) {
171     if (impl->darray == NULL) {
172       // Vector doesn't own the data and was set on the CPU
173       if (impl->array == NULL) {
174         // call was made just to allocate memory
175         ierr = CeedVectorSetArray(vec, mtype, CEED_COPY_VALUES, NULL);
176         CeedChk(ierr);
177       } else
178         return CeedError(vec->ceed, 1, "Can not access HOST vector on DEVICE");
179     }
180     *array = impl->darray;
181   } else
182     return CeedError(vec->ceed, 1, "Can only provide to HOST or DEVICE memory");
183 
184   return 0;
185 }
186 
187 // *****************************************************************************
188 // * Give data pointer from vector vec to array (on HOST or DEVICE) to read it
189 // *****************************************************************************
190 static int CeedVectorGetArrayRead_Magma(CeedVector vec, CeedMemType mtype,
191                                         const CeedScalar **array) {
192   CeedVector_Magma *impl = vec->data;
193   int ierr;
194 
195   if (mtype == CEED_MEM_HOST) {
196     if (impl->own_) {
197       // data is owned so GPU had the most up-to-date version; copy it
198       magma_getvector(vec->length, sizeof(*array[0]),
199                       impl->darray, 1, impl->array, 1);
200     } else if (impl->array == NULL) {
201       // Vector doesn't own the data and was set on GPU
202       if (impl->darray == NULL) {
203         // call was made just to allocate memory
204         ierr = CeedVectorSetArray(vec, mtype, CEED_COPY_VALUES, NULL);
205         CeedChk(ierr);
206       } else
207         return CeedError(vec->ceed, 1, "Can not access DEVICE vector on HOST");
208     }
209     *array = impl->array;
210   } else if (mtype == CEED_MEM_DEVICE) {
211     if (impl->darray == NULL) {
212       // Vector doesn't own the data and was set on the CPU
213       if (impl->array == NULL) {
214         // call was made just to allocate memory
215         ierr = CeedVectorSetArray(vec, mtype, CEED_COPY_VALUES, NULL);
216         CeedChk(ierr);
217       } else
218         return CeedError(vec->ceed, 1, "Can not access HOST vector on DEVICE");
219     }
220     *array = impl->darray;
221   } else
222     return CeedError(vec->ceed, 1, "Can only provide to HOST or DEVICE memory");
223 
224   return 0;
225 }
226 
227 // *****************************************************************************
228 // * There is no mtype here for array so it is not clear if we restore from HOST
229 // * memory or from DEVICE memory. We assume that it is CPU memory because if
230 // * it was GPU memory we would not call this routine at all.
231 // * Restore vector vec with values from array, where array received its values
232 // * from vec and possibly modified them.
233 // *****************************************************************************
234 static int CeedVectorRestoreArray_Magma(CeedVector vec) {
235   CeedVector_Magma *impl = vec->data;
236 
237   if (impl->down_) {
238     // nothing to do if array is on GPU, except if down_=1(case CPU use pointer)
239     magma_getvector(vec->length, sizeof(*array[0]),
240                     impl->darray, 1, impl->array, 1);
241   }
242   return 0;
243 }
244 
245 // *****************************************************************************
246 // * There is no mtype here for array so it is not clear if we restore from HOST
247 // * memory or from DEVICE memory. We assume that it is CPU memory because if
248 // * it was GPU memory we would not call this routine at all.
249 // * Restore vector vec with values from array, where array received its values
250 // * from vec to only read them; in this case vec may have been modified meanwhile
251 // * and needs to be restored here.
252 // *****************************************************************************
253 static int CeedVectorRestoreArrayRead_Magma(CeedVector vec) {
254   CeedVector_Magma *impl = vec->data;
255 
256   if (impl->down_) {
257     // nothing to do if array is on GPU, except if down_=1(case CPU use pointer)
258     magma_getvector(vec->length, sizeof(*array[0]),
259                     impl->darray, 1, impl->array, 1);
260   }
261   return 0;
262 }
263 
264 static int CeedVectorDestroy_Magma(CeedVector vec) {
265   CeedVector_Magma *impl = vec->data;
266   int ierr;
267 
268   // Free if we own the data
269   if (impl->own_) {
270     ierr = magma_free_pinned(impl->array); CeedChk(ierr);
271     ierr = magma_free(impl->darray);       CeedChk(ierr);
272   } else if (impl->down_) {
273     ierr = magma_free(impl->darray);       CeedChk(ierr);
274   }
275   ierr = CeedFree(&vec->data); CeedChk(ierr);
276   return 0;
277 }
278 
279 // *****************************************************************************
280 // * Create vector vec of size n
281 // *****************************************************************************
282 static int CeedVectorCreate_Magma(CeedInt n, CeedVector vec) {
283   CeedVector_Magma *impl;
284   int ierr;
285 
286   vec->SetArray = CeedVectorSetArray_Magma;
287   vec->GetArray = CeedVectorGetArray_Magma;
288   vec->GetArrayRead = CeedVectorGetArrayRead_Magma;
289   vec->RestoreArray = CeedVectorRestoreArray_Magma;
290   vec->RestoreArrayRead = CeedVectorRestoreArrayRead_Magma;
291   vec->Destroy = CeedVectorDestroy_Magma;
292   ierr = CeedCalloc(1,&impl); CeedChk(ierr);
293   impl->darray = NULL;
294   impl->array  = NULL;
295   impl->own_ = 0;
296   impl->down_= 0;
297   vec->data = impl;
298   return 0;
299 }
300 
301 
// *****************************************************************************
// * Apply restriction operator r to u: v = r(rmode) u
// * When USE_MAGMA_BATCH2 is defined the work runs on the device through the
// * magma_template<<...>> constructs (expanded by MAGMA's preprocessor);
// * otherwise plain CPU loops over the host arrays are used.
// * tmode: NOTRANSPOSE gathers dof values into element vectors (v = r u);
// *        TRANSPOSE scatters/accumulates element values back (v += r^T u).
// * lmode: selects the component layout of the L-vector side.
// *****************************************************************************
static int CeedElemRestrictionApply_Magma(CeedElemRestriction r,
    CeedTransposeMode tmode,
    CeedTransposeMode lmode, CeedVector u,
    CeedVector v, CeedRequest *request) {
  CeedElemRestriction_Magma *impl = r->data;
  int ierr;
  const CeedScalar *uu;
  CeedScalar *vv;
  CeedInt nelem = r->nelem, elemsize = r->elemsize, ndof = r->ndof,
          ncomp=r->ncomp;
  CeedInt esize = nelem * elemsize;

  #ifdef USE_MAGMA_BATCH2
  CeedInt *dindices = impl->dindices;
  // Get pointers on the device
  ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &uu); CeedChk(ierr);
  ierr = CeedVectorGetArray(v, CEED_MEM_DEVICE, &vv); CeedChk(ierr);
  #else
  // Get pointers on the host
  CeedInt *indices = impl->indices;
  ierr = CeedVectorGetArrayRead(u, CEED_MEM_HOST, &uu); CeedChk(ierr);
  ierr = CeedVectorGetArray(v, CEED_MEM_HOST, &vv); CeedChk(ierr);
  #endif

  if (tmode == CEED_NOTRANSPOSE) {
    // Perform: v = r * u
    if (!impl->indices) {
      // Identity restriction: straight copy
      for (CeedInt i=0; i<esize*ncomp; i++) vv[i] = uu[i];
    } else if (ncomp == 1) {
      // Single component: simple gather through the index list
      #ifdef USE_MAGMA_BATCH2
magma_template<<i=0:esize>>
      (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices) {
        vv[i] = uu[dindices[i]];
      }
      #else
      for (CeedInt i=0; i<esize; i++) vv[i] = uu[indices[i]];
      #endif
    } else {
      // vv is (elemsize x ncomp x nelem), column-major
      if (lmode == CEED_NOTRANSPOSE) { // u is (ndof x ncomp), column-major
        #ifdef USE_MAGMA_BATCH2
magma_template<<e=0:nelem, d=0:ncomp, i=0:elemsize>>
        (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices, int ndof) {
          vv[i + iend*(d+dend*e)] = uu[dindices[i+iend*e]+ndof*d];
        }
        #else
        for (CeedInt e = 0; e < nelem; e++)
          for (CeedInt d = 0; d < ncomp; d++)
            for (CeedInt i=0; i < elemsize; i++) {
              vv[i + elemsize*(d+ncomp*e)] =
                uu[indices[i+elemsize*e]+ndof*d];
            }
        #endif
      } else { // u is (ncomp x ndof), column-major
        #ifdef USE_MAGMA_BATCH2
magma_template<<e=0:nelem, d=0:ncomp, i=0:elemsize>>
        (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices) {
          vv[i + iend*(d+dend*e)] = uu[d+dend*dindices[i + iend*e]];
        }
        #else
        for (CeedInt e = 0; e < nelem; e++)
          for (CeedInt d = 0; d < ncomp; d++)
            for (CeedInt i=0; i< elemsize; i++) {
              vv[i + elemsize*(d+ncomp*e)] =
                uu[d+ncomp*indices[i+elemsize*e]];
            }
        #endif
      }
    }
  } else {
    // Note: in transpose mode, we perform: v += r^t * u
    if (!impl->indices) {
      for (CeedInt i=0; i<esize; i++) vv[i] += uu[i];
    } else if (ncomp == 1) {
      // Single component: scatter-add; the device path uses atomics because
      // several elements may contribute to the same dof
      #ifdef USE_MAGMA_BATCH2
magma_template<<i=0:esize>>
      (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices) {
        magmablas_datomic_add( &vv[dindices[i]], uu[i]);
      }
      #else
      for (CeedInt i=0; i<esize; i++) vv[indices[i]] += uu[i];
      #endif
    } else { // u is (elemsize x ncomp x nelem)
      // NOTE(review): leftover debug output — consider removing
      fprintf(stderr,"2 ---------\n");

      if (lmode == CEED_NOTRANSPOSE) { // vv is (ndof x ncomp), column-major
        #ifdef USE_MAGMA_BATCH2
magma_template<<e=0:nelem, d=0:ncomp, i=0:elemsize>>
        (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices, CeedInt ndof) {
          magmablas_datomic_add( &vv[dindices[i+iend*e]+ndof*d],
                                 uu[i+iend*(d+e*dend)]);
        }
        #else
        for (CeedInt e = 0; e < nelem; e++)
          for (CeedInt d = 0; d < ncomp; d++)
            for (CeedInt i=0; i < elemsize; i++) {
              vv[indices[i + elemsize*e]+ndof*d] +=
                uu[i + elemsize*(d+e*ncomp)];
            }
        #endif
      } else { // vv is (ncomp x ndof), column-major
        #ifdef USE_MAGMA_BATCH2
magma_template<<e=0:nelem, d=0:ncomp, i=0:elemsize>>
        (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices) {
          magmablas_datomic_add( &vv[d+dend*dindices[i + iend*e]],
                                 uu[i+iend*(d+e*dend)]);
        }
        #else
        for (CeedInt e = 0; e < nelem; e++)
          for (CeedInt d = 0; d < ncomp; d++)
            for (CeedInt i=0; i < elemsize; i++) {
              vv[d+ncomp*indices[i + elemsize*e]] +=
                uu[i + elemsize*(d+e*ncomp)];
            }
        #endif
      }
    }
  }

  // Hand the array pointers back to the vectors
  ierr = CeedVectorRestoreArrayRead(u, &uu); CeedChk(ierr);
  ierr = CeedVectorRestoreArray(v, &vv); CeedChk(ierr);

  if (request != CEED_REQUEST_IMMEDIATE && request != CEED_REQUEST_ORDERED)
    *request = NULL;
  return 0;
}
431 
432 static int CeedElemRestrictionDestroy_Magma(CeedElemRestriction r) {
433   CeedElemRestriction_Magma *impl = r->data;
434   int ierr;
435 
436   // Free if we own the data
437   if (impl->own_) {
438     ierr = magma_free_pinned(impl->indices); CeedChk(ierr);
439     ierr = magma_free(impl->dindices);       CeedChk(ierr);
440   } else if (impl->down_) {
441     ierr = magma_free(impl->dindices);       CeedChk(ierr);
442   }
443   ierr = CeedFree(&r->data); CeedChk(ierr);
444   return 0;
445 }
446 
447 static int CeedElemRestrictionCreate_Magma(CeedMemType mtype,
448     CeedCopyMode cmode,
449     const CeedInt *indices, CeedElemRestriction r) {
450   int ierr, size = r->nelem*r->elemsize;
451   CeedElemRestriction_Magma *impl;
452 
453   // Allocate memory for the MAGMA Restricton and initializa pointers to NULL
454   ierr = CeedCalloc(1,&impl); CeedChk(ierr);
455   impl->dindices = NULL;
456   impl->indices  = NULL;
457   impl->own_ = 0;
458   impl->down_= 0;
459 
460   if (mtype == CEED_MEM_HOST) {
461     // memory is on the host; own_ = 0
462     switch (cmode) {
463     case CEED_COPY_VALUES:
464       ierr = magma_malloc( (void **)&impl->dindices,
465                            size * sizeof(CeedInt)); CeedChk(ierr);
466       ierr = magma_malloc_pinned( (void **)&impl->indices,
467                                   size * sizeof(CeedInt)); CeedChk(ierr);
468       impl->own_ = 1;
469 
470       if (indices != NULL) {
471         memcpy(impl->indices, indices, size * sizeof(indices[0]));
472         magma_setvector(size, sizeof(CeedInt),
473                         impl->indices, 1, impl->dindices, 1);
474       }
475       break;
476     case CEED_OWN_POINTER:
477       ierr = magma_malloc( (void **)&impl->dindices,
478                            size * sizeof(CeedInt)); CeedChk(ierr);
479       // TODO: possible problem here is if we are passed non-pinned memory;
480       //       (as we own it, lter in destroy, we use free for pinned memory).
481       impl->indices = (CeedInt *)indices;
482       impl->own_ = 1;
483 
484       if (indices != NULL)
485         magma_setvector(size, sizeof(CeedInt),
486                         indices, 1, impl->dindices, 1);
487       break;
488     case CEED_USE_POINTER:
489       ierr = magma_malloc( (void **)&impl->dindices,
490                            size * sizeof(CeedInt)); CeedChk(ierr);
491       magma_setvector(size, sizeof(CeedInt),
492                       indices, 1, impl->dindices, 1);
493       impl->down_ = 1;
494       impl->indices  = (CeedInt *)indices;
495     }
496   } else if (mtype == CEED_MEM_DEVICE) {
497     // memory is on the device; own = 0
498     switch (cmode) {
499     case CEED_COPY_VALUES:
500       ierr = magma_malloc( (void **)&impl->dindices,
501                            size * sizeof(CeedInt)); CeedChk(ierr);
502       ierr = magma_malloc_pinned( (void **)&impl->indices,
503                                   size * sizeof(CeedInt)); CeedChk(ierr);
504       impl->own_ = 1;
505 
506       if (indices)
507         magma_copyvector(size, sizeof(CeedInt),
508                          indices, 1, impl->dindices, 1);
509       break;
510     case CEED_OWN_POINTER:
511       impl->dindices = (CeedInt *)indices;
512       ierr = magma_malloc_pinned( (void **)&impl->indices,
513                                   size * sizeof(CeedInt)); CeedChk(ierr);
514       impl->own_ = 1;
515 
516       break;
517     case CEED_USE_POINTER:
518       impl->dindices = (CeedInt *)indices;
519       impl->indices  = NULL;
520     }
521 
522   } else
523     return CeedError(r->ceed, 1, "Only MemType = HOST or DEVICE supported");
524 
525   r->data    = impl;
526   r->Apply   = CeedElemRestrictionApply_Magma;
527   r->Destroy = CeedElemRestrictionDestroy_Magma;
528 
529   return 0;
530 }
531 
// Blocked restrictions are not supported by this backend; always errors.
static int CeedElemRestrictionCreateBlocked_Magma(CeedMemType mtype,
    CeedCopyMode cmode,
    const CeedInt *indices, CeedElemRestriction r) {
  return CeedError(r->ceed, 1, "Backend does not implement blocked restrictions");
}
537 
538 // Contracts on the middle index
539 // NOTRANSPOSE: V_ajc = T_jb U_abc
540 // TRANSPOSE:   V_ajc = T_bj U_abc
541 // If Add != 0, "=" is replaced by "+="
542 static int CeedTensorContract_Magma(Ceed ceed,
543                                     CeedInt A, CeedInt B, CeedInt C, CeedInt J,
544                                     const CeedScalar *t, CeedTransposeMode tmode,
545                                     const CeedInt Add,
546                                     const CeedScalar *u, CeedScalar *v) {
547   #ifdef USE_MAGMA_BATCH
548   magma_dtensor_contract(ceed, A, B, C, J, t, tmode, Add, u, v);
549   #else
550   CeedInt tstride0 = B, tstride1 = 1;
551   if (tmode == CEED_TRANSPOSE) {
552     tstride0 = 1; tstride1 = J;
553   }
554   CeedDebug("\033[31m[CeedTensorContract] A=%d, J=%d, C=%d, B=%d: %d %d %d",
555             A,J,C,B,A*J*B*C, C*J*A, C*B*A);
556   for (CeedInt a=0; a<A; a++) {
557     for (CeedInt j=0; j<J; j++) {
558       if (!Add) {
559         for (CeedInt c=0; c<C; c++)
560           v[(a*J+j)*C+c] = 0;
561       }
562       for (CeedInt b=0; b<B; b++) {
563         for (CeedInt c=0; c<C; c++) {
564           v[(a*J+j)*C+c] += t[j*tstride0 + b*tstride1] * u[(a*B+b)*C+c];
565         }
566       }
567     }
568   }
569   #endif
570   return 0;
571 }
572 
573 static int CeedBasisApply_Magma(CeedBasis basis, CeedInt nelem,
574                                 CeedTransposeMode tmode, CeedEvalMode emode,
575                                 CeedVector U, CeedVector V) {
576   int ierr;
577   const CeedInt dim = basis->dim;
578   const CeedInt ncomp = basis->ncomp;
579   const CeedInt nqpt = ncomp*CeedPowInt(basis->Q1d, dim);
580   const CeedInt add = (tmode == CEED_TRANSPOSE);
581   const CeedScalar *u;
582   CeedScalar *v;
583   if (U) {
584     ierr = CeedVectorGetArrayRead(U, CEED_MEM_HOST, &u); CeedChk(ierr);
585   } else if (emode != CEED_EVAL_WEIGHT) {
586     return CeedError(ceed, 1,
587                      "An input vector is required for this CeedEvalMode");
588   }
589   ierr = CeedVectorGetArray(V, CEED_MEM_HOST, &v); CeedChk(ierr);
590 
591   if (nelem != 1)
592     return CeedError(basis->ceed, 1,
593                      "This backend does not support BasisApply for multiple elements");
594 
595   CeedDebug("\033[01m[CeedBasisApply_Magma] vsize=%d",
596             ncomp*CeedPowInt(basis->P1d, dim));
597 
598   if (tmode == CEED_TRANSPOSE) {
599     const CeedInt vsize = ncomp*CeedPowInt(basis->P1d, dim);
600     for (CeedInt i = 0; i < vsize; i++)
601       v[i] = (CeedScalar) 0;
602   }
603   if (emode & CEED_EVAL_INTERP) {
604     CeedInt P = basis->P1d, Q = basis->Q1d;
605     if (tmode == CEED_TRANSPOSE) {
606       P = basis->Q1d; Q = basis->P1d;
607     }
608     CeedInt pre = ncomp*CeedPowInt(P, dim-1), post = 1;
609     CeedScalar tmp[2][ncomp*Q*CeedPowInt(P>Q?P:Q, dim-1)];
610     CeedDebug("\033[01m[CeedBasisApply_Magma] tmpsize = %d",
611               ncomp*Q*CeedPowInt(P>Q?P:Q, dim-1));
612     for (CeedInt d=0; d<dim; d++) {
613       ierr = CeedTensorContract_Magma(basis->ceed, pre, P, post, Q, basis->interp1d,
614                                       tmode, add&&(d==dim-1),
615                                       d==0?u:tmp[d%2], d==dim-1?v:tmp[(d+1)%2]);
616       CeedChk(ierr);
617       pre /= P;
618       post *= Q;
619     }
620     if (tmode == CEED_NOTRANSPOSE) {
621       v += nqpt;
622     } else {
623       u += nqpt;
624     }
625   }
626   if (emode & CEED_EVAL_GRAD) {
627     CeedInt P = basis->P1d, Q = basis->Q1d;
628     // In CEED_NOTRANSPOSE mode:
629     // u is (P^dim x nc), column-major layout (nc = ncomp)
630     // v is (Q^dim x nc x dim), column-major layout (nc = ncomp)
631     // In CEED_TRANSPOSE mode, the sizes of u and v are switched.
632     if (tmode == CEED_TRANSPOSE) {
633       P = basis->Q1d, Q = basis->P1d;
634     }
635     CeedScalar tmp[2][ncomp*Q*CeedPowInt(P>Q?P:Q, dim-1)];
636     CeedDebug("\033[01m[CeedBasisApply_Magma] tmpsize = %d",
637               ncomp*Q*CeedPowInt(P>Q?P:Q, dim-1));
638     for (CeedInt p = 0; p < dim; p++) {
639       CeedInt pre = ncomp*CeedPowInt(P, dim-1), post = 1;
640       for (CeedInt d=0; d<dim; d++) {
641         ierr = CeedTensorContract_Magma(basis->ceed, pre, P, post, Q,
642                                         (p==d)?basis->grad1d:basis->interp1d,
643                                         tmode, add&&(d==dim-1),
644                                         d==0?u:tmp[d%2], d==dim-1?v:tmp[(d+1)%2]);
645         CeedChk(ierr);
646         pre /= P;
647         post *= Q;
648       }
649       if (tmode == CEED_NOTRANSPOSE) {
650         v += nqpt;
651       } else {
652         u += nqpt;
653       }
654     }
655   }
656   if (emode & CEED_EVAL_WEIGHT) {
657     if (tmode == CEED_TRANSPOSE)
658       return CeedError(basis->ceed, 1,
659                        "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE");
660     CeedInt Q = basis->Q1d;
661     for (CeedInt d=0; d<dim; d++) {
662       CeedInt pre = CeedPowInt(Q, dim-d-1), post = CeedPowInt(Q, d);
663       for (CeedInt i=0; i<pre; i++) {
664         for (CeedInt j=0; j<Q; j++) {
665           for (CeedInt k=0; k<post; k++) {
666             v[(i*Q + j)*post + k] = basis->qweight1d[j]
667                                     * (d == 0 ? 1 : v[(i*Q + j)*post + k]);
668           }
669         }
670       }
671     }
672   }
673   if (U) {
674     ierr = CeedVectorRestoreArrayRead(U, &u); CeedChk(ierr);
675   }
676   ierr = CeedVectorRestoreArray(V, &v); CeedChk(ierr);
677   return 0;
678 }
679 
// No backend-specific basis data is allocated, so there is nothing to free.
static int CeedBasisDestroy_Magma(CeedBasis basis) {
  return 0;
}
683 
// Create a tensor-product H1 basis: only the Apply/Destroy callbacks are
// installed here; the 1D operators (interp1d, grad1d, ...) stay in the
// generic CeedBasis fields that CeedBasisApply_Magma reads.
static int CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P1d,
    CeedInt Q1d, const CeedScalar *interp1d,
    const CeedScalar *grad1d,
    const CeedScalar *qref1d,
    const CeedScalar *qweight1d,
    CeedBasis basis) {
  basis->Apply = CeedBasisApply_Magma;
  basis->Destroy = CeedBasisDestroy_Magma;
  return 0;
}
694 
// Non-tensor H1 bases are not supported by this backend; always errors.
static int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim,
                                   CeedInt ndof, CeedInt nqpts,
                                   const CeedScalar *interp,
                                   const CeedScalar *grad,
                                   const CeedScalar *qref,
                                   const CeedScalar *qweight,
                                   CeedBasis basis) {
  return CeedError(basis->ceed, 1, "Backend does not implement non-tensor bases");
}
704 
705 static int CeedQFunctionApply_Magma(CeedQFunction qf, CeedInt Q,
706                                     CeedVector *U, CeedVector *V) {
707   int ierr;
708   CeedQFunction_Ref *impl;
709   ierr = CeedQFunctionGetData(qf, (void *)&impl); CeedChk(ierr);
710 
711   void *ctx;
712   ierr = CeedQFunctionGetContext(qf, &ctx); CeedChk(ierr);
713 
714   int (*f)() = NULL;
715   ierr = CeedQFunctionGetUserFunction(qf, (int (* *)())&f); CeedChk(ierr);
716 
717   CeedInt nIn, nOut;
718   ierr = CeedQFunctionGetNumArgs(qf, &nIn, &nOut); CeedChk(ierr);
719 
720   for (int i = 0; i<nIn; i++) {
721     if (U[i]) {
722       ierr = CeedVectorGetArrayRead(U[i], CEED_MEM_HOST, &impl->inputs[i]);
723       CeedChk(ierr);
724     }
725   }
726   for (int i = 0; i<nOut; i++) {
727     if (U[i]) {
728       ierr = CeedVectorGetArray(V[i], CEED_MEM_HOST, &impl->outputs[i]);
729       CeedChk(ierr);
730     }
731   }
732 
733   ierr = f(ctx, Q, impl->inputs, impl->outputs); CeedChk(ierr);
734 
735   for (int i = 0; i<nIn; i++) {
736     if (U[i]) {
737       ierr = CeedVectorRestoreArrayRead(U[i], &impl->inputs[i]); CeedChk(ierr);
738     }
739   }
740   for (int i = 0; i<nOut; i++) {
741     if (U[i]) {
742       ierr = CeedVectorRestoreArray(V[i], &impl->outputs[i]); CeedChk(ierr);
743     }
744   }
745   return 0;
746 }
747 
748 static int CeedQFunctionDestroy_Magma(CeedQFunction qf) {
749   int ierr;
750   CeedQFunction_Magma *impl;
751   ierr = CeedQFunctionGetData(qf, (void *)&impl); CeedChk(ierr);
752 
753   ierr = CeedFree(&impl->inputs); CeedChk(ierr);
754   ierr = CeedFree(&impl->outputs); CeedChk(ierr);
755   ierr = CeedFree(&impl); CeedChk(ierr);
756 
757   return 0;
758 }
759 
760 static int CeedQFunctionCreate_Magma(CeedQFunction qf) {
761   int ierr;
762   Ceed ceed;
763   ierr = CeedQFunctionGetCeed(qf, &ceed); CeedChk(ierr);
764 
765   CeedQFunction_Magma *impl;
766   ierr = CeedCalloc(1, &impl); CeedChk(ierr);
767   ierr = CeedCalloc(16, &impl->inputs); CeedChk(ierr);
768   ierr = CeedCalloc(16, &impl->outputs); CeedChk(ierr);
769   ierr = CeedQFunctionSetData(qf, (void *)&impl); CeedChk(ierr);
770 
771   qf->Apply = CeedQFunctionApply_Magma;
772   qf->Destroy = CeedQFunctionDestroy_Magma;
773   return 0;
774 }
775 
776 static int CeedOperatorDestroy_Magma(CeedOperator op) {
777   int ierr;
778   CeedOperator_Magma *impl;
779   ierr = CeedOperatorGetData(op, (void *)&impl); CeedChk(ierr);
780 
781   for (CeedInt i=0; i<impl->numein+impl->numeout; i++) {
782     if (impl->Evecs[i]) {
783       ierr = CeedVectorDestroy(&impl->Evecs[i]); CeedChk(ierr);
784     }
785   }
786   ierr = CeedFree(&impl->Evecs); CeedChk(ierr);
787   ierr = CeedFree(&impl->Edata); CeedChk(ierr);
788 
789   for (CeedInt i=0; i<impl->numein; i++) {
790     ierr = CeedVectorDestroy(&impl->evecsin[i]); CeedChk(ierr);
791     ierr = CeedVectorDestroy(&impl->qvecsin[i]); CeedChk(ierr);
792   }
793   ierr = CeedFree(&impl->evecsin); CeedChk(ierr);
794   ierr = CeedFree(&impl->qvecsin); CeedChk(ierr);
795 
796   for (CeedInt i=0; i<impl->numeout; i++) {
797     ierr = CeedVectorDestroy(&impl->evecsout[i]); CeedChk(ierr);
798     ierr = CeedVectorDestroy(&impl->qvecsout[i]); CeedChk(ierr);
799   }
800   ierr = CeedFree(&impl->evecsout); CeedChk(ierr);
801   ierr = CeedFree(&impl->qvecsout); CeedChk(ierr);
802 
803 
804   ierr = CeedFree(&impl); CeedChk(ierr);
805   return 0;
806 }
807 
808 
809 /*
810   Setup infields or outfields
811  */
812 static int CeedOperatorSetupFields_Magma(CeedQFunction qf, CeedOperator op,
813     bool inOrOut,
814     CeedVector *fullevecs, CeedVector *evecs,
815     CeedVector *qvecs, CeedInt starte,
816     CeedInt numfields, CeedInt Q) {
817   CeedInt dim = 1, ierr, ncomp;
818   Ceed ceed;
819   ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
820   CeedQFunction_Magma *qf_data;
821   ierr = CeedQFunctionGetData(qf, (void *)&qf_data); CeedChk(ierr);
822   CeedBasis basis;
823   CeedElemRestriction Erestrict;
824   CeedOperatorField *opfields;
825   CeedQFunctionField *qffields;
826   if (inOrOut) {
827     ierr = CeedOperatorGetFields(op, NULL, &opfields);
828     CeedChk(ierr);
829     ierr = CeedQFunctionGetFields(qf, NULL, &qffields);
830     CeedChk(ierr);
831   } else {
832     ierr = CeedOperatorGetFields(op, &opfields, NULL);
833     CeedChk(ierr);
834     ierr = CeedQFunctionGetFields(qf, &qffields, NULL);
835     CeedChk(ierr);
836   }
837 
838   // Loop over fields
839   for (CeedInt i=0; i<numfields; i++) {
840     CeedEvalMode emode;
841     ierr = CeedQFunctionFieldGetEvalMode(qffields[i], &emode); CeedChk(ierr);
842     if (emode != CEED_EVAL_WEIGHT) {
843       ierr = CeedOperatorFieldGetElemRestriction(opfields[i], &Erestrict);
844       CeedChk(ierr);
845       ierr = CeedElemRestrictionCreateVector(Erestrict, NULL, &fullevecs[i+starte]);
846       CeedChk(ierr);
847     } else {
848     }
849     switch(emode) {
850     case CEED_EVAL_NONE:
851       ierr = CeedQFunctionFieldGetNumComponents(qffields[i], &ncomp);
852       CeedChk(ierr);
853       ierr = CeedVectorCreate(ceed, Q*ncomp, &evecs[i]); CeedChk(ierr);
854       ierr = CeedVectorCreate(ceed, Q*ncomp, &qvecs[i]); CeedChk(ierr);
855       break; // No action
856     case CEED_EVAL_INTERP:
857       ierr = CeedOperatorFieldGetBasis(opfields[i], &basis); CeedChk(ierr);
858       ierr = CeedQFunctionFieldGetNumComponents(qffields[i], &ncomp);
859       CeedChk(ierr);
860       ierr = CeedVectorCreate(ceed, Q*ncomp, &evecs[i]); CeedChk(ierr);
861       ierr = CeedVectorCreate(ceed, Q*ncomp, &qvecs[i]); CeedChk(ierr);
862       break;
863     case CEED_EVAL_GRAD:
864       ierr = CeedOperatorFieldGetBasis(opfields[i], &basis); CeedChk(ierr);
865       ierr = CeedQFunctionFieldGetNumComponents(qffields[i], &ncomp);
866       ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr);
867       ierr = CeedQFunctionFieldGetNumComponents(qffields[i], &ncomp);
868       CeedChk(ierr);
869       ierr = CeedVectorCreate(ceed, Q*ncomp, &evecs[i]); CeedChk(ierr);
870       ierr = CeedVectorCreate(ceed, Q*ncomp*dim, &qvecs[i]); CeedChk(ierr);
871       break;
872     case CEED_EVAL_WEIGHT: // Only on input fields
873       ierr = CeedOperatorFieldGetBasis(opfields[i], &basis); CeedChk(ierr);
874       ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr);
875       ierr = CeedVectorCreate(ceed, Q, &qvecs[i]); CeedChk(ierr);
876       ierr = CeedBasisApply(basis, 1, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT,
877                             NULL, qvecs[i]); CeedChk(ierr);
878       assert(starte==0);
879       break;
880     case CEED_EVAL_DIV: break; // Not implemented
881     case CEED_EVAL_CURL: break; // Not implemented
882     }
883   }
884   return 0;
885 }
886 
887 /*
888   CeedOperator needs to connect all the named fields (be they active or passive)
889   to the named inputs and outputs of its CeedQFunction.
890  */
891 static int CeedOperatorSetup_Magma(CeedOperator op) {
892   int ierr;
893   bool setupdone;
894   ierr = CeedOperatorGetSetupStatus(op, &setupdone); CeedChk(ierr);
895   if (setupdone) return 0;
896   Ceed ceed;
897   ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
898   CeedOperator_Magma *data;
899   ierr = CeedOperatorGetData(op, (void *)&data); CeedChk(ierr);
900   CeedQFunction qf;
901   ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr);
902   CeedInt Q, numinputfields, numoutputfields;
903   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChk(ierr);
904   ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields);
905   CeedOperatorField *opinputfields, *opoutputfields;
906   ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields);
907   CeedChk(ierr);
908   CeedQFunctionField *qfinputfields, *qfoutputfields;
909   ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields);
910   CeedChk(ierr);
911 
912   data->numein = numinputfields;
913   data->numeout = numoutputfields;
914 
915   // Allocate
916   const CeedInt numIO = numinputfields + numoutputfields;
917 
918   ierr = CeedCalloc(numinputfields + numoutputfields, &data->Evecs);
919   CeedChk(ierr);
920   ierr = CeedCalloc(numinputfields + numoutputfields, &data->Edata);
921   CeedChk(ierr);
922 
923   ierr = CeedCalloc(16, &data->evecsin); CeedChk(ierr);
924   ierr = CeedCalloc(16, &data->evecsout); CeedChk(ierr);
925   ierr = CeedCalloc(16, &data->qvecsin); CeedChk(ierr);
926   ierr = CeedCalloc(16, &data->qvecsout); CeedChk(ierr);
927 
928   // Set up infield and outfield pointer arrays
929   // Infields
930   ierr = CeedOperatorSetupFields_Magma(qf, op, 0, data->Evecs,
931                                        data->evecsin, data->qvecsin, 0,
932                                        numinputfields, Q);
933   CeedChk(ierr);
934   // Outfields
935   ierr = CeedOperatorSetupFields_Magma(qf, op, 1, data->Evecs,
936                                        data->evecsout, data->qvecsout,
937                                        numinputfields, numoutputfields, Q);
938   CeedChk(ierr);
939   ierr = CeedOperatorSetSetupDone(op); CeedChk(ierr);
940   return 0;
941 }
942 
943 static int CeedOperatorApply_Magma(CeedOperator op, CeedVector invec,
944                                    CeedVector outvec, CeedRequest *request) {
945   int ierr;
946   Ceed ceed;
947   ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
948   CeedOperator_Magma *data;
949   ierr = CeedOperatorGetData(op, (void *)&data); CeedChk(ierr);
950   //CeedVector *E = data->Evecs, *D = data->D, outvec;
951   CeedInt Q, elemsize, numelements, numinputfields, numoutputfields, ncomp;
952   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChk(ierr);
953   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChk(ierr);
954   CeedQFunction qf;
955   ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr);
956   CeedTransposeMode lmode;
957   CeedOperatorField *opinputfields, *opoutputfields;
958   ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields);
959   CeedChk(ierr);
960   CeedQFunctionField *qfinputfields, *qfoutputfields;
961   ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields);
962   CeedChk(ierr);
963   CeedEvalMode emode;
964   CeedVector vec;
965   CeedBasis basis;
966   CeedElemRestriction Erestrict;
967 
968   ierr = CeedOperatorSetup_Magma(op); CeedChk(ierr);
969   ierr= CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields);
970   CeedChk(ierr);
971 
972   // Input Evecs and Restriction
973   for (CeedInt i=0; i<numinputfields; i++) {
974     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
975     CeedChk(ierr);
976     if (emode & CEED_EVAL_WEIGHT) {
977     } else { // Restriction
978       // Get input vector
979       ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChk(ierr);
980       if (vec == CEED_VECTOR_ACTIVE)
981         vec = invec;
982       // Restrict
983       ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
984       CeedChk(ierr);
985       ierr = CeedOperatorFieldGetLMode(opinputfields[i], &lmode); CeedChk(ierr);
986       ierr = CeedElemRestrictionApply(Erestrict, CEED_NOTRANSPOSE,
987                                       lmode, vec, data->Evecs[i],
988                                       request); CeedChk(ierr);
989       // Get evec
990       ierr = CeedVectorGetArrayRead(data->Evecs[i], CEED_MEM_HOST,
991                                     (const CeedScalar **) &data->Edata[i]);
992       CeedChk(ierr);
993     }
994   }
995 
996   // Output Evecs
997   for (CeedInt i=0; i<numoutputfields; i++) {
998     ierr = CeedVectorGetArray(data->Evecs[i+data->numein], CEED_MEM_HOST,
999                               &data->Edata[i + numinputfields]); CeedChk(ierr);
1000   }
1001 
1002   // Loop through elements
1003   for (CeedInt e=0; e<numelements; e++) {
1004     // Input basis apply if needed
1005     for (CeedInt i=0; i<numinputfields; i++) {
1006       // Get elemsize, emode, ncomp
1007       ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
1008       CeedChk(ierr);
1009       ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1010       CeedChk(ierr);
1011       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1012       CeedChk(ierr);
1013       ierr = CeedQFunctionFieldGetNumComponents(qfinputfields[i], &ncomp);
1014       CeedChk(ierr);
1015       // Basis action
1016       switch(emode) {
1017       case CEED_EVAL_NONE:
1018         ierr = CeedVectorSetArray(data->qvecsin[i], CEED_MEM_HOST,
1019                                   CEED_USE_POINTER,
1020                                   &data->Edata[i][e*Q*ncomp]); CeedChk(ierr);
1021         break;
1022       case CEED_EVAL_INTERP:
1023         ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
1024         ierr = CeedVectorSetArray(data->evecsin[i], CEED_MEM_HOST,
1025                                   CEED_USE_POINTER,
1026                                   &data->Edata[i][e*elemsize*ncomp]);
1027         CeedChk(ierr);
1028         ierr = CeedBasisApply(basis, 1, CEED_NOTRANSPOSE,
1029                               CEED_EVAL_INTERP, data->evecsin[i],
1030                               data->qvecsin[i]); CeedChk(ierr);
1031         break;
1032       case CEED_EVAL_GRAD:
1033         ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
1034         ierr = CeedVectorSetArray(data->evecsin[i], CEED_MEM_HOST,
1035                                   CEED_USE_POINTER,
1036                                   &data->Edata[i][e*elemsize*ncomp]);
1037         CeedChk(ierr);
1038         ierr = CeedBasisApply(basis, 1, CEED_NOTRANSPOSE,
1039                               CEED_EVAL_GRAD, data->evecsin[i],
1040                               data->qvecsin[i]); CeedChk(ierr);
1041         break;
1042       case CEED_EVAL_WEIGHT:
1043         break;  // No action
1044       case CEED_EVAL_DIV:
1045         break; // Not implemented
1046       case CEED_EVAL_CURL:
1047         break; // Not implemented
1048       }
1049     }
1050     // Output pointers
1051     for (CeedInt i=0; i<numoutputfields; i++) {
1052       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1053       CeedChk(ierr);
1054       if (emode == CEED_EVAL_NONE) {
1055         ierr = CeedQFunctionFieldGetNumComponents(qfoutputfields[i], &ncomp);
1056         CeedChk(ierr);
1057         ierr = CeedVectorSetArray(data->qvecsout[i], CEED_MEM_HOST,
1058                                   CEED_USE_POINTER,
1059                                   &data->Edata[i + numinputfields][e*Q*ncomp]);
1060         CeedChk(ierr);
1061       }
1062       if (emode == CEED_EVAL_INTERP) {
1063       }
1064       if (emode == CEED_EVAL_GRAD) {
1065       }
1066       if (emode == CEED_EVAL_WEIGHT) {
1067       }
1068     }
1069 
1070     // Q function
1071     ierr = CeedQFunctionApply(qf, Q, data->qvecsin, data->qvecsout); CeedChk(ierr);
1072 
1073     // Output basis apply if needed
1074     for (CeedInt i=0; i<numoutputfields; i++) {
1075       // Get elemsize, emode, ncomp
1076       ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
1077       CeedChk(ierr);
1078       ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1079       CeedChk(ierr);
1080       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1081       CeedChk(ierr);
1082       ierr = CeedQFunctionFieldGetNumComponents(qfoutputfields[i], &ncomp);
1083       CeedChk(ierr);
1084       // Basis action
1085       switch(emode) {
1086       case CEED_EVAL_NONE:
1087         break; // No action
1088       case CEED_EVAL_INTERP:
1089         ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis);
1090         CeedChk(ierr);
1091         ierr = CeedVectorSetArray(data->evecsout[i], CEED_MEM_HOST,
1092                                   CEED_USE_POINTER,
1093                                   &data->Edata[i + numinputfields][e*elemsize*ncomp]);
1094         ierr = CeedBasisApply(basis, 1, CEED_TRANSPOSE,
1095                               CEED_EVAL_INTERP, data->qvecsout[i],
1096                               data->evecsout[i]); CeedChk(ierr);
1097         break;
1098       case CEED_EVAL_GRAD:
1099         ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis);
1100         CeedChk(ierr);
1101         ierr = CeedVectorSetArray(data->evecsout[i], CEED_MEM_HOST,
1102                                   CEED_USE_POINTER,
1103                                   &data->Edata[i + numinputfields][e*elemsize*ncomp]);
1104         ierr = CeedBasisApply(basis, 1, CEED_TRANSPOSE,
1105                               CEED_EVAL_GRAD, data->qvecsout[i],
1106                               data->evecsout[i]); CeedChk(ierr);
1107         break;
1108       case CEED_EVAL_WEIGHT: break; // Should not occur
1109       case CEED_EVAL_DIV: break; // Not implemented
1110       case CEED_EVAL_CURL: break; // Not implemented
1111       }
1112     }
1113   } // numelements
1114 
1115   // Zero lvecs
1116   for (CeedInt i=0; i<numoutputfields; i++) {
1117     ierr = CeedOperatorFieldGetVector(opoutputfields[i], &vec); CeedChk(ierr);
1118     if (vec == CEED_VECTOR_ACTIVE)
1119       vec = outvec;
1120     ierr = CeedVectorSetValue(vec, 0.0); CeedChk(ierr);
1121   }
1122 
1123   // Output restriction
1124   for (CeedInt i=0; i<numoutputfields; i++) {
1125     // Restore evec
1126     ierr = CeedVectorRestoreArray(data->Evecs[i+data->numein],
1127                                   &data->Edata[i + numinputfields]);
1128     CeedChk(ierr);
1129     // Get output vector
1130     ierr = CeedOperatorFieldGetVector(opoutputfields[i], &vec); CeedChk(ierr);
1131     // Active
1132     if (vec == CEED_VECTOR_ACTIVE)
1133       vec = outvec;
1134     // Restrict
1135     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
1136     CeedChk(ierr);
1137     ierr = CeedOperatorFieldGetLMode(opoutputfields[i], &lmode); CeedChk(ierr);
1138     ierr = CeedElemRestrictionApply(Erestrict, CEED_TRANSPOSE,
1139                                     lmode, data->Evecs[i+data->numein], vec,
1140                                     request); CeedChk(ierr);
1141   }
1142 
1143   // Restore input arrays
1144   for (CeedInt i=0; i<numinputfields; i++) {
1145     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1146     CeedChk(ierr);
1147     if (emode & CEED_EVAL_WEIGHT) {
1148     } else {
1149       // Restriction
1150       ierr = CeedVectorRestoreArrayRead(data->Evecs[i],
1151                                         (const CeedScalar **) &data->Edata[i]);
1152       CeedChk(ierr);
1153     }
1154   }
1155   return 0;
1156 }
1157 
1158 static int CeedOperatorCreate_Magma(CeedOperator op) {
1159   int ierr;
1160   CeedOperator_Magma *impl;
1161   Ceed ceed;
1162   ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
1163 
1164   ierr = CeedCalloc(1, &impl); CeedChk(ierr);
1165   ierr = CeedOperatorSetData(op, (void *)&impl);
1166 
1167   ierr = CeedSetBackendFunction(ceed, "Operator", op, "Apply",
1168                                 CeedOperatorApply_Magma); CeedChk(ierr);
1169   ierr = CeedSetBackendFunction(ceed, "Operator", op, "Destroy",
1170                                 CeedOperatorDestroy_Magma); CeedChk(ierr);
1171   return 0;
1172 }
1173 
1174 int CeedCompositeOperatorCreate_Magma(CeedOperator op) {
1175   int ierr;
1176   Ceed ceed;
1177   ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
1178   return CeedError(ceed, 1, "Backend does not support composite operators");
1179 }
1180 
1181 // *****************************************************************************
1182 // * INIT
1183 // *****************************************************************************
1184 static int CeedInit_Magma(const char *resource, Ceed ceed) {
1185   int ierr;
1186   if (strcmp(resource, "/gpu/magma"))
1187     return CeedError(ceed, 1, "MAGMA backend cannot use resource: %s", resource);
1188 
1189   ierr = magma_init();
1190   if (ierr) return CeedError(ceed, 1, "error in magma_init(): %d\n", ierr);
1191   //magma_print_environment();
1192 
1193   ceed->VecCreate = CeedVectorCreate_Magma;
1194   ceed->BasisCreateTensorH1 = CeedBasisCreateTensorH1_Magma;
1195   ceed->BasisCreateH1 = CeedBasisCreateH1_Magma;
1196   ceed->ElemRestrictionCreate = CeedElemRestrictionCreate_Magma;
1197   ceed->ElemRestrictionCreateBlocked = CeedElemRestrictionCreateBlocked_Magma;
1198   ceed->QFunctionCreate = CeedQFunctionCreate_Magma;
1199   ceed->OperatorCreate = CeedOperatorCreate_Magma;
1200   ceed->CompositeOperatorCreate = CeedCompositeOperatorCreate_Magma;
1201   return 0;
1202 }
1203 
// *****************************************************************************
// * REGISTER
// *****************************************************************************
// Runs at library load time (GCC/Clang constructor attribute) to register
// this backend under the "/gpu/magma" resource with priority 20.
__attribute__((constructor))
static void Register(void) {
  CeedRegister("/gpu/magma", CeedInit_Magma, 20);
}
1211