xref: /libCEED/rust/libceed-sys/c-src/backends/magma/ceed-magma.c (revision 288c044332e33f37503f09b6484fec9d0a55fba1)
1 // Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at
2 // the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights
3 // reserved. See files LICENSE and NOTICE for details.
4 //
5 // This file is part of CEED, a collection of benchmarks, miniapps, software
6 // libraries and APIs for efficient high-order finite element and spectral
7 // element discretizations for exascale applications. For more information and
8 // source code availability see http://github.com/ceed.
9 //
10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11 // a collaborative effort of two U.S. Department of Energy organizations (Office
12 // of Science and the National Nuclear Security Administration) responsible for
13 // the planning and preparation of a capable exascale ecosystem, including
14 // software, applications, hardware, advanced system engineering and early
15 // testbed platforms, in support of the nation's exascale computing imperative.
16 
17 #include "ceed-magma.h"
18 #include <string.h>
19 
// Backend data for a CeedVector: a (possibly pinned) host copy and a device
// mirror, with flags recording which buffers this backend owns.
typedef struct {
  CeedScalar *array;    // host copy of the data (pinned when allocated here)
  CeedScalar *darray;   // device (GPU) copy of the data
  int  own_;            // nonzero: we own both host and device buffers
  int down_;            // nonzero: we own only the device buffer (host is borrowed)
} CeedVector_Magma;
26 
// Backend data for a CeedElemRestriction: host and device index lists with
// the same ownership flags as CeedVector_Magma.
typedef struct {
  CeedInt *indices;     // host copy of the element indices
  CeedInt *dindices;    // device (GPU) copy of the element indices
  int  own_;            // nonzero: we own both host and device index buffers
  int down_;            // cover a case where we own Device memory only
} CeedElemRestriction_Magma;
33 
// Backend data for a CeedQFunction: scratch tables of argument pointers
// handed to the user Q-function at apply time.
typedef struct {
  const CeedScalar **inputs;   // per-field input array pointers
  CeedScalar **outputs;        // per-field output array pointers
} CeedQFunction_Magma;
38 
// Backend data for a CeedOperator: the element (E-) and quadrature (Q-)
// vectors needed to stage data between L-vectors and the Q-function.
typedef struct {
  CeedVector *Evecs; /// E-vectors needed to apply operator (in followed by out)
  CeedScalar **Edata; /// Raw data pointers for the E-vectors above
  CeedVector *evecsin;   /// Input E-vectors needed to apply operator
  CeedVector *evecsout;   /// Output E-vectors needed to apply operator
  CeedVector *qvecsin;   /// Input Q-vectors needed to apply operator
  CeedVector *qvecsout;   /// Output Q-vectors needed to apply operator
  CeedInt    numein;     /// Number of input fields
  CeedInt    numeout;    /// Number of output fields
} CeedOperator_Magma;
49 
// *****************************************************************************
// * Initialize vector vec (after freeing any previously-owned memory) with
// * values from array, based on cmode:
// *   CEED_COPY_VALUES: allocate device + pinned-host buffers and copy the
// *                     data (the passed pointer is not stored)
// *   CEED_OWN_POINTER: take ownership of array and mirror it on the other
// *                     memory space
// *   CEED_USE_POINTER: reference array without owning it; a device mirror is
// *                     still allocated and owned when array is on the host
// * mtype: CEED_MEM_HOST or CEED_MEM_DEVICE — where `array` resides
// *****************************************************************************
static int CeedVectorSetArray_Magma(CeedVector vec, CeedMemType mtype,
                                    CeedCopyMode cmode, CeedScalar *array) {
  CeedVector_Magma *impl = vec->data;
  int ierr;

  // If own data, free the "old" data, e.g., as it may be of different size
  if (impl->own_) {
    magma_free( impl->darray );
    magma_free_pinned( impl->array );
    impl->darray = NULL;
    impl->array  = NULL;
    impl->own_ = 0;
    impl->down_= 0;
  }

  if (mtype == CEED_MEM_HOST) {
    // memory is on the host; own_ = 0
    switch (cmode) {
    case CEED_COPY_VALUES:
      // Allocate both a device buffer and a pinned host mirror; own both.
      ierr = magma_malloc( (void **)&impl->darray,
                           vec->length * sizeof(CeedScalar)); CeedChk(ierr);
      ierr = magma_malloc_pinned( (void **)&impl->array,
                                  vec->length * sizeof(CeedScalar)); CeedChk(ierr);
      impl->own_ = 1;

      // Upload the caller's host values into the device copy
      if (array != NULL)
        magma_setvector(vec->length, sizeof(array[0]),
                        array, 1, impl->darray, 1);
      break;
    case CEED_OWN_POINTER:
      ierr = magma_malloc( (void **)&impl->darray,
                           vec->length * sizeof(CeedScalar)); CeedChk(ierr);
      // TODO: possible problem here is if we are passed non-pinned memory;
      //       (as we own it, later in destroy, we use free for pinned memory).
      impl->array = array;
      impl->own_ = 1;

      if (array != NULL)
        magma_setvector(vec->length, sizeof(array[0]),
                        array, 1, impl->darray, 1);
      break;
    case CEED_USE_POINTER:
      // Host pointer is borrowed; we own only the device mirror (down_ = 1)
      ierr = magma_malloc( (void **)&impl->darray,
                           vec->length * sizeof(CeedScalar)); CeedChk(ierr);
      magma_setvector(vec->length, sizeof(array[0]),
                      array, 1, impl->darray, 1);

      impl->down_  = 1;
      impl->array  = array;
    }
  } else if (mtype == CEED_MEM_DEVICE) {
    // memory is on the device; own = 0
    switch (cmode) {
    case CEED_COPY_VALUES:
      ierr = magma_malloc( (void **)&impl->darray,
                           vec->length * sizeof(CeedScalar)); CeedChk(ierr);
      ierr = magma_malloc_pinned( (void **)&impl->array,
                                  vec->length * sizeof(CeedScalar)); CeedChk(ierr);
      impl->own_ = 1;

      if (array)
        // device-to-device copy of the provided data
        magma_copyvector(vec->length, sizeof(array[0]),
                         array, 1, impl->darray, 1);
      else
        // t30 assumes allocation initializes with 0s
        // NOTE(review): magma_malloc_pinned is not documented to zero the
        // host buffer copied here — confirm this assumption.
        magma_setvector(vec->length, sizeof(array[0]),
                        impl->array, 1, impl->darray, 1);
      break;
    case CEED_OWN_POINTER:
      // Take ownership of the device pointer; allocate a pinned host mirror
      impl->darray = array;
      ierr = magma_malloc_pinned( (void **)&impl->array,
                                  vec->length * sizeof(CeedScalar)); CeedChk(ierr);
      impl->own_ = 1;

      break;
    case CEED_USE_POINTER:
      // Reference the device pointer only; no host mirror yet
      impl->darray = array;
      impl->array  = NULL;
    }

  } else
    return CeedError(vec->ceed, 1, "Only MemType = HOST or DEVICE supported");

  return 0;
}
143 
// *****************************************************************************
// * Give data pointer from vector vec to array (on HOST or DEVICE).
// * Allocates backing storage lazily if the vector has none yet; errors when
// * the data lives only on the other memory space.
// *****************************************************************************
static int CeedVectorGetArray_Magma(CeedVector vec, CeedMemType mtype,
                                    CeedScalar **array) {
  CeedVector_Magma *impl = vec->data;
  int ierr;

  if (mtype == CEED_MEM_HOST) {
    if (impl->own_) {
      // data is owned so GPU had the most up-to-date version; copy it
      // NOTE(review): an earlier "TTT" marker claimed the device copy is not
      // always the freshest — confirm the sync direction for owned vectors.
      magma_getvector(vec->length, sizeof(*array[0]),
                      impl->darray, 1, impl->array, 1);
      CeedDebug("\033[31m[CeedVectorGetArray_Magma]");
    } else if (impl->array == NULL) {
      // Vector doesn't own the data and was set on GPU
      if (impl->darray == NULL) {
        // call was made just to allocate memory
        ierr = CeedVectorSetArray(vec, mtype, CEED_COPY_VALUES, NULL);
        CeedChk(ierr);
      } else
        return CeedError(vec->ceed, 1, "Can not access DEVICE vector on HOST");
    }
    *array = impl->array;
  } else if (mtype == CEED_MEM_DEVICE) {
    if (impl->darray == NULL) {
      // Vector doesn't own the data and was set on the CPU
      if (impl->array == NULL) {
        // call was made just to allocate memory
        ierr = CeedVectorSetArray(vec, mtype, CEED_COPY_VALUES, NULL);
        CeedChk(ierr);
      } else
        return CeedError(vec->ceed, 1, "Can not access HOST vector on DEVICE");
    }
    *array = impl->darray;
  } else
    return CeedError(vec->ceed, 1, "Can only provide to HOST or DEVICE memory");

  return 0;
}
186 
187 // *****************************************************************************
188 // * Give data pointer from vector vec to array (on HOST or DEVICE) to read it
189 // *****************************************************************************
190 static int CeedVectorGetArrayRead_Magma(CeedVector vec, CeedMemType mtype,
191                                         const CeedScalar **array) {
192   CeedVector_Magma *impl = vec->data;
193   int ierr;
194 
195   if (mtype == CEED_MEM_HOST) {
196     if (impl->own_) {
197       // data is owned so GPU had the most up-to-date version; copy it
198       magma_getvector(vec->length, sizeof(*array[0]),
199                       impl->darray, 1, impl->array, 1);
200     } else if (impl->array == NULL) {
201       // Vector doesn't own the data and was set on GPU
202       if (impl->darray == NULL) {
203         // call was made just to allocate memory
204         ierr = CeedVectorSetArray(vec, mtype, CEED_COPY_VALUES, NULL);
205         CeedChk(ierr);
206       } else
207         return CeedError(vec->ceed, 1, "Can not access DEVICE vector on HOST");
208     }
209     *array = impl->array;
210   } else if (mtype == CEED_MEM_DEVICE) {
211     if (impl->darray == NULL) {
212       // Vector doesn't own the data and was set on the CPU
213       if (impl->array == NULL) {
214         // call was made just to allocate memory
215         ierr = CeedVectorSetArray(vec, mtype, CEED_COPY_VALUES, NULL);
216         CeedChk(ierr);
217       } else
218         return CeedError(vec->ceed, 1, "Can not access HOST vector on DEVICE");
219     }
220     *array = impl->darray;
221   } else
222     return CeedError(vec->ceed, 1, "Can only provide to HOST or DEVICE memory");
223 
224   return 0;
225 }
226 
227 // *****************************************************************************
228 // * There is no mtype here for array so it is not clear if we restore from HOST
229 // * memory or from DEVICE memory. We assume that it is CPU memory because if
230 // * it was GPU memory we would not call this routine at all.
231 // * Restore vector vec with values from array, where array received its values
232 // * from vec and possibly modified them.
233 // *****************************************************************************
234 static int CeedVectorRestoreArray_Magma(CeedVector vec) {
235   CeedVector_Magma *impl = vec->data;
236 
237   if (impl->down_) {
238     // nothing to do if array is on GPU, except if down_=1(case CPU use pointer)
239     magma_getvector(vec->length, sizeof(*array[0]),
240                     impl->darray, 1, impl->array, 1);
241   }
242   return 0;
243 }
244 
245 // *****************************************************************************
246 // * There is no mtype here for array so it is not clear if we restore from HOST
247 // * memory or from DEVICE memory. We assume that it is CPU memory because if
248 // * it was GPU memory we would not call this routine at all.
249 // * Restore vector vec with values from array, where array received its values
250 // * from vec to only read them; in this case vec may have been modified meanwhile
251 // * and needs to be restored here.
252 // *****************************************************************************
253 static int CeedVectorRestoreArrayRead_Magma(CeedVector vec) {
254   CeedVector_Magma *impl = vec->data;
255 
256   if (impl->down_) {
257     // nothing to do if array is on GPU, except if down_=1(case CPU use pointer)
258     magma_getvector(vec->length, sizeof(*array[0]),
259                     impl->darray, 1, impl->array, 1);
260   }
261   return 0;
262 }
263 
264 static int CeedVectorDestroy_Magma(CeedVector vec) {
265   CeedVector_Magma *impl = vec->data;
266   int ierr;
267 
268   // Free if we own the data
269   if (impl->own_) {
270     ierr = magma_free_pinned(impl->array); CeedChk(ierr);
271     ierr = magma_free(impl->darray);       CeedChk(ierr);
272   } else if (impl->down_) {
273     ierr = magma_free(impl->darray);       CeedChk(ierr);
274   }
275   ierr = CeedFree(&vec->data); CeedChk(ierr);
276   return 0;
277 }
278 
279 // *****************************************************************************
280 // * Create vector vec of size n
281 // *****************************************************************************
282 static int CeedVectorCreate_Magma(CeedInt n, CeedVector vec) {
283   CeedVector_Magma *impl;
284   int ierr;
285 
286   vec->SetArray = CeedVectorSetArray_Magma;
287   vec->GetArray = CeedVectorGetArray_Magma;
288   vec->GetArrayRead = CeedVectorGetArrayRead_Magma;
289   vec->RestoreArray = CeedVectorRestoreArray_Magma;
290   vec->RestoreArrayRead = CeedVectorRestoreArrayRead_Magma;
291   vec->Destroy = CeedVectorDestroy_Magma;
292   ierr = CeedCalloc(1,&impl); CeedChk(ierr);
293   impl->darray = NULL;
294   impl->array  = NULL;
295   impl->own_ = 0;
296   impl->down_= 0;
297   vec->data = impl;
298   return 0;
299 }
300 
301 
302 // *****************************************************************************
303 // * Apply restriction operator r to u: v = r(rmode) u
304 // *****************************************************************************
305 static int CeedElemRestrictionApply_Magma(CeedElemRestriction r,
306     CeedTransposeMode tmode,
307     CeedTransposeMode lmode, CeedVector u,
308     CeedVector v, CeedRequest *request) {
309   CeedElemRestriction_Magma *impl = r->data;
310   int ierr;
311   const CeedScalar *uu;
312   CeedScalar *vv;
313   CeedInt nelem = r->nelem, elemsize = r->elemsize, ndof = r->ndof,
314           ncomp=r->ncomp;
315   CeedInt esize = nelem * elemsize;
316 
317   #ifdef USE_MAGMA_BATCH2
318   CeedInt *dindices = impl->dindices;
319   // Get pointers on the device
320   ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &uu); CeedChk(ierr);
321   ierr = CeedVectorGetArray(v, CEED_MEM_DEVICE, &vv); CeedChk(ierr);
322   #else
323   CeedInt *indices = impl->indices;
324   ierr = CeedVectorGetArrayRead(u, CEED_MEM_HOST, &uu); CeedChk(ierr);
325   ierr = CeedVectorGetArray(v, CEED_MEM_HOST, &vv); CeedChk(ierr);
326   #endif
327 
328   if (tmode == CEED_NOTRANSPOSE) {
329     // Perform: v = r * u
330     if (!impl->indices) {
331       for (CeedInt i=0; i<esize*ncomp; i++) vv[i] = uu[i];
332     } else if (ncomp == 1) {
333       #ifdef USE_MAGMA_BATCH2
334 magma_template<<i=0:esize>>
335       (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices) {
336         vv[i] = uu[dindices[i]];
337       }
338       #else
339       for (CeedInt i=0; i<esize; i++) vv[i] = uu[indices[i]];
340       #endif
341     } else {
342       // vv is (elemsize x ncomp x nelem), column-major
343       if (lmode == CEED_NOTRANSPOSE) { // u is (ndof x ncomp), column-major
344         #ifdef USE_MAGMA_BATCH2
345 magma_template<<e=0:nelem, d=0:ncomp, i=0:elemsize>>
346         (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices, int ndof) {
347           vv[i + iend*(d+dend*e)] = uu[dindices[i+iend*e]+ndof*d];
348         }
349         #else
350         for (CeedInt e = 0; e < nelem; e++)
351           for (CeedInt d = 0; d < ncomp; d++)
352             for (CeedInt i=0; i < elemsize; i++) {
353               vv[i + elemsize*(d+ncomp*e)] =
354                 uu[indices[i+elemsize*e]+ndof*d];
355             }
356         #endif
357       } else { // u is (ncomp x ndof), column-major
358         #ifdef USE_MAGMA_BATCH2
359 magma_template<<e=0:nelem, d=0:ncomp, i=0:elemsize>>
360         (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices) {
361           vv[i + iend*(d+dend*e)] = uu[d+dend*dindices[i + iend*e]];
362         }
363         #else
364         for (CeedInt e = 0; e < nelem; e++)
365           for (CeedInt d = 0; d < ncomp; d++)
366             for (CeedInt i=0; i< elemsize; i++) {
367               vv[i + elemsize*(d+ncomp*e)] =
368                 uu[d+ncomp*indices[i+elemsize*e]];
369             }
370         #endif
371       }
372     }
373   } else {
374     // Note: in transpose mode, we perform: v += r^t * u
375     if (!impl->indices) {
376       for (CeedInt i=0; i<esize; i++) vv[i] += uu[i];
377     } else if (ncomp == 1) {
378       // fprintf(stderr,"3 ---------\n");
379       #ifdef USE_MAGMA_BATCH2
380 magma_template<<i=0:esize>>
381       (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices) {
382         magmablas_datomic_add( &vv[dindices[i]], uu[i]);
383       }
384       #else
385       for (CeedInt i=0; i<esize; i++) vv[indices[i]] += uu[i];
386       #endif
387     } else { // u is (elemsize x ncomp x nelem)
388       fprintf(stderr,"2 ---------\n");
389 
390       if (lmode == CEED_NOTRANSPOSE) { // vv is (ndof x ncomp), column-major
391         #ifdef USE_MAGMA_BATCH2
392 magma_template<<e=0:nelem, d=0:ncomp, i=0:elemsize>>
393         (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices, CeedInt ndof) {
394           magmablas_datomic_add( &vv[dindices[i+iend*e]+ndof*d],
395                                  uu[i+iend*(d+e*dend)]);
396         }
397         #else
398         for (CeedInt e = 0; e < nelem; e++)
399           for (CeedInt d = 0; d < ncomp; d++)
400             for (CeedInt i=0; i < elemsize; i++) {
401               vv[indices[i + elemsize*e]+ndof*d] +=
402                 uu[i + elemsize*(d+e*ncomp)];
403             }
404         #endif
405       } else { // vv is (ncomp x ndof), column-major
406         #ifdef USE_MAGMA_BATCH2
407 magma_template<<e=0:nelem, d=0:ncomp, i=0:elemsize>>
408         (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices) {
409           magmablas_datomic_add( &vv[d+dend*dindices[i + iend*e]],
410                                  uu[i+iend*(d+e*dend)]);
411         }
412         #else
413         for (CeedInt e = 0; e < nelem; e++)
414           for (CeedInt d = 0; d < ncomp; d++)
415             for (CeedInt i=0; i < elemsize; i++) {
416               vv[d+ncomp*indices[i + elemsize*e]] +=
417                 uu[i + elemsize*(d+e*ncomp)];
418             }
419         #endif
420       }
421     }
422   }
423 
424   ierr = CeedVectorRestoreArrayRead(u, &uu); CeedChk(ierr);
425   ierr = CeedVectorRestoreArray(v, &vv); CeedChk(ierr);
426 
427   if (request != CEED_REQUEST_IMMEDIATE && request != CEED_REQUEST_ORDERED)
428     *request = NULL;
429   return 0;
430 }
431 
432 int CeedElemRestrictionApplyBlock_Magma(CeedElemRestriction r,
433                                         CeedInt block, CeedTransposeMode tmode,
434                                         CeedTransposeMode lmode, CeedVector u,
435                                         CeedVector v, CeedRequest *request) {
436   int ierr;
437   Ceed ceed;
438   ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChk(ierr);
439   return CeedError(ceed, 1, "Backend does not implement blocked restrictions");
440 }
441 
442 static int CeedElemRestrictionDestroy_Magma(CeedElemRestriction r) {
443   CeedElemRestriction_Magma *impl = r->data;
444   int ierr;
445 
446   // Free if we own the data
447   if (impl->own_) {
448     ierr = magma_free_pinned(impl->indices); CeedChk(ierr);
449     ierr = magma_free(impl->dindices);       CeedChk(ierr);
450   } else if (impl->down_) {
451     ierr = magma_free(impl->dindices);       CeedChk(ierr);
452   }
453   ierr = CeedFree(&r->data); CeedChk(ierr);
454   return 0;
455 }
456 
// *****************************************************************************
// * Create the restriction from the index list `indices` (on HOST or DEVICE),
// * mirroring the ownership rules of CeedVectorSetArray_Magma:
// *   CEED_COPY_VALUES: allocate pinned-host + device buffers and copy
// *   CEED_OWN_POINTER: take ownership of indices, mirror on the other space
// *   CEED_USE_POINTER: reference indices; own only the device mirror (down_)
// *****************************************************************************
static int CeedElemRestrictionCreate_Magma(CeedMemType mtype,
    CeedCopyMode cmode,
    const CeedInt *indices, CeedElemRestriction r) {
  int ierr, size = r->nelem*r->elemsize;
  CeedElemRestriction_Magma *impl;

  // Allocate memory for the MAGMA Restriction and initialize pointers to NULL
  ierr = CeedCalloc(1,&impl); CeedChk(ierr);
  impl->dindices = NULL;
  impl->indices  = NULL;
  impl->own_ = 0;
  impl->down_= 0;

  if (mtype == CEED_MEM_HOST) {
    // memory is on the host; own_ = 0
    switch (cmode) {
    case CEED_COPY_VALUES:
      // Allocate both a device buffer and a pinned host copy; own both
      ierr = magma_malloc( (void **)&impl->dindices,
                           size * sizeof(CeedInt)); CeedChk(ierr);
      ierr = magma_malloc_pinned( (void **)&impl->indices,
                                  size * sizeof(CeedInt)); CeedChk(ierr);
      impl->own_ = 1;

      if (indices != NULL) {
        memcpy(impl->indices, indices, size * sizeof(indices[0]));
        magma_setvector(size, sizeof(CeedInt),
                        impl->indices, 1, impl->dindices, 1);
      }
      break;
    case CEED_OWN_POINTER:
      ierr = magma_malloc( (void **)&impl->dindices,
                           size * sizeof(CeedInt)); CeedChk(ierr);
      // TODO: possible problem here is if we are passed non-pinned memory;
      //       (as we own it, later in destroy, we use free for pinned memory).
      impl->indices = (CeedInt *)indices;
      impl->own_ = 1;

      if (indices != NULL)
        magma_setvector(size, sizeof(CeedInt),
                        indices, 1, impl->dindices, 1);
      break;
    case CEED_USE_POINTER:
      // Host indices are borrowed; we own only the device mirror
      ierr = magma_malloc( (void **)&impl->dindices,
                           size * sizeof(CeedInt)); CeedChk(ierr);
      magma_setvector(size, sizeof(CeedInt),
                      indices, 1, impl->dindices, 1);
      impl->down_ = 1;
      impl->indices  = (CeedInt *)indices;
    }
  } else if (mtype == CEED_MEM_DEVICE) {
    // memory is on the device; own = 0
    switch (cmode) {
    case CEED_COPY_VALUES:
      ierr = magma_malloc( (void **)&impl->dindices,
                           size * sizeof(CeedInt)); CeedChk(ierr);
      ierr = magma_malloc_pinned( (void **)&impl->indices,
                                  size * sizeof(CeedInt)); CeedChk(ierr);
      impl->own_ = 1;

      if (indices)
        // device-to-device copy of the provided indices
        magma_copyvector(size, sizeof(CeedInt),
                         indices, 1, impl->dindices, 1);
      break;
    case CEED_OWN_POINTER:
      // Take ownership of the device pointer; allocate a pinned host mirror
      impl->dindices = (CeedInt *)indices;
      ierr = magma_malloc_pinned( (void **)&impl->indices,
                                  size * sizeof(CeedInt)); CeedChk(ierr);
      impl->own_ = 1;

      break;
    case CEED_USE_POINTER:
      // Reference the device pointer only; no host copy
      impl->dindices = (CeedInt *)indices;
      impl->indices  = NULL;
    }

  } else
    return CeedError(r->ceed, 1, "Only MemType = HOST or DEVICE supported");

  r->data    = impl;
  r->Apply   = CeedElemRestrictionApply_Magma;
  r->ApplyBlock = CeedElemRestrictionApplyBlock_Magma;
  r->Destroy = CeedElemRestrictionDestroy_Magma;

  return 0;
}
542 
543 static int CeedElemRestrictionCreateBlocked_Magma(CeedMemType mtype,
544     CeedCopyMode cmode,
545     const CeedInt *indices, CeedElemRestriction r) {
546   return CeedError(r->ceed, 1, "Backend does not implement blocked restrictions");
547 }
548 
549 // Contracts on the middle index
550 // NOTRANSPOSE: V_ajc = T_jb U_abc
551 // TRANSPOSE:   V_ajc = T_bj U_abc
552 // If Add != 0, "=" is replaced by "+="
553 static int CeedTensorContract_Magma(Ceed ceed,
554                                     CeedInt A, CeedInt B, CeedInt C, CeedInt J,
555                                     const CeedScalar *t, CeedTransposeMode tmode,
556                                     const CeedInt Add,
557                                     const CeedScalar *u, CeedScalar *v) {
558   #ifdef USE_MAGMA_BATCH
559   magma_dtensor_contract(ceed, A, B, C, J, t, tmode, Add, u, v);
560   #else
561   CeedInt tstride0 = B, tstride1 = 1;
562   if (tmode == CEED_TRANSPOSE) {
563     tstride0 = 1; tstride1 = J;
564   }
565   CeedDebug("\033[31m[CeedTensorContract] A=%d, J=%d, C=%d, B=%d: %d %d %d",
566             A,J,C,B,A*J*B*C, C*J*A, C*B*A);
567   for (CeedInt a=0; a<A; a++) {
568     for (CeedInt j=0; j<J; j++) {
569       if (!Add) {
570         for (CeedInt c=0; c<C; c++)
571           v[(a*J+j)*C+c] = 0;
572       }
573       for (CeedInt b=0; b<B; b++) {
574         for (CeedInt c=0; c<C; c++) {
575           v[(a*J+j)*C+c] += t[j*tstride0 + b*tstride1] * u[(a*B+b)*C+c];
576         }
577       }
578     }
579   }
580   #endif
581   return 0;
582 }
583 
584 static int CeedBasisApply_Magma(CeedBasis basis, CeedInt nelem,
585                                 CeedTransposeMode tmode, CeedEvalMode emode,
586                                 CeedVector U, CeedVector V) {
587   int ierr;
588   const CeedInt dim = basis->dim;
589   const CeedInt ncomp = basis->ncomp;
590   const CeedInt nqpt = ncomp*CeedPowInt(basis->Q1d, dim);
591   const CeedInt add = (tmode == CEED_TRANSPOSE);
592   const CeedScalar *u;
593   CeedScalar *v;
594   if (U) {
595     ierr = CeedVectorGetArrayRead(U, CEED_MEM_HOST, &u); CeedChk(ierr);
596   } else if (emode != CEED_EVAL_WEIGHT) {
597     return CeedError(ceed, 1,
598                      "An input vector is required for this CeedEvalMode");
599   }
600   ierr = CeedVectorGetArray(V, CEED_MEM_HOST, &v); CeedChk(ierr);
601 
602   if (nelem != 1)
603     return CeedError(basis->ceed, 1,
604                      "This backend does not support BasisApply for multiple elements");
605 
606   CeedDebug("\033[01m[CeedBasisApply_Magma] vsize=%d",
607             ncomp*CeedPowInt(basis->P1d, dim));
608 
609   if (tmode == CEED_TRANSPOSE) {
610     const CeedInt vsize = ncomp*CeedPowInt(basis->P1d, dim);
611     for (CeedInt i = 0; i < vsize; i++)
612       v[i] = (CeedScalar) 0;
613   }
614   if (emode & CEED_EVAL_INTERP) {
615     CeedInt P = basis->P1d, Q = basis->Q1d;
616     if (tmode == CEED_TRANSPOSE) {
617       P = basis->Q1d; Q = basis->P1d;
618     }
619     CeedInt pre = ncomp*CeedPowInt(P, dim-1), post = 1;
620     CeedScalar tmp[2][ncomp*Q*CeedPowInt(P>Q?P:Q, dim-1)];
621     CeedDebug("\033[01m[CeedBasisApply_Magma] tmpsize = %d",
622               ncomp*Q*CeedPowInt(P>Q?P:Q, dim-1));
623     for (CeedInt d=0; d<dim; d++) {
624       ierr = CeedTensorContract_Magma(basis->ceed, pre, P, post, Q, basis->interp1d,
625                                       tmode, add&&(d==dim-1),
626                                       d==0?u:tmp[d%2], d==dim-1?v:tmp[(d+1)%2]);
627       CeedChk(ierr);
628       pre /= P;
629       post *= Q;
630     }
631     if (tmode == CEED_NOTRANSPOSE) {
632       v += nqpt;
633     } else {
634       u += nqpt;
635     }
636   }
637   if (emode & CEED_EVAL_GRAD) {
638     CeedInt P = basis->P1d, Q = basis->Q1d;
639     // In CEED_NOTRANSPOSE mode:
640     // u is (P^dim x nc), column-major layout (nc = ncomp)
641     // v is (Q^dim x nc x dim), column-major layout (nc = ncomp)
642     // In CEED_TRANSPOSE mode, the sizes of u and v are switched.
643     if (tmode == CEED_TRANSPOSE) {
644       P = basis->Q1d, Q = basis->P1d;
645     }
646     CeedScalar tmp[2][ncomp*Q*CeedPowInt(P>Q?P:Q, dim-1)];
647     CeedDebug("\033[01m[CeedBasisApply_Magma] tmpsize = %d",
648               ncomp*Q*CeedPowInt(P>Q?P:Q, dim-1));
649     for (CeedInt p = 0; p < dim; p++) {
650       CeedInt pre = ncomp*CeedPowInt(P, dim-1), post = 1;
651       for (CeedInt d=0; d<dim; d++) {
652         ierr = CeedTensorContract_Magma(basis->ceed, pre, P, post, Q,
653                                         (p==d)?basis->grad1d:basis->interp1d,
654                                         tmode, add&&(d==dim-1),
655                                         d==0?u:tmp[d%2], d==dim-1?v:tmp[(d+1)%2]);
656         CeedChk(ierr);
657         pre /= P;
658         post *= Q;
659       }
660       if (tmode == CEED_NOTRANSPOSE) {
661         v += nqpt;
662       } else {
663         u += nqpt;
664       }
665     }
666   }
667   if (emode & CEED_EVAL_WEIGHT) {
668     if (tmode == CEED_TRANSPOSE)
669       return CeedError(basis->ceed, 1,
670                        "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE");
671     CeedInt Q = basis->Q1d;
672     for (CeedInt d=0; d<dim; d++) {
673       CeedInt pre = CeedPowInt(Q, dim-d-1), post = CeedPowInt(Q, d);
674       for (CeedInt i=0; i<pre; i++) {
675         for (CeedInt j=0; j<Q; j++) {
676           for (CeedInt k=0; k<post; k++) {
677             v[(i*Q + j)*post + k] = basis->qweight1d[j]
678                                     * (d == 0 ? 1 : v[(i*Q + j)*post + k]);
679           }
680         }
681       }
682     }
683   }
684   if (U) {
685     ierr = CeedVectorRestoreArrayRead(U, &u); CeedChk(ierr);
686   }
687   ierr = CeedVectorRestoreArray(V, &v); CeedChk(ierr);
688   return 0;
689 }
690 
// No backend-specific basis state is allocated, so there is nothing to free.
static int CeedBasisDestroy_Magma(CeedBasis basis) {
  return 0;
}
694 
695 static int CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P1d,
696     CeedInt Q1d, const CeedScalar *interp1d,
697     const CeedScalar *grad1d,
698     const CeedScalar *qref1d,
699     const CeedScalar *qweight1d,
700     CeedBasis basis) {
701   basis->Apply = CeedBasisApply_Magma;
702   basis->Destroy = CeedBasisDestroy_Magma;
703   return 0;
704 }
705 
706 static int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim,
707                                    CeedInt ndof, CeedInt nqpts,
708                                    const CeedScalar *interp,
709                                    const CeedScalar *grad,
710                                    const CeedScalar *qref,
711                                    const CeedScalar *qweight,
712                                    CeedBasis basis) {
713   return CeedError(basis->ceed, 1, "Backend does not implement non-tensor bases");
714 }
715 
716 static int CeedQFunctionApply_Magma(CeedQFunction qf, CeedInt Q,
717                                     CeedVector *U, CeedVector *V) {
718   int ierr;
719   CeedQFunction_Ref *impl;
720   ierr = CeedQFunctionGetData(qf, (void *)&impl); CeedChk(ierr);
721 
722   void *ctx;
723   ierr = CeedQFunctionGetContext(qf, &ctx); CeedChk(ierr);
724 
725   int (*f)() = NULL;
726   ierr = CeedQFunctionGetUserFunction(qf, (int (* *)())&f); CeedChk(ierr);
727 
728   CeedInt nIn, nOut;
729   ierr = CeedQFunctionGetNumArgs(qf, &nIn, &nOut); CeedChk(ierr);
730 
731   for (int i = 0; i<nIn; i++) {
732     if (U[i]) {
733       ierr = CeedVectorGetArrayRead(U[i], CEED_MEM_HOST, &impl->inputs[i]);
734       CeedChk(ierr);
735     }
736   }
737   for (int i = 0; i<nOut; i++) {
738     if (U[i]) {
739       ierr = CeedVectorGetArray(V[i], CEED_MEM_HOST, &impl->outputs[i]);
740       CeedChk(ierr);
741     }
742   }
743 
744   ierr = f(ctx, Q, impl->inputs, impl->outputs); CeedChk(ierr);
745 
746   for (int i = 0; i<nIn; i++) {
747     if (U[i]) {
748       ierr = CeedVectorRestoreArrayRead(U[i], &impl->inputs[i]); CeedChk(ierr);
749     }
750   }
751   for (int i = 0; i<nOut; i++) {
752     if (U[i]) {
753       ierr = CeedVectorRestoreArray(V[i], &impl->outputs[i]); CeedChk(ierr);
754     }
755   }
756   return 0;
757 }
758 
759 static int CeedQFunctionDestroy_Magma(CeedQFunction qf) {
760   int ierr;
761   CeedQFunction_Magma *impl;
762   ierr = CeedQFunctionGetData(qf, (void *)&impl); CeedChk(ierr);
763 
764   ierr = CeedFree(&impl->inputs); CeedChk(ierr);
765   ierr = CeedFree(&impl->outputs); CeedChk(ierr);
766   ierr = CeedFree(&impl); CeedChk(ierr);
767 
768   return 0;
769 }
770 
771 static int CeedQFunctionCreate_Magma(CeedQFunction qf) {
772   int ierr;
773   Ceed ceed;
774   ierr = CeedQFunctionGetCeed(qf, &ceed); CeedChk(ierr);
775 
776   CeedQFunction_Magma *impl;
777   ierr = CeedCalloc(1, &impl); CeedChk(ierr);
778   ierr = CeedCalloc(16, &impl->inputs); CeedChk(ierr);
779   ierr = CeedCalloc(16, &impl->outputs); CeedChk(ierr);
780   ierr = CeedQFunctionSetData(qf, (void *)&impl); CeedChk(ierr);
781 
782   qf->Apply = CeedQFunctionApply_Magma;
783   qf->Destroy = CeedQFunctionDestroy_Magma;
784   return 0;
785 }
786 
787 static int CeedOperatorDestroy_Magma(CeedOperator op) {
788   int ierr;
789   CeedOperator_Magma *impl;
790   ierr = CeedOperatorGetData(op, (void *)&impl); CeedChk(ierr);
791 
792   for (CeedInt i=0; i<impl->numein+impl->numeout; i++) {
793     if (impl->Evecs[i]) {
794       ierr = CeedVectorDestroy(&impl->Evecs[i]); CeedChk(ierr);
795     }
796   }
797   ierr = CeedFree(&impl->Evecs); CeedChk(ierr);
798   ierr = CeedFree(&impl->Edata); CeedChk(ierr);
799 
800   for (CeedInt i=0; i<impl->numein; i++) {
801     ierr = CeedVectorDestroy(&impl->evecsin[i]); CeedChk(ierr);
802     ierr = CeedVectorDestroy(&impl->qvecsin[i]); CeedChk(ierr);
803   }
804   ierr = CeedFree(&impl->evecsin); CeedChk(ierr);
805   ierr = CeedFree(&impl->qvecsin); CeedChk(ierr);
806 
807   for (CeedInt i=0; i<impl->numeout; i++) {
808     ierr = CeedVectorDestroy(&impl->evecsout[i]); CeedChk(ierr);
809     ierr = CeedVectorDestroy(&impl->qvecsout[i]); CeedChk(ierr);
810   }
811   ierr = CeedFree(&impl->evecsout); CeedChk(ierr);
812   ierr = CeedFree(&impl->qvecsout); CeedChk(ierr);
813 
814 
815   ierr = CeedFree(&impl); CeedChk(ierr);
816   return 0;
817 }
818 
819 
820 /*
821   Setup infields or outfields
822  */
823 static int CeedOperatorSetupFields_Magma(CeedQFunction qf, CeedOperator op,
824     bool inOrOut,
825     CeedVector *fullevecs, CeedVector *evecs,
826     CeedVector *qvecs, CeedInt starte,
827     CeedInt numfields, CeedInt Q) {
828   CeedInt dim = 1, ierr, ncomp;
829   Ceed ceed;
830   ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
831   CeedQFunction_Magma *qf_data;
832   ierr = CeedQFunctionGetData(qf, (void *)&qf_data); CeedChk(ierr);
833   CeedBasis basis;
834   CeedElemRestriction Erestrict;
835   CeedOperatorField *opfields;
836   CeedQFunctionField *qffields;
837   if (inOrOut) {
838     ierr = CeedOperatorGetFields(op, NULL, &opfields);
839     CeedChk(ierr);
840     ierr = CeedQFunctionGetFields(qf, NULL, &qffields);
841     CeedChk(ierr);
842   } else {
843     ierr = CeedOperatorGetFields(op, &opfields, NULL);
844     CeedChk(ierr);
845     ierr = CeedQFunctionGetFields(qf, &qffields, NULL);
846     CeedChk(ierr);
847   }
848 
849   // Loop over fields
850   for (CeedInt i=0; i<numfields; i++) {
851     CeedEvalMode emode;
852     ierr = CeedQFunctionFieldGetEvalMode(qffields[i], &emode); CeedChk(ierr);
853     if (emode != CEED_EVAL_WEIGHT) {
854       ierr = CeedOperatorFieldGetElemRestriction(opfields[i], &Erestrict);
855       CeedChk(ierr);
856       ierr = CeedElemRestrictionCreateVector(Erestrict, NULL, &fullevecs[i+starte]);
857       CeedChk(ierr);
858     } else {
859     }
860     switch(emode) {
861     case CEED_EVAL_NONE:
862       ierr = CeedQFunctionFieldGetNumComponents(qffields[i], &ncomp);
863       CeedChk(ierr);
864       ierr = CeedVectorCreate(ceed, Q*ncomp, &evecs[i]); CeedChk(ierr);
865       ierr = CeedVectorCreate(ceed, Q*ncomp, &qvecs[i]); CeedChk(ierr);
866       break; // No action
867     case CEED_EVAL_INTERP:
868       ierr = CeedOperatorFieldGetBasis(opfields[i], &basis); CeedChk(ierr);
869       ierr = CeedQFunctionFieldGetNumComponents(qffields[i], &ncomp);
870       CeedChk(ierr);
871       ierr = CeedVectorCreate(ceed, Q*ncomp, &evecs[i]); CeedChk(ierr);
872       ierr = CeedVectorCreate(ceed, Q*ncomp, &qvecs[i]); CeedChk(ierr);
873       break;
874     case CEED_EVAL_GRAD:
875       ierr = CeedOperatorFieldGetBasis(opfields[i], &basis); CeedChk(ierr);
876       ierr = CeedQFunctionFieldGetNumComponents(qffields[i], &ncomp);
877       ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr);
878       ierr = CeedQFunctionFieldGetNumComponents(qffields[i], &ncomp);
879       CeedChk(ierr);
880       ierr = CeedVectorCreate(ceed, Q*ncomp, &evecs[i]); CeedChk(ierr);
881       ierr = CeedVectorCreate(ceed, Q*ncomp*dim, &qvecs[i]); CeedChk(ierr);
882       break;
883     case CEED_EVAL_WEIGHT: // Only on input fields
884       ierr = CeedOperatorFieldGetBasis(opfields[i], &basis); CeedChk(ierr);
885       ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr);
886       ierr = CeedVectorCreate(ceed, Q, &qvecs[i]); CeedChk(ierr);
887       ierr = CeedBasisApply(basis, 1, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT,
888                             NULL, qvecs[i]); CeedChk(ierr);
889       assert(starte==0);
890       break;
891     case CEED_EVAL_DIV: break; // Not implemented
892     case CEED_EVAL_CURL: break; // Not implemented
893     }
894   }
895   return 0;
896 }
897 
898 /*
899   CeedOperator needs to connect all the named fields (be they active or passive)
900   to the named inputs and outputs of its CeedQFunction.
901  */
902 static int CeedOperatorSetup_Magma(CeedOperator op) {
903   int ierr;
904   bool setupdone;
905   ierr = CeedOperatorGetSetupStatus(op, &setupdone); CeedChk(ierr);
906   if (setupdone) return 0;
907   Ceed ceed;
908   ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
909   CeedOperator_Magma *data;
910   ierr = CeedOperatorGetData(op, (void *)&data); CeedChk(ierr);
911   CeedQFunction qf;
912   ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr);
913   CeedInt Q, numinputfields, numoutputfields;
914   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChk(ierr);
915   ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields);
916   CeedOperatorField *opinputfields, *opoutputfields;
917   ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields);
918   CeedChk(ierr);
919   CeedQFunctionField *qfinputfields, *qfoutputfields;
920   ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields);
921   CeedChk(ierr);
922 
923   data->numein = numinputfields;
924   data->numeout = numoutputfields;
925 
926   // Allocate
927   const CeedInt numIO = numinputfields + numoutputfields;
928 
929   ierr = CeedCalloc(numinputfields + numoutputfields, &data->Evecs);
930   CeedChk(ierr);
931   ierr = CeedCalloc(numinputfields + numoutputfields, &data->Edata);
932   CeedChk(ierr);
933 
934   ierr = CeedCalloc(16, &data->evecsin); CeedChk(ierr);
935   ierr = CeedCalloc(16, &data->evecsout); CeedChk(ierr);
936   ierr = CeedCalloc(16, &data->qvecsin); CeedChk(ierr);
937   ierr = CeedCalloc(16, &data->qvecsout); CeedChk(ierr);
938 
939   // Set up infield and outfield pointer arrays
940   // Infields
941   ierr = CeedOperatorSetupFields_Magma(qf, op, 0, data->Evecs,
942                                        data->evecsin, data->qvecsin, 0,
943                                        numinputfields, Q);
944   CeedChk(ierr);
945   // Outfields
946   ierr = CeedOperatorSetupFields_Magma(qf, op, 1, data->Evecs,
947                                        data->evecsout, data->qvecsout,
948                                        numinputfields, numoutputfields, Q);
949   CeedChk(ierr);
950   ierr = CeedOperatorSetSetupDone(op); CeedChk(ierr);
951   return 0;
952 }
953 
954 static int CeedOperatorApply_Magma(CeedOperator op, CeedVector invec,
955                                    CeedVector outvec, CeedRequest *request) {
956   int ierr;
957   Ceed ceed;
958   ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
959   CeedOperator_Magma *data;
960   ierr = CeedOperatorGetData(op, (void *)&data); CeedChk(ierr);
961   //CeedVector *E = data->Evecs, *D = data->D, outvec;
962   CeedInt Q, elemsize, numelements, numinputfields, numoutputfields, ncomp;
963   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChk(ierr);
964   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChk(ierr);
965   CeedQFunction qf;
966   ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr);
967   CeedTransposeMode lmode;
968   CeedOperatorField *opinputfields, *opoutputfields;
969   ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields);
970   CeedChk(ierr);
971   CeedQFunctionField *qfinputfields, *qfoutputfields;
972   ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields);
973   CeedChk(ierr);
974   CeedEvalMode emode;
975   CeedVector vec;
976   CeedBasis basis;
977   CeedElemRestriction Erestrict;
978 
979   ierr = CeedOperatorSetup_Magma(op); CeedChk(ierr);
980   ierr= CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields);
981   CeedChk(ierr);
982 
983   // Input Evecs and Restriction
984   for (CeedInt i=0; i<numinputfields; i++) {
985     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
986     CeedChk(ierr);
987     if (emode & CEED_EVAL_WEIGHT) {
988     } else { // Restriction
989       // Get input vector
990       ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChk(ierr);
991       if (vec == CEED_VECTOR_ACTIVE)
992         vec = invec;
993       // Restrict
994       ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
995       CeedChk(ierr);
996       ierr = CeedOperatorFieldGetLMode(opinputfields[i], &lmode); CeedChk(ierr);
997       ierr = CeedElemRestrictionApply(Erestrict, CEED_NOTRANSPOSE,
998                                       lmode, vec, data->Evecs[i],
999                                       request); CeedChk(ierr);
1000       // Get evec
1001       ierr = CeedVectorGetArrayRead(data->Evecs[i], CEED_MEM_HOST,
1002                                     (const CeedScalar **) &data->Edata[i]);
1003       CeedChk(ierr);
1004     }
1005   }
1006 
1007   // Output Evecs
1008   for (CeedInt i=0; i<numoutputfields; i++) {
1009     ierr = CeedVectorGetArray(data->Evecs[i+data->numein], CEED_MEM_HOST,
1010                               &data->Edata[i + numinputfields]); CeedChk(ierr);
1011   }
1012 
1013   // Loop through elements
1014   for (CeedInt e=0; e<numelements; e++) {
1015     // Input basis apply if needed
1016     for (CeedInt i=0; i<numinputfields; i++) {
1017       // Get elemsize, emode, ncomp
1018       ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
1019       CeedChk(ierr);
1020       ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1021       CeedChk(ierr);
1022       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1023       CeedChk(ierr);
1024       ierr = CeedQFunctionFieldGetNumComponents(qfinputfields[i], &ncomp);
1025       CeedChk(ierr);
1026       // Basis action
1027       switch(emode) {
1028       case CEED_EVAL_NONE:
1029         ierr = CeedVectorSetArray(data->qvecsin[i], CEED_MEM_HOST,
1030                                   CEED_USE_POINTER,
1031                                   &data->Edata[i][e*Q*ncomp]); CeedChk(ierr);
1032         break;
1033       case CEED_EVAL_INTERP:
1034         ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
1035         ierr = CeedVectorSetArray(data->evecsin[i], CEED_MEM_HOST,
1036                                   CEED_USE_POINTER,
1037                                   &data->Edata[i][e*elemsize*ncomp]);
1038         CeedChk(ierr);
1039         ierr = CeedBasisApply(basis, 1, CEED_NOTRANSPOSE,
1040                               CEED_EVAL_INTERP, data->evecsin[i],
1041                               data->qvecsin[i]); CeedChk(ierr);
1042         break;
1043       case CEED_EVAL_GRAD:
1044         ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
1045         ierr = CeedVectorSetArray(data->evecsin[i], CEED_MEM_HOST,
1046                                   CEED_USE_POINTER,
1047                                   &data->Edata[i][e*elemsize*ncomp]);
1048         CeedChk(ierr);
1049         ierr = CeedBasisApply(basis, 1, CEED_NOTRANSPOSE,
1050                               CEED_EVAL_GRAD, data->evecsin[i],
1051                               data->qvecsin[i]); CeedChk(ierr);
1052         break;
1053       case CEED_EVAL_WEIGHT:
1054         break;  // No action
1055       case CEED_EVAL_DIV:
1056         break; // Not implemented
1057       case CEED_EVAL_CURL:
1058         break; // Not implemented
1059       }
1060     }
1061     // Output pointers
1062     for (CeedInt i=0; i<numoutputfields; i++) {
1063       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1064       CeedChk(ierr);
1065       if (emode == CEED_EVAL_NONE) {
1066         ierr = CeedQFunctionFieldGetNumComponents(qfoutputfields[i], &ncomp);
1067         CeedChk(ierr);
1068         ierr = CeedVectorSetArray(data->qvecsout[i], CEED_MEM_HOST,
1069                                   CEED_USE_POINTER,
1070                                   &data->Edata[i + numinputfields][e*Q*ncomp]);
1071         CeedChk(ierr);
1072       }
1073       if (emode == CEED_EVAL_INTERP) {
1074       }
1075       if (emode == CEED_EVAL_GRAD) {
1076       }
1077       if (emode == CEED_EVAL_WEIGHT) {
1078       }
1079     }
1080 
1081     // Q function
1082     ierr = CeedQFunctionApply(qf, Q, data->qvecsin, data->qvecsout); CeedChk(ierr);
1083 
1084     // Output basis apply if needed
1085     for (CeedInt i=0; i<numoutputfields; i++) {
1086       // Get elemsize, emode, ncomp
1087       ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
1088       CeedChk(ierr);
1089       ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1090       CeedChk(ierr);
1091       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1092       CeedChk(ierr);
1093       ierr = CeedQFunctionFieldGetNumComponents(qfoutputfields[i], &ncomp);
1094       CeedChk(ierr);
1095       // Basis action
1096       switch(emode) {
1097       case CEED_EVAL_NONE:
1098         break; // No action
1099       case CEED_EVAL_INTERP:
1100         ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis);
1101         CeedChk(ierr);
1102         ierr = CeedVectorSetArray(data->evecsout[i], CEED_MEM_HOST,
1103                                   CEED_USE_POINTER,
1104                                   &data->Edata[i + numinputfields][e*elemsize*ncomp]);
1105         ierr = CeedBasisApply(basis, 1, CEED_TRANSPOSE,
1106                               CEED_EVAL_INTERP, data->qvecsout[i],
1107                               data->evecsout[i]); CeedChk(ierr);
1108         break;
1109       case CEED_EVAL_GRAD:
1110         ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis);
1111         CeedChk(ierr);
1112         ierr = CeedVectorSetArray(data->evecsout[i], CEED_MEM_HOST,
1113                                   CEED_USE_POINTER,
1114                                   &data->Edata[i + numinputfields][e*elemsize*ncomp]);
1115         ierr = CeedBasisApply(basis, 1, CEED_TRANSPOSE,
1116                               CEED_EVAL_GRAD, data->qvecsout[i],
1117                               data->evecsout[i]); CeedChk(ierr);
1118         break;
1119       case CEED_EVAL_WEIGHT: break; // Should not occur
1120       case CEED_EVAL_DIV: break; // Not implemented
1121       case CEED_EVAL_CURL: break; // Not implemented
1122       }
1123     }
1124   } // numelements
1125 
1126   // Zero lvecs
1127   for (CeedInt i=0; i<numoutputfields; i++) {
1128     ierr = CeedOperatorFieldGetVector(opoutputfields[i], &vec); CeedChk(ierr);
1129     if (vec == CEED_VECTOR_ACTIVE)
1130       vec = outvec;
1131     ierr = CeedVectorSetValue(vec, 0.0); CeedChk(ierr);
1132   }
1133 
1134   // Output restriction
1135   for (CeedInt i=0; i<numoutputfields; i++) {
1136     // Restore evec
1137     ierr = CeedVectorRestoreArray(data->Evecs[i+data->numein],
1138                                   &data->Edata[i + numinputfields]);
1139     CeedChk(ierr);
1140     // Get output vector
1141     ierr = CeedOperatorFieldGetVector(opoutputfields[i], &vec); CeedChk(ierr);
1142     // Active
1143     if (vec == CEED_VECTOR_ACTIVE)
1144       vec = outvec;
1145     // Restrict
1146     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
1147     CeedChk(ierr);
1148     ierr = CeedOperatorFieldGetLMode(opoutputfields[i], &lmode); CeedChk(ierr);
1149     ierr = CeedElemRestrictionApply(Erestrict, CEED_TRANSPOSE,
1150                                     lmode, data->Evecs[i+data->numein], vec,
1151                                     request); CeedChk(ierr);
1152   }
1153 
1154   // Restore input arrays
1155   for (CeedInt i=0; i<numinputfields; i++) {
1156     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1157     CeedChk(ierr);
1158     if (emode & CEED_EVAL_WEIGHT) {
1159     } else {
1160       // Restriction
1161       ierr = CeedVectorRestoreArrayRead(data->Evecs[i],
1162                                         (const CeedScalar **) &data->Edata[i]);
1163       CeedChk(ierr);
1164     }
1165   }
1166   return 0;
1167 }
1168 
1169 static int CeedOperatorCreate_Magma(CeedOperator op) {
1170   int ierr;
1171   CeedOperator_Magma *impl;
1172   Ceed ceed;
1173   ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
1174 
1175   ierr = CeedCalloc(1, &impl); CeedChk(ierr);
1176   ierr = CeedOperatorSetData(op, (void *)&impl);
1177 
1178   ierr = CeedSetBackendFunction(ceed, "Operator", op, "Apply",
1179                                 CeedOperatorApply_Magma); CeedChk(ierr);
1180   ierr = CeedSetBackendFunction(ceed, "Operator", op, "Destroy",
1181                                 CeedOperatorDestroy_Magma); CeedChk(ierr);
1182   return 0;
1183 }
1184 
1185 int CeedCompositeOperatorCreate_Magma(CeedOperator op) {
1186   int ierr;
1187   Ceed ceed;
1188   ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
1189   return CeedError(ceed, 1, "Backend does not support composite operators");
1190 }
1191 
1192 // *****************************************************************************
1193 // * INIT
1194 // *****************************************************************************
1195 static int CeedInit_Magma(const char *resource, Ceed ceed) {
1196   int ierr;
1197   if (strcmp(resource, "/gpu/magma"))
1198     return CeedError(ceed, 1, "MAGMA backend cannot use resource: %s", resource);
1199 
1200   ierr = magma_init();
1201   if (ierr) return CeedError(ceed, 1, "error in magma_init(): %d\n", ierr);
1202   //magma_print_environment();
1203 
1204   ceed->VectorCreate = CeedVectorCreate_Magma;
1205   ceed->BasisCreateTensorH1 = CeedBasisCreateTensorH1_Magma;
1206   ceed->BasisCreateH1 = CeedBasisCreateH1_Magma;
1207   ceed->ElemRestrictionCreate = CeedElemRestrictionCreate_Magma;
1208   ceed->ElemRestrictionCreateBlocked = CeedElemRestrictionCreateBlocked_Magma;
1209   ceed->QFunctionCreate = CeedQFunctionCreate_Magma;
1210   ceed->OperatorCreate = CeedOperatorCreate_Magma;
1211   ceed->CompositeOperatorCreate = CeedCompositeOperatorCreate_Magma;
1212   return 0;
1213 }
1214 
// *****************************************************************************
// * REGISTER
// *****************************************************************************
// Runs automatically at library load time (GCC/Clang constructor attribute)
// and registers this backend under the "/gpu/magma" resource with priority 20.
__attribute__((constructor))
static void Register(void) {
  CeedRegister("/gpu/magma", CeedInit_Magma, 20);
}
1222