xref: /libCEED/rust/libceed-sys/c-src/backends/magma/ceed-magma.c (revision a4b33194745f1db2c078785db78486bb2b14379e)
1 // Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at
2 // the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights
3 // reserved. See files LICENSE and NOTICE for details.
4 //
5 // This file is part of CEED, a collection of benchmarks, miniapps, software
6 // libraries and APIs for efficient high-order finite element and spectral
7 // element discretizations for exascale applications. For more information and
8 // source code availability see http://github.com/ceed.
9 //
10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11 // a collaborative effort of two U.S. Department of Energy organizations (Office
12 // of Science and the National Nuclear Security Administration) responsible for
13 // the planning and preparation of a capable exascale ecosystem, including
14 // software, applications, hardware, advanced system engineering and early
15 // testbed platforms, in support of the nation's exascale computing imperative.
16 
17 #include "ceed-magma.h"
18 #include <string.h>
19 
// Private data for a MAGMA CeedVector: a host buffer mirrored by a device
// buffer, with flags recording which of the two this backend must free.
typedef struct {
  CeedScalar *array;   // host copy of the data (pinned when owned); may be NULL
  CeedScalar *darray;  // device copy of the data; may be NULL
  int  own_;           // nonzero: we own both array (pinned) and darray
  int down_;           // nonzero: we own only darray (host array is the user's)
} CeedVector_Magma;
26 
// Private data for a MAGMA CeedElemRestriction: host/device index arrays.
typedef struct {
  CeedInt *indices;     // host copy of the element indices; may be NULL
  CeedInt *dindices;    // device copy of the element indices; may be NULL
  int  own_;            // nonzero: we own both indices (pinned) and dindices
  int down_;            // nonzero: we own only dindices (device-only ownership)
} CeedElemRestriction_Magma;
33 
// Private data for a MAGMA CeedOperator: work vectors and quadrature-space
// buffers allocated once during operator setup and reused on every apply.
typedef struct {
  CeedVector
  *evecs;   /// E-vectors needed to apply operator (input followed by outputs)
  CeedScalar **edata;   /// Per-field data pointers into the E-vectors
  CeedScalar **qdata; /// Inputs followed by outputs
  CeedScalar
  **qdata_alloc; /// Allocated quadrature data arrays (to be freed by us)
  CeedScalar **indata;  /// Pointer table handed to the QFunction as inputs
  CeedScalar **outdata; /// Pointer table handed to the QFunction as outputs
  CeedInt    numein;    /// Number of input E-vectors
  CeedInt    numeout;   /// Number of output E-vectors
  CeedInt    numqin;    /// Number of owned input quadrature arrays
  CeedInt    numqout;   /// Number of owned output quadrature arrays
} CeedOperator_Magma;
48 
49 // *****************************************************************************
50 // * Initialize vector vec (after free mem) with values from array based on cmode
51 // *   CEED_COPY_VALUES: memory is allocated in vec->array_allocated, made equal
52 // *                     to array, and data is copied (not store passed pointer)
53 // *   CEED_OWN_POINTER: vec->data->array_allocated and vec->data->array = array
54 // *   CEED_USE_POINTER: vec->data->array = array (can modify; no ownership)
55 // * mtype: CEED_MEM_HOST or CEED_MEM_DEVICE
56 // *****************************************************************************
57 static int CeedVectorSetArray_Magma(CeedVector vec, CeedMemType mtype,
58                                     CeedCopyMode cmode, CeedScalar *array) {
59   CeedVector_Magma *impl = vec->data;
60   int ierr;
61 
62   // If own data, free the "old" data, e.g., as it may be of different size
63   if (impl->own_) {
64     magma_free( impl->darray );
65     magma_free_pinned( impl->array );
66     impl->darray = NULL;
67     impl->array  = NULL;
68     impl->own_ = 0;
69     impl->down_= 0;
70   }
71 
72   if (mtype == CEED_MEM_HOST) {
73     // memory is on the host; own_ = 0
74     switch (cmode) {
75     case CEED_COPY_VALUES:
76       ierr = magma_malloc( (void**)&impl->darray,
77                            vec->length * sizeof(CeedScalar)); CeedChk(ierr);
78       ierr = magma_malloc_pinned( (void**)&impl->array,
79                                   vec->length * sizeof(CeedScalar)); CeedChk(ierr);
80       impl->own_ = 1;
81 
82       if (array != NULL)
83         magma_setvector(vec->length, sizeof(array[0]),
84                         array, 1, impl->darray, 1);
85       break;
86     case CEED_OWN_POINTER:
87       ierr = magma_malloc( (void**)&impl->darray,
88                            vec->length * sizeof(CeedScalar)); CeedChk(ierr);
89       // TODO: possible problem here is if we are passed non-pinned memory;
90       //       (as we own it, lter in destroy, we use free for pinned memory).
91       impl->array = array;
92       impl->own_ = 1;
93 
94       if (array != NULL)
95         magma_setvector(vec->length, sizeof(array[0]),
96                         array, 1, impl->darray, 1);
97       break;
98     case CEED_USE_POINTER:
99       ierr = magma_malloc( (void**)&impl->darray,
100                            vec->length * sizeof(CeedScalar)); CeedChk(ierr);
101       magma_setvector(vec->length, sizeof(array[0]),
102                       array, 1, impl->darray, 1);
103 
104       impl->down_  = 1;
105       impl->array  = array;
106     }
107   } else if (mtype == CEED_MEM_DEVICE) {
108     // memory is on the device; own = 0
109     switch (cmode) {
110     case CEED_COPY_VALUES:
111       ierr = magma_malloc( (void**)&impl->darray,
112                            vec->length * sizeof(CeedScalar)); CeedChk(ierr);
113       ierr = magma_malloc_pinned( (void**)&impl->array,
114                                   vec->length * sizeof(CeedScalar)); CeedChk(ierr);
115       impl->own_ = 1;
116 
117       if (array)
118         magma_copyvector(vec->length, sizeof(array[0]),
119                          array, 1, impl->darray, 1);
120       else
121         // t30 assumes allocation initializes with 0s
122         magma_setvector(vec->length, sizeof(array[0]),
123                         impl->array, 1, impl->darray, 1);
124       break;
125     case CEED_OWN_POINTER:
126       impl->darray = array;
127       ierr = magma_malloc_pinned( (void**)&impl->array,
128                                   vec->length * sizeof(CeedScalar)); CeedChk(ierr);
129       impl->own_ = 1;
130 
131       break;
132     case CEED_USE_POINTER:
133       impl->darray = array;
134       impl->array  = NULL;
135     }
136 
137   } else
138     return CeedError(vec->ceed, 1, "Only MemType = HOST or DEVICE supported");
139 
140   return 0;
141 }
142 
143 // *****************************************************************************
144 // * Give data pointer from vector vec to array (on HOST or DEVICE)
145 // *****************************************************************************
146 static int CeedVectorGetArray_Magma(CeedVector vec, CeedMemType mtype,
147                                     CeedScalar **array) {
148   CeedVector_Magma *impl = vec->data;
149   int ierr;
150 
151   if (mtype == CEED_MEM_HOST) {
152     if (impl->own_) {
153       // data is owned so GPU had the most up-to-date version; copy it
154       // TTT - apparantly it doesn't have most up to date data
155       magma_getvector(vec->length, sizeof(*array[0]),
156                       impl->darray, 1, impl->array, 1);
157       CeedDebug("\033[31m[CeedVectorGetArray_Magma]");
158       //fprintf(stderr,"rrrrrrrrrrrrrrr\n");
159     } else if (impl->array == NULL) {
160       // Vector doesn't own the data and was set on GPU
161       if (impl->darray == NULL) {
162         // call was made just to allocate memory
163         ierr = CeedVectorSetArray(vec, mtype, CEED_COPY_VALUES, NULL);
164         CeedChk(ierr);
165       } else
166         return CeedError(vec->ceed, 1, "Can not access DEVICE vector on HOST");
167     }
168     *array = impl->array;
169   } else if (mtype == CEED_MEM_DEVICE) {
170     if (impl->darray == NULL) {
171       // Vector doesn't own the data and was set on the CPU
172       if (impl->array == NULL) {
173         // call was made just to allocate memory
174         ierr = CeedVectorSetArray(vec, mtype, CEED_COPY_VALUES, NULL);
175         CeedChk(ierr);
176       } else
177         return CeedError(vec->ceed, 1, "Can not access HOST vector on DEVICE");
178     }
179     *array = impl->darray;
180   } else
181     return CeedError(vec->ceed, 1, "Can only provide to HOST or DEVICE memory");
182 
183   return 0;
184 }
185 
186 // *****************************************************************************
187 // * Give data pointer from vector vec to array (on HOST or DEVICE) to read it
188 // *****************************************************************************
189 static int CeedVectorGetArrayRead_Magma(CeedVector vec, CeedMemType mtype,
190                                         const CeedScalar **array) {
191   CeedVector_Magma *impl = vec->data;
192   int ierr;
193 
194   if (mtype == CEED_MEM_HOST) {
195     if (impl->own_) {
196       // data is owned so GPU had the most up-to-date version; copy it
197       magma_getvector(vec->length, sizeof(*array[0]),
198                       impl->darray, 1, impl->array, 1);
199     } else if (impl->array == NULL) {
200       // Vector doesn't own the data and was set on GPU
201       if (impl->darray == NULL) {
202         // call was made just to allocate memory
203         ierr = CeedVectorSetArray(vec, mtype, CEED_COPY_VALUES, NULL);
204         CeedChk(ierr);
205       } else
206         return CeedError(vec->ceed, 1, "Can not access DEVICE vector on HOST");
207     }
208     *array = impl->array;
209   } else if (mtype == CEED_MEM_DEVICE) {
210     if (impl->darray == NULL) {
211       // Vector doesn't own the data and was set on the CPU
212       if (impl->array == NULL) {
213         // call was made just to allocate memory
214         ierr = CeedVectorSetArray(vec, mtype, CEED_COPY_VALUES, NULL);
215         CeedChk(ierr);
216       } else
217         return CeedError(vec->ceed, 1, "Can not access HOST vector on DEVICE");
218     }
219     *array = impl->darray;
220   } else
221     return CeedError(vec->ceed, 1, "Can only provide to HOST or DEVICE memory");
222 
223   return 0;
224 }
225 
226 // *****************************************************************************
227 // * There is no mtype here for array so it is not clear if we restore from HOST
228 // * memory or from DEVICE memory. We assume that it is CPU memory because if
229 // * it was GPU memory we would not call this routine at all.
230 // * Restore vector vec with values from array, where array received its values
231 // * from vec and possibly modified them.
232 // *****************************************************************************
233 static int CeedVectorRestoreArray_Magma(CeedVector vec, CeedScalar **array) {
234   CeedVector_Magma *impl = vec->data;
235 
236   // Check if the array is a CPU pointer
237   if (*array == impl->array) {
238     // Update device, if the device pointer is not NULL
239     if (impl->darray != NULL) {
240       magma_setvector(vec->length, sizeof(*array[0]),
241                       *array, 1, impl->darray, 1);
242     } else {
243       // nothing to do (case of CPU use pointer)
244     }
245 
246   } else if (impl->down_) {
247     // nothing to do if array is on GPU, except if down_=1(case CPU use pointer)
248     magma_getvector(vec->length, sizeof(*array[0]),
249                     impl->darray, 1, impl->array, 1);
250   }
251 
252   *array = NULL;
253   return 0;
254 }
255 
256 // *****************************************************************************
257 // * There is no mtype here for array so it is not clear if we restore from HOST
258 // * memory or from DEVICE memory. We assume that it is CPU memory because if
259 // * it was GPU memory we would not call this routine at all.
260 // * Restore vector vec with values from array, where array received its values
261 // * from vec to only read them; in this case vec may have been modified meanwhile
262 // * and needs to be restored here.
263 // *****************************************************************************
264 static int CeedVectorRestoreArrayRead_Magma(CeedVector vec,
265     const CeedScalar **array) {
266   CeedVector_Magma *impl = vec->data;
267 
268   // Check if the array is a CPU pointer
269   if (*array == impl->array) {
270     // Update device, if the device pointer is not NULL
271     if (impl->darray != NULL) {
272       magma_setvector(vec->length, sizeof(*array[0]),
273                       *array, 1, impl->darray, 1);
274     } else {
275       // nothing to do (case of CPU use pointer)
276     }
277 
278   } else if (impl->down_) {
279     // nothing to do if array is on GPU, except if down_=1(case CPU use pointer)
280     magma_getvector(vec->length, sizeof(*array[0]),
281                     impl->darray, 1, impl->array, 1);
282   }
283 
284   *array = NULL;
285   return 0;
286 }
287 
288 static int CeedVectorDestroy_Magma(CeedVector vec) {
289   CeedVector_Magma *impl = vec->data;
290   int ierr;
291 
292   // Free if we own the data
293   if (impl->own_) {
294     ierr = magma_free_pinned(impl->array); CeedChk(ierr);
295     ierr = magma_free(impl->darray);       CeedChk(ierr);
296   } else if (impl->down_) {
297     ierr = magma_free(impl->darray);       CeedChk(ierr);
298   }
299   ierr = CeedFree(&vec->data); CeedChk(ierr);
300   return 0;
301 }
302 
303 // *****************************************************************************
304 // * Create vector vec of size n
305 // *****************************************************************************
306 static int CeedVectorCreate_Magma(Ceed ceed, CeedInt n, CeedVector vec) {
307   CeedVector_Magma *impl;
308   int ierr;
309 
310   vec->SetArray = CeedVectorSetArray_Magma;
311   vec->GetArray = CeedVectorGetArray_Magma;
312   vec->GetArrayRead = CeedVectorGetArrayRead_Magma;
313   vec->RestoreArray = CeedVectorRestoreArray_Magma;
314   vec->RestoreArrayRead = CeedVectorRestoreArrayRead_Magma;
315   vec->Destroy = CeedVectorDestroy_Magma;
316   ierr = CeedCalloc(1,&impl); CeedChk(ierr);
317   impl->darray = NULL;
318   impl->array  = NULL;
319   impl->own_ = 0;
320   impl->down_= 0;
321   vec->data = impl;
322   return 0;
323 }
324 
325 
326 // *****************************************************************************
327 // * Apply restriction operator r to u: v = r(rmode) u
328 // *****************************************************************************
329 static int CeedElemRestrictionApply_Magma(CeedElemRestriction r,
330     CeedTransposeMode tmode,
331     CeedTransposeMode lmode, CeedVector u,
332     CeedVector v, CeedRequest *request) {
333   CeedElemRestriction_Magma *impl = r->data;
334   int ierr;
335   const CeedScalar *uu;
336   CeedScalar *vv;
337   CeedInt nelem = r->nelem, elemsize = r->elemsize, ndof = r->ndof,
338           ncomp=r->ncomp;
339   CeedInt esize = nelem * elemsize;
340 
341 #ifdef USE_MAGMA_BATCH2
342   CeedInt *dindices = impl->dindices;
343   // Get pointers on the device
344   ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &uu); CeedChk(ierr);
345   ierr = CeedVectorGetArray(v, CEED_MEM_DEVICE, &vv); CeedChk(ierr);
346 #else
347   CeedInt *indices = impl->indices;
348   ierr = CeedVectorGetArrayRead(u, CEED_MEM_HOST, &uu); CeedChk(ierr);
349   ierr = CeedVectorGetArray(v, CEED_MEM_HOST, &vv); CeedChk(ierr);
350 #endif
351 
352   if (tmode == CEED_NOTRANSPOSE) {
353     // Perform: v = r * u
354     if (ncomp == 1) {
355 #ifdef USE_MAGMA_BATCH2
356 magma_template<<i=0:esize>>
357       (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices) {
358         vv[i] = uu[dindices[i]];
359       }
360 #else
361       for (CeedInt i=0; i<esize; i++) vv[i] = uu[indices[i]];
362 #endif
363     } else {
364       // vv is (elemsize x ncomp x nelem), column-major
365       if (lmode == CEED_NOTRANSPOSE) { // u is (ndof x ncomp), column-major
366 #ifdef USE_MAGMA_BATCH2
367 magma_template<<e=0:nelem, d=0:ncomp, i=0:elemsize>>
368         (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices, int ndof) {
369           vv[i + iend*(d+dend*e)] = uu[dindices[i+iend*e]+ndof*d];
370         }
371 #else
372         for (CeedInt e = 0; e < nelem; e++)
373           for (CeedInt d = 0; d < ncomp; d++)
374             for (CeedInt i=0; i < elemsize; i++) {
375               vv[i + elemsize*(d+ncomp*e)] =
376                 uu[indices[i+elemsize*e]+ndof*d];
377             }
378 #endif
379       } else { // u is (ncomp x ndof), column-major
380 #ifdef USE_MAGMA_BATCH2
381 magma_template<<e=0:nelem, d=0:ncomp, i=0:elemsize>>
382         (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices) {
383           vv[i + iend*(d+dend*e)] = uu[d+dend*dindices[i + iend*e]];
384         }
385 #else
386         for (CeedInt e = 0; e < nelem; e++)
387           for (CeedInt d = 0; d < ncomp; d++)
388             for (CeedInt i=0; i< elemsize; i++) {
389               vv[i + elemsize*(d+ncomp*e)] =
390                 uu[d+ncomp*indices[i+elemsize*e]];
391             }
392 #endif
393       }
394     }
395   } else {
396     // Note: in transpose mode, we perform: v += r^t * u
397     if (ncomp == 1) {
398       // fprintf(stderr,"3 ---------\n");
399 #ifdef USE_MAGMA_BATCH2
400 magma_template<<i=0:esize>>
401       (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices) {
402         magmablas_datomic_add( &vv[dindices[i]], uu[i]);
403       }
404 #else
405       for (CeedInt i=0; i<esize; i++) vv[indices[i]] += uu[i];
406 #endif
407     } else { // u is (elemsize x ncomp x nelem)
408       fprintf(stderr,"2 ---------\n");
409 
410       if (lmode == CEED_NOTRANSPOSE) { // vv is (ndof x ncomp), column-major
411 #ifdef USE_MAGMA_BATCH2
412 magma_template<<e=0:nelem, d=0:ncomp, i=0:elemsize>>
413         (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices, CeedInt ndof) {
414           magmablas_datomic_add( &vv[dindices[i+iend*e]+ndof*d],
415                                  uu[i+iend*(d+e*dend)]);
416         }
417 #else
418         for (CeedInt e = 0; e < nelem; e++)
419           for (CeedInt d = 0; d < ncomp; d++)
420             for (CeedInt i=0; i < elemsize; i++) {
421               vv[indices[i + elemsize*e]+ndof*d] +=
422                 uu[i + elemsize*(d+e*ncomp)];
423             }
424 #endif
425       } else { // vv is (ncomp x ndof), column-major
426 #ifdef USE_MAGMA_BATCH2
427 magma_template<<e=0:nelem, d=0:ncomp, i=0:elemsize>>
428         (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices) {
429           magmablas_datomic_add( &vv[d+dend*dindices[i + iend*e]],
430                                  uu[i+iend*(d+e*dend)]);
431         }
432 #else
433         for (CeedInt e = 0; e < nelem; e++)
434           for (CeedInt d = 0; d < ncomp; d++)
435             for (CeedInt i=0; i < elemsize; i++) {
436               vv[d+ncomp*indices[i + elemsize*e]] +=
437                 uu[i + elemsize*(d+e*ncomp)];
438             }
439 #endif
440       }
441     }
442   }
443 
444   ierr = CeedVectorRestoreArrayRead(u, &uu); CeedChk(ierr);
445   ierr = CeedVectorRestoreArray(v, &vv); CeedChk(ierr);
446 
447   if (request != CEED_REQUEST_IMMEDIATE && request != CEED_REQUEST_ORDERED)
448     *request = NULL;
449   return 0;
450 }
451 
452 static int CeedElemRestrictionDestroy_Magma(CeedElemRestriction r) {
453   CeedElemRestriction_Magma *impl = r->data;
454   int ierr;
455 
456   // Free if we own the data
457   if (impl->own_) {
458     ierr = magma_free_pinned(impl->indices); CeedChk(ierr);
459     ierr = magma_free(impl->dindices);       CeedChk(ierr);
460   } else if (impl->down_) {
461     ierr = magma_free(impl->dindices);       CeedChk(ierr);
462   }
463   ierr = CeedFree(&r->data); CeedChk(ierr);
464   return 0;
465 }
466 
467 static int CeedElemRestrictionCreate_Magma(CeedElemRestriction r,
468     CeedMemType mtype,
469     CeedCopyMode cmode,
470     const CeedInt *indices) {
471   int ierr, size = r->nelem*r->elemsize;
472   CeedElemRestriction_Magma *impl;
473 
474   // Allocate memory for the MAGMA Restricton and initializa pointers to NULL
475   ierr = CeedCalloc(1,&impl); CeedChk(ierr);
476   impl->dindices = NULL;
477   impl->indices  = NULL;
478   impl->own_ = 0;
479   impl->down_= 0;
480 
481   if (mtype == CEED_MEM_HOST) {
482     // memory is on the host; own_ = 0
483     switch (cmode) {
484     case CEED_COPY_VALUES:
485       ierr = magma_malloc( (void**)&impl->dindices,
486                            size * sizeof(CeedInt)); CeedChk(ierr);
487       ierr = magma_malloc_pinned( (void**)&impl->indices,
488                                   size * sizeof(CeedInt)); CeedChk(ierr);
489       impl->own_ = 1;
490 
491       if (indices != NULL) {
492         memcpy(impl->indices, indices, size * sizeof(indices[0]));
493         magma_setvector(size, sizeof(CeedInt),
494                         impl->indices, 1, impl->dindices, 1);
495       }
496       break;
497     case CEED_OWN_POINTER:
498       ierr = magma_malloc( (void**)&impl->dindices,
499                            size * sizeof(CeedInt)); CeedChk(ierr);
500       // TODO: possible problem here is if we are passed non-pinned memory;
501       //       (as we own it, lter in destroy, we use free for pinned memory).
502       impl->indices = (CeedInt *)indices;
503       impl->own_ = 1;
504 
505       if (indices != NULL)
506         magma_setvector(size, sizeof(CeedInt),
507                         indices, 1, impl->dindices, 1);
508       break;
509     case CEED_USE_POINTER:
510       ierr = magma_malloc( (void**)&impl->dindices,
511                            size * sizeof(CeedInt)); CeedChk(ierr);
512       magma_setvector(size, sizeof(CeedInt),
513                       indices, 1, impl->dindices, 1);
514       impl->down_ = 1;
515       impl->indices  = (CeedInt *)indices;
516     }
517   } else if (mtype == CEED_MEM_DEVICE) {
518     // memory is on the device; own = 0
519     switch (cmode) {
520     case CEED_COPY_VALUES:
521       ierr = magma_malloc( (void**)&impl->dindices,
522                            size * sizeof(CeedInt)); CeedChk(ierr);
523       ierr = magma_malloc_pinned( (void**)&impl->indices,
524                                   size * sizeof(CeedInt)); CeedChk(ierr);
525       impl->own_ = 1;
526 
527       if (indices)
528         magma_copyvector(size, sizeof(CeedInt),
529                          indices, 1, impl->dindices, 1);
530       break;
531     case CEED_OWN_POINTER:
532       impl->dindices = (CeedInt *)indices;
533       ierr = magma_malloc_pinned( (void**)&impl->indices,
534                                   size * sizeof(CeedInt)); CeedChk(ierr);
535       impl->own_ = 1;
536 
537       break;
538     case CEED_USE_POINTER:
539       impl->dindices = (CeedInt *)indices;
540       impl->indices  = NULL;
541     }
542 
543   } else
544     return CeedError(r->ceed, 1, "Only MemType = HOST or DEVICE supported");
545 
546   r->data    = impl;
547   r->Apply   = CeedElemRestrictionApply_Magma;
548   r->Destroy = CeedElemRestrictionDestroy_Magma;
549 
550   return 0;
551 }
552 
553 // Contracts on the middle index
554 // NOTRANSPOSE: V_ajc = T_jb U_abc
555 // TRANSPOSE:   V_ajc = T_bj U_abc
556 // If Add != 0, "=" is replaced by "+="
557 static int CeedTensorContract_Magma(Ceed ceed,
558                                     CeedInt A, CeedInt B, CeedInt C, CeedInt J,
559                                     const CeedScalar *t, CeedTransposeMode tmode,
560                                     const CeedInt Add,
561                                     const CeedScalar *u, CeedScalar *v) {
562 #ifdef USE_MAGMA_BATCH
563   magma_dtensor_contract(ceed, A, B, C, J, t, tmode, Add, u, v);
564 #else
565   CeedInt tstride0 = B, tstride1 = 1;
566   if (tmode == CEED_TRANSPOSE) {
567     tstride0 = 1; tstride1 = J;
568   }
569   CeedDebug("\033[31m[CeedTensorContract] A=%d, J=%d, C=%d, B=%d: %d %d %d",
570             A,J,C,B,A*J*B*C, C*J*A, C*B*A);
571   for (CeedInt a=0; a<A; a++) {
572     for (CeedInt j=0; j<J; j++) {
573       if (!Add) {
574         for (CeedInt c=0; c<C; c++)
575           v[(a*J+j)*C+c] = 0;
576       }
577       for (CeedInt b=0; b<B; b++) {
578         for (CeedInt c=0; c<C; c++) {
579           v[(a*J+j)*C+c] += t[j*tstride0 + b*tstride1] * u[(a*B+b)*C+c];
580         }
581       }
582     }
583   }
584 #endif
585   return 0;
586 }
587 
// *****************************************************************************
// * Apply the basis: interpolation, gradient, and/or quadrature weights for a
// * single element, built from dim successive 1D tensor contractions.
// * u/v layouts depend on tmode; in CEED_TRANSPOSE results accumulate into v.
// *****************************************************************************
static int CeedBasisApply_Magma(CeedBasis basis, CeedInt nelem,
                                CeedTransposeMode tmode, CeedEvalMode emode,
                                const CeedScalar *u, CeedScalar *v) {
  int ierr;
  const CeedInt dim = basis->dim;
  const CeedInt ncomp = basis->ncomp;
  const CeedInt nqpt = ncomp*CeedPowInt(basis->Q1d, dim);
  // Transpose mode accumulates (+=) on the last contraction
  const CeedInt add = (tmode == CEED_TRANSPOSE);

  if (nelem != 1)
    return CeedError(basis->ceed, 1,
                     "This backend does not support BasisApply for multiple elements");

  CeedDebug("\033[01m[CeedBasisApply_Magma] vsize=%d",
            ncomp*CeedPowInt(basis->P1d, dim));

  if (tmode == CEED_TRANSPOSE) {
    // Transpose mode accumulates into v, so clear it first
    const CeedInt vsize = ncomp*CeedPowInt(basis->P1d, dim);
    for (CeedInt i = 0; i < vsize; i++)
      v[i] = (CeedScalar) 0;
  }
  if (emode & CEED_EVAL_INTERP) {
    // P and Q swap roles in transpose mode (quadrature points -> nodes)
    CeedInt P = basis->P1d, Q = basis->Q1d;
    if (tmode == CEED_TRANSPOSE) {
      P = basis->Q1d; Q = basis->P1d;
    }
    CeedInt pre = ncomp*CeedPowInt(P, dim-1), post = 1;
    // Double buffer (ping-pong) for the intermediate contraction results
    CeedScalar tmp[2][ncomp*Q*CeedPowInt(P>Q?P:Q, dim-1)];
    CeedDebug("\033[01m[CeedBasisApply_Magma] tmpsize = %d",
              ncomp*Q*CeedPowInt(P>Q?P:Q, dim-1));
    for (CeedInt d=0; d<dim; d++) {
      // First pass reads u, last pass writes v; middle passes ping-pong tmp
      ierr = CeedTensorContract_Magma(basis->ceed, pre, P, post, Q, basis->interp1d,
                                      tmode, add&&(d==dim-1),
                                      d==0?u:tmp[d%2], d==dim-1?v:tmp[(d+1)%2]);
      CeedChk(ierr);
      pre /= P;
      post *= Q;
    }
    // Advance past this eval mode's segment of the output (or input)
    if (tmode == CEED_NOTRANSPOSE) {
      v += nqpt;
    } else {
      u += nqpt;
    }
  }
  if (emode & CEED_EVAL_GRAD) {
    CeedInt P = basis->P1d, Q = basis->Q1d;
    // In CEED_NOTRANSPOSE mode:
    // u is (P^dim x nc), column-major layout (nc = ncomp)
    // v is (Q^dim x nc x dim), column-major layout (nc = ncomp)
    // In CEED_TRANSPOSE mode, the sizes of u and v are switched.
    if (tmode == CEED_TRANSPOSE) {
      P = basis->Q1d, Q = basis->P1d;
    }
    CeedScalar tmp[2][ncomp*Q*CeedPowInt(P>Q?P:Q, dim-1)];
    CeedDebug("\033[01m[CeedBasisApply_Magma] tmpsize = %d",
              ncomp*Q*CeedPowInt(P>Q?P:Q, dim-1));
    // One pass per derivative direction p: use grad1d in dimension p and
    // interp1d in all others
    for (CeedInt p = 0; p < dim; p++) {
      CeedInt pre = ncomp*CeedPowInt(P, dim-1), post = 1;
      for (CeedInt d=0; d<dim; d++) {
        ierr = CeedTensorContract_Magma(basis->ceed, pre, P, post, Q,
                                        (p==d)?basis->grad1d:basis->interp1d,
                                        tmode, add&&(d==dim-1),
                                        d==0?u:tmp[d%2], d==dim-1?v:tmp[(d+1)%2]);
        CeedChk(ierr);
        pre /= P;
        post *= Q;
      }
      if (tmode == CEED_NOTRANSPOSE) {
        v += nqpt;
      } else {
        u += nqpt;
      }
    }
  }
  if (emode & CEED_EVAL_WEIGHT) {
    if (tmode == CEED_TRANSPOSE)
      return CeedError(basis->ceed, 1,
                       "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE");
    CeedInt Q = basis->Q1d;
    // Tensor product of the 1D quadrature weights, built in place in v
    for (CeedInt d=0; d<dim; d++) {
      CeedInt pre = CeedPowInt(Q, dim-d-1), post = CeedPowInt(Q, d);
      for (CeedInt i=0; i<pre; i++) {
        for (CeedInt j=0; j<Q; j++) {
          for (CeedInt k=0; k<post; k++) {
            // d == 0 seeds the product; later dims multiply into it
            v[(i*Q + j)*post + k] = basis->qweight1d[j]
                                    * (d == 0 ? 1 : v[(i*Q + j)*post + k]);
          }
        }
      }
    }
  }
  return 0;
}
681 
// Destroy basis: nothing to release — this backend keeps no basis-private data.
static int CeedBasisDestroy_Magma(CeedBasis basis) {
  return 0;
}
685 
// Create a tensor-product H1 basis: only installs the Apply/Destroy callbacks.
// NOTE(review): the 1D operator arguments (interp1d, grad1d, qref1d,
// qweight1d) are unused here — presumably stored on `basis` by the core
// library before this is called; confirm against the CEED interface layer.
static int CeedBasisCreateTensorH1_Magma(Ceed ceed, CeedInt dim, CeedInt P1d,
    CeedInt Q1d, const CeedScalar *interp1d,
    const CeedScalar *grad1d,
    const CeedScalar *qref1d,
    const CeedScalar *qweight1d,
    CeedBasis basis) {
  basis->Apply = CeedBasisApply_Magma;
  basis->Destroy = CeedBasisDestroy_Magma;
  return 0;
}
696 
// Apply the QFunction at Q quadrature points by forwarding directly to the
// user-provided callback with its context.
static int CeedQFunctionApply_Magma(CeedQFunction qf, CeedInt Q,
                                    const CeedScalar *const *u,
                                    CeedScalar *const *v) {
  int ierr;
  ierr = qf->function(qf->ctx, Q, u, v); CeedChk(ierr);
  return 0;
}
704 
// Destroy QFunction: nothing to release — no backend-private QFunction data.
static int CeedQFunctionDestroy_Magma(CeedQFunction qf) {
  return 0;
}
708 
// Create QFunction: install the MAGMA Apply/Destroy callbacks.
static int CeedQFunctionCreate_Magma(CeedQFunction qf) {
  qf->Apply = CeedQFunctionApply_Magma;
  qf->Destroy = CeedQFunctionDestroy_Magma;
  return 0;
}
714 
715 static int CeedOperatorDestroy_Magma(CeedOperator op) {
716   CeedOperator_Magma *impl = op->data;
717   int ierr;
718 
719   for (CeedInt i=0; i<impl->numein+impl->numeout; i++) {
720     ierr = CeedVectorDestroy(&impl->evecs[i]); CeedChk(ierr);
721   }
722   ierr = CeedFree(&impl->evecs); CeedChk(ierr);
723   ierr = CeedFree(&impl->edata); CeedChk(ierr);
724 
725   for (CeedInt i=0; i<impl->numqin+impl->numqout; i++) {
726     ierr = CeedFree(&impl->qdata_alloc[i]); CeedChk(ierr);
727   }
728   ierr = CeedFree(&impl->qdata_alloc); CeedChk(ierr);
729   ierr = CeedFree(&impl->qdata); CeedChk(ierr);
730 
731   ierr = CeedFree(&impl->indata); CeedChk(ierr);
732   ierr = CeedFree(&impl->outdata); CeedChk(ierr);
733 
734   ierr = CeedFree(&op->data); CeedChk(ierr);
735   return 0;
736 }
737 
738 
739 /*
740   Setup infields or outfields
741  */
742 static int CeedOperatorSetupFields_Magma(struct CeedQFunctionField qfields[16],
743                                        struct CeedOperatorField ofields[16],
744                                        CeedVector *evecs, CeedScalar **qdata,
745                                        CeedScalar **qdata_alloc, CeedScalar **indata,
746                                        CeedInt starti, CeedInt starte,
747                                        CeedInt startq, CeedInt numfields, CeedInt Q) {
748   CeedInt dim, ierr, ie=starte, iq=startq, ncomp;
749 
750   // Loop over fields
751   for (CeedInt i=0; i<numfields; i++) {
752     if (ofields[i].Erestrict != CEED_RESTRICTION_IDENTITY) {
753       ierr = CeedElemRestrictionCreateVector(ofields[i].Erestrict, NULL, &evecs[ie]);
754       CeedChk(ierr);
755       ie++;
756     }
757     CeedEvalMode emode = qfields[i].emode;
758     switch(emode) {
759     case CEED_EVAL_NONE:
760       break; // No action
761     case CEED_EVAL_INTERP:
762       ncomp = qfields[i].ncomp;
763       ierr = CeedMalloc(Q*ncomp, &qdata_alloc[iq]); CeedChk(ierr);
764       qdata[i + starti] = qdata_alloc[iq];
765       iq++;
766       break;
767     case CEED_EVAL_GRAD:
768       ncomp = qfields[i].ncomp;
769       dim = ofields[i].basis->dim;
770       ierr = CeedMalloc(Q*ncomp*dim, &qdata_alloc[iq]); CeedChk(ierr);
771       qdata[i + starti] = qdata_alloc[iq];
772       iq++;
773       break;
774     case CEED_EVAL_WEIGHT: // Only on input fields
775       ierr = CeedMalloc(Q, &qdata_alloc[iq]); CeedChk(ierr);
776       ierr = CeedBasisApply(ofields[iq].basis, 1, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT,
777                             NULL, qdata_alloc[iq]); CeedChk(ierr);
778       qdata[i] = qdata_alloc[iq];
779       indata[i] = qdata[i];
780       iq++;
781       break;
782     case CEED_EVAL_DIV:
783       break; // Not implimented
784     case CEED_EVAL_CURL:
785       break; // Not implimented
786     }
787   }
788   return 0;
789 }
790 
791 /*
792   CeedOperator needs to connect all the named fields (be they active or passive)
793   to the named inputs and outputs of its CeedQFunction.
794  */
795 static int CeedOperatorSetup_Magma(CeedOperator op) {
796   if (op->setupdone) return 0;
797   CeedOperator_Magma *opmagma = op->data;
798   CeedQFunction qf = op->qf;
799   CeedInt Q = op->numqpoints;
800   int ierr;
801 
802   // Count infield and outfield array sizes and evectors
803   for (CeedInt i=0; i<qf->numinputfields; i++) {
804     CeedEvalMode emode = qf->inputfields[i].emode;
805     opmagma->numqin += !!(emode & CEED_EVAL_INTERP) + !!(emode & CEED_EVAL_GRAD) + !!
806                      (emode & CEED_EVAL_WEIGHT);
807     opmagma->numein +=
808       (op->inputfields[i].Erestrict != CEED_RESTRICTION_IDENTITY); // Need E-vector when non-identity restriction exists
809   }
810   for (CeedInt i=0; i<qf->numoutputfields; i++) {
811     CeedEvalMode emode = qf->outputfields[i].emode;
812     opmagma->numqout += !!(emode & CEED_EVAL_INTERP) + !!(emode & CEED_EVAL_GRAD);
813     opmagma->numeout += (op->outputfields[i].Erestrict != CEED_RESTRICTION_IDENTITY);
814   }
815 
816   // Allocate
817   ierr = CeedCalloc(opmagma->numein + opmagma->numeout, &opmagma->evecs); CeedChk(ierr);
818   ierr = CeedCalloc(qf->numinputfields + qf->numoutputfields, &opmagma->edata);
819   CeedChk(ierr);
820 
821   ierr = CeedCalloc(opmagma->numqin + opmagma->numqout, &opmagma->qdata_alloc);
822   CeedChk(ierr);
823   ierr = CeedCalloc(qf->numinputfields + qf->numoutputfields, &opmagma->qdata);
824   CeedChk(ierr);
825 
826   ierr = CeedCalloc(16, &opmagma->indata); CeedChk(ierr);
827   ierr = CeedCalloc(16, &opmagma->outdata); CeedChk(ierr);
828 
829   // Set up infield and outfield pointer arrays
830   // Infields
831   ierr = CeedOperatorSetupFields_Magma(qf->inputfields, op->inputfields,
832                                      opmagma->evecs, opmagma->qdata, opmagma->qdata_alloc,
833                                      opmagma->indata, 0, 0, 0,
834                                      qf->numinputfields, Q); CeedChk(ierr);
835 
836   // Outfields
837   ierr = CeedOperatorSetupFields_Magma(qf->outputfields, op->outputfields,
838                                      opmagma->evecs, opmagma->qdata, opmagma->qdata_alloc,
839                                      opmagma->indata, qf->numinputfields, opmagma->numein,
840                                      opmagma->numqin, qf->numoutputfields, Q); CeedChk(ierr);
841 
842   // Output Qvecs
843   for (CeedInt i=0; i<qf->numoutputfields; i++) {
844     CeedEvalMode emode = qf->outputfields[i].emode;
845     if (emode != CEED_EVAL_NONE) {
846       opmagma->outdata[i] =  opmagma->qdata[i + qf->numinputfields];
847     }
848   }
849 
850   op->setupdone = 1;
851 
852   return 0;
853 }
854 
/*
  Apply the operator: gather inputs into E-vectors via element restrictions,
  evaluate the basis actions element by element, invoke the user QFunction at
  the quadrature points, then scatter results back through the transpose
  basis/restriction into the output vectors.

  NOTE(review): every Get/Restore array call below uses CEED_MEM_HOST, so
  this apply path executes on the host even though this is the MAGMA
  backend -- confirm whether a device-resident path is intended here.
 */
static int CeedOperatorApply_Magma(CeedOperator op, CeedVector invec,
                                 CeedVector outvec, CeedRequest *request) {
  CeedOperator_Magma *opmagma = op->data;
  CeedInt Q = op->numqpoints, elemsize;
  int ierr;
  CeedQFunction qf = op->qf;
  CeedTransposeMode lmode = CEED_NOTRANSPOSE;
  CeedScalar *vec_temp;

  // Setup (lazy; no-op after the first apply)
  ierr = CeedOperatorSetup_Magma(op); CeedChk(ierr);

  // Input Evecs and Restriction
  // iein counts only the fields that actually own an E-vector
  for (CeedInt i=0,iein=0; i<qf->numinputfields; i++) {
    // No Restriction
    if (op->inputfields[i].Erestrict == CEED_RESTRICTION_IDENTITY) {
      CeedEvalMode emode = qf->inputfields[i].emode;
      if (emode & CEED_EVAL_WEIGHT) {
        // Quadrature weights were precomputed during setup; nothing to read
      } else {
        // Active
        if (op->inputfields[i].vec == CEED_VECTOR_ACTIVE) {
          ierr = CeedVectorGetArrayRead(invec, CEED_MEM_HOST,
                                        (const CeedScalar **) &opmagma->edata[i]); CeedChk(ierr);
          // Passive
        } else {
          ierr = CeedVectorGetArrayRead(op->inputfields[i].vec, CEED_MEM_HOST,
                                        (const CeedScalar **) &opmagma->edata[i]); CeedChk(ierr);
        }
      }
    } else {
      // Restriction
      // Zero evec before scattering into it
      ierr = CeedVectorGetArray(opmagma->evecs[iein], CEED_MEM_HOST, &vec_temp);
      CeedChk(ierr);
      for (CeedInt j=0; j<opmagma->evecs[iein]->length; j++)
        vec_temp[j] = 0.;
      ierr = CeedVectorRestoreArray(opmagma->evecs[iein], &vec_temp); CeedChk(ierr);
      // Active
      if (op->inputfields[i].vec == CEED_VECTOR_ACTIVE) {
        // Restrict
        ierr = CeedElemRestrictionApply(op->inputfields[i].Erestrict, CEED_NOTRANSPOSE,
                                        lmode, invec, opmagma->evecs[iein],
                                        request); CeedChk(ierr);
        // Get evec
        ierr = CeedVectorGetArrayRead(opmagma->evecs[iein], CEED_MEM_HOST,
                                      (const CeedScalar **) &opmagma->edata[i]); CeedChk(ierr);
        iein++;
      } else {
        // Passive
        // Restrict
        ierr = CeedElemRestrictionApply(op->inputfields[i].Erestrict, CEED_NOTRANSPOSE,
                                        lmode, op->inputfields[i].vec, opmagma->evecs[iein],
                                        request); CeedChk(ierr);
        // Get evec
        ierr = CeedVectorGetArrayRead(opmagma->evecs[iein], CEED_MEM_HOST,
                                      (const CeedScalar **) &opmagma->edata[i]); CeedChk(ierr);
        iein++;
      }
    }
  }

  // Output Evecs
  // Output E-vectors are stored after the input ones, hence the numein offset
  for (CeedInt i=0,ieout=opmagma->numein; i<qf->numoutputfields; i++) {
    // No Restriction
    if (op->outputfields[i].Erestrict == CEED_RESTRICTION_IDENTITY) {
      // Active
      if (op->outputfields[i].vec == CEED_VECTOR_ACTIVE) {
        ierr = CeedVectorGetArray(outvec, CEED_MEM_HOST,
                                  &opmagma->edata[i + qf->numinputfields]); CeedChk(ierr);
      } else {
        // Passive
        ierr = CeedVectorGetArray(op->outputfields[i].vec, CEED_MEM_HOST,
                                  &opmagma->edata[i + qf->numinputfields]); CeedChk(ierr);
      }
    } else {
      // Restriction
      ierr = CeedVectorGetArray(opmagma->evecs[ieout], CEED_MEM_HOST,
                                &opmagma->edata[i + qf->numinputfields]); CeedChk(ierr);
      ieout++;
    }
  }

  // Loop through elements
  for (CeedInt e=0; e<op->numelements; e++) {
    // Input basis apply if needed
    for (CeedInt i=0; i<qf->numinputfields; i++) {
      // Get elemsize; identity restriction implies Q values per element
      if (op->inputfields[i].Erestrict != CEED_RESTRICTION_IDENTITY) {
        elemsize = op->inputfields[i].Erestrict->elemsize;
      } else {
        elemsize = Q;
      }
      // Get emode, ncomp
      CeedEvalMode emode = qf->inputfields[i].emode;
      CeedInt ncomp = qf->inputfields[i].ncomp;
      // Basis action
      switch(emode) {
      case CEED_EVAL_NONE:
        // No basis action: point directly into this element's E-data slice
        opmagma->indata[i] = &opmagma->edata[i][e*Q*ncomp];
        break;
      case CEED_EVAL_INTERP:
        ierr = CeedBasisApply(op->inputfields[i].basis, 1, CEED_NOTRANSPOSE,
                              CEED_EVAL_INTERP, &opmagma->edata[i][e*elemsize*ncomp], opmagma->qdata[i]);
        CeedChk(ierr);
        opmagma->indata[i] = opmagma->qdata[i];
        break;
      case CEED_EVAL_GRAD:
        ierr = CeedBasisApply(op->inputfields[i].basis, 1, CEED_NOTRANSPOSE,
                              CEED_EVAL_GRAD, &opmagma->edata[i][e*elemsize*ncomp], opmagma->qdata[i]);
        CeedChk(ierr);
        opmagma->indata[i] = opmagma->qdata[i];
        break;
      case CEED_EVAL_WEIGHT:
        break;  // No action; indata was pointed at the weights during setup
      case CEED_EVAL_DIV:
        break; // Not implemented
      case CEED_EVAL_CURL:
        break; // Not implemented
      }
    }
    // Output pointers
    for (CeedInt i=0; i<qf->numoutputfields; i++) {
      CeedEvalMode emode = qf->outputfields[i].emode;
      if (emode == CEED_EVAL_NONE) {
        // QFunction writes straight into this element's E-data slice
        CeedInt ncomp = qf->outputfields[i].ncomp;
        opmagma->outdata[i] = &opmagma->edata[i + qf->numinputfields][e*Q*ncomp];
      }
    }
    // Q function: user pointwise kernel at this element's quadrature points
    ierr = CeedQFunctionApply(op->qf, Q, (const CeedScalar * const*) opmagma->indata,
                              opmagma->outdata); CeedChk(ierr);

    // Output basis apply if needed (transpose: quadrature values -> dofs)
    for (CeedInt i=0; i<qf->numoutputfields; i++) {
      // Get elemsize
      if (op->outputfields[i].Erestrict != CEED_RESTRICTION_IDENTITY) {
        elemsize = op->outputfields[i].Erestrict->elemsize;
      } else {
        elemsize = Q;
      }
      // Get emode, ncomp
      CeedInt ncomp = qf->outputfields[i].ncomp;
      CeedEvalMode emode = qf->outputfields[i].emode;
      // Basis action
      switch(emode) {
      case CEED_EVAL_NONE:
        break; // No action
      case CEED_EVAL_INTERP:
        ierr = CeedBasisApply(op->outputfields[i].basis, 1, CEED_TRANSPOSE,
                              CEED_EVAL_INTERP, opmagma->outdata[i],
                              &opmagma->edata[i + qf->numinputfields][e*elemsize*ncomp]); CeedChk(ierr);
        break;
      case CEED_EVAL_GRAD:
        ierr = CeedBasisApply(op->outputfields[i].basis, 1, CEED_TRANSPOSE, CEED_EVAL_GRAD,
                              opmagma->outdata[i], &opmagma->edata[i + qf->numinputfields][e*elemsize*ncomp]);
        CeedChk(ierr);
        break;
      case CEED_EVAL_WEIGHT:
        break; // Should not occur (WEIGHT is input-only)
      case CEED_EVAL_DIV:
        break; // Not implemented
      case CEED_EVAL_CURL:
        break; // Not implemented
      }
    }
  }

  // Output restriction
  for (CeedInt i=0,ieout=opmagma->numein; i<qf->numoutputfields; i++) {
    // No Restriction
    if (op->outputfields[i].Erestrict == CEED_RESTRICTION_IDENTITY) {
      // Active
      if (op->outputfields[i].vec == CEED_VECTOR_ACTIVE) {
        ierr = CeedVectorRestoreArray(outvec, &opmagma->edata[i + qf->numinputfields]);
        CeedChk(ierr);
      } else {
        // Passive
        ierr = CeedVectorRestoreArray(op->outputfields[i].vec,
                                      &opmagma->edata[i + qf->numinputfields]); CeedChk(ierr);
      }
    } else {
      // Restriction
      // Active
      if (op->outputfields[i].vec == CEED_VECTOR_ACTIVE) {
        // Restore evec
        ierr = CeedVectorRestoreArray(opmagma->evecs[ieout],
                                      &opmagma->edata[i + qf->numinputfields]); CeedChk(ierr);
        // Zero lvec
        // NOTE(review): zeroing outvec here, inside the per-field loop, would
        // wipe contributions already accumulated by an earlier active
        // restricted output field -- harmless with a single active output,
        // but confirm intent for operators with several.
        ierr = CeedVectorGetArray(outvec, CEED_MEM_HOST, &vec_temp); CeedChk(ierr);
        for (CeedInt j=0; j<outvec->length; j++)
          vec_temp[j] = 0.;
        ierr = CeedVectorRestoreArray(outvec, &vec_temp); CeedChk(ierr);
        // Restrict (transpose: accumulate E-vector into the L-vector)
        ierr = CeedElemRestrictionApply(op->outputfields[i].Erestrict, CEED_TRANSPOSE,
                                        lmode, opmagma->evecs[ieout], outvec, request); CeedChk(ierr);
        ieout++;
      } else {
        // Passive
        // Restore evec
        ierr = CeedVectorRestoreArray(opmagma->evecs[ieout],
                                      &opmagma->edata[i + qf->numinputfields]); CeedChk(ierr);
        // Zero lvec
        ierr = CeedVectorGetArray(op->outputfields[i].vec, CEED_MEM_HOST, &vec_temp);
        CeedChk(ierr);
        for (CeedInt j=0; j<op->outputfields[i].vec->length; j++)
          vec_temp[j] = 0.;
        ierr = CeedVectorRestoreArray(op->outputfields[i].vec, &vec_temp);
        CeedChk(ierr);
        // Restrict
        ierr = CeedElemRestrictionApply(op->outputfields[i].Erestrict, CEED_TRANSPOSE,
                                        lmode, opmagma->evecs[ieout], op->outputfields[i].vec,
                                        request); CeedChk(ierr);
        ieout++;
      }
    }
  }

  // Restore input arrays
  for (CeedInt i=0,iein=0; i<qf->numinputfields; i++) {
    // No Restriction
    if (op->inputfields[i].Erestrict == CEED_RESTRICTION_IDENTITY) {
      CeedEvalMode emode = qf->inputfields[i].emode;
      if (emode & CEED_EVAL_WEIGHT) {
        // Nothing was acquired for weights above, so nothing to restore
      } else {
        // Active
        if (op->inputfields[i].vec == CEED_VECTOR_ACTIVE) {
          ierr = CeedVectorRestoreArrayRead(invec,
                                            (const CeedScalar **) &opmagma->edata[i]); CeedChk(ierr);
          // Passive
        } else {
          ierr = CeedVectorRestoreArrayRead(op->inputfields[i].vec,
                                            (const CeedScalar **) &opmagma->edata[i]); CeedChk(ierr);
        }
      }
    } else {
      // Restriction
      ierr = CeedVectorRestoreArrayRead(opmagma->evecs[iein],
                                        (const CeedScalar **) &opmagma->edata[i]); CeedChk(ierr);
      iein++;
    }
  }

  return 0;
}
1099 
1100 static int CeedOperatorCreate_Magma(CeedOperator op) {
1101   CeedOperator_Magma *impl;
1102   int ierr;
1103 
1104   ierr = CeedCalloc(1, &impl); CeedChk(ierr);
1105   op->data = impl;
1106   op->Destroy  = CeedOperatorDestroy_Magma;
1107   op->Apply    = CeedOperatorApply_Magma;
1108   return 0;
1109 }
1110 
1111 // *****************************************************************************
1112 // * INIT
1113 // *****************************************************************************
1114 static int CeedInit_Magma(const char *resource, Ceed ceed) {
1115   int ierr;
1116   if (strcmp(resource, "/gpu/magma"))
1117     return CeedError(ceed, 1, "MAGMA backend cannot use resource: %s", resource);
1118 
1119   ierr = magma_init();
1120   if (ierr) return CeedError(ceed, 1, "error in magma_init(): %d\n", ierr);
1121   //magma_print_environment();
1122 
1123   ceed->VecCreate = CeedVectorCreate_Magma;
1124   ceed->BasisCreateTensorH1 = CeedBasisCreateTensorH1_Magma;
1125   ceed->ElemRestrictionCreate = CeedElemRestrictionCreate_Magma;
1126   ceed->QFunctionCreate = CeedQFunctionCreate_Magma;
1127   ceed->OperatorCreate = CeedOperatorCreate_Magma;
1128   return 0;
1129 }
1130 
1131 // *****************************************************************************
1132 // * REGISTER
1133 // *****************************************************************************
// Self-registration of the MAGMA backend under the "/gpu/magma" resource.
// The constructor attribute makes this run at library load time, before
// main().  The final argument (20) is the priority passed to CeedRegister
// (NOTE(review): priority semantics are defined by CeedRegister, not visible
// here -- confirm against its documentation).
__attribute__((constructor))
static void Register(void) {
  CeedRegister("/gpu/magma", CeedInit_Magma, 20);
}
1138