xref: /libCEED/rust/libceed-sys/c-src/backends/magma/ceed-magma.c (revision 5a9ca9ad5fd061dce90f8686460d79eaf35523f3)
1 // Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at
2 // the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights
3 // reserved. See files LICENSE and NOTICE for details.
4 //
5 // This file is part of CEED, a collection of benchmarks, miniapps, software
6 // libraries and APIs for efficient high-order finite element and spectral
7 // element discretizations for exascale applications. For more information and
8 // source code availability see http://github.com/ceed.
9 //
10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11 // a collaborative effort of two U.S. Department of Energy organizations (Office
12 // of Science and the National Nuclear Security Administration) responsible for
13 // the planning and preparation of a capable exascale ecosystem, including
14 // software, applications, hardware, advanced system engineering and early
15 // testbed platforms, in support of the nation's exascale computing imperative.
16 
17 #include "ceed-magma.h"
18 #include <string.h>
19 
// Backend data for a CeedVector: a host copy and a device copy of the data,
// plus ownership flags that control what Destroy/SetArray must free
typedef struct {
  CeedScalar *array;   // host copy of the data (pinned when own_ is set)
  CeedScalar *darray;  // device copy of the data
  int  own_;           // nonzero: we own both array (pinned) and darray
  int down_;            // cover a case where we own Device memory only (darray)
} CeedVector_Magma;
26 
// Backend data for a CeedElemRestriction: host and device copies of the
// element index list, with the same ownership flags as CeedVector_Magma
typedef struct {
  CeedInt *indices;    // host copy of the element indices (pinned when owned)
  CeedInt *dindices;   // device copy of the element indices
  int  own_;           // nonzero: we own both indices (pinned) and dindices
  int down_;            // cover a case where we own Device memory only (dindices)
} CeedElemRestriction_Magma;
33 
// Backend data for a CeedOperator
typedef struct {
  CeedVector etmp;   // E-vector work buffer, created lazily in Apply
  CeedVector qdata;  // quadrature data vector, created lazily in GetQData
} CeedOperator_Magma;
38 
39 // *****************************************************************************
40 // * Initialize vector vec (after free mem) with values from array based on cmode
41 // *   CEED_COPY_VALUES: memory is allocated in vec->array_allocated, made equal
// *                     to array, and data is copied (the passed pointer is not stored)
43 // *   CEED_OWN_POINTER: vec->data->array_allocated and vec->data->array = array
44 // *   CEED_USE_POINTER: vec->data->array = array (can modify; no ownership)
45 // * mtype: CEED_MEM_HOST or CEED_MEM_DEVICE
46 // *****************************************************************************
47 static int CeedVectorSetArray_Magma(CeedVector vec, CeedMemType mtype,
48                                     CeedCopyMode cmode, CeedScalar *array) {
49   CeedVector_Magma *impl = vec->data;
50   int ierr;
51 
52   // If own data, free the "old" data, e.g., as it may be of different size
53   if (impl->own_) {
54     magma_free( impl->darray );
55     magma_free_pinned( impl->array );
56     impl->darray = NULL;
57     impl->array  = NULL;
58     impl->own_ = 0;
59     impl->down_= 0;
60   }
61 
62   if (mtype == CEED_MEM_HOST) {
63     // memory is on the host; own_ = 0
64     switch (cmode) {
65     case CEED_COPY_VALUES:
66       ierr = magma_malloc( (void**)&impl->darray,
67                            vec->length * sizeof(CeedScalar)); CeedChk(ierr);
68       ierr = magma_malloc_pinned( (void**)&impl->array,
69                                   vec->length * sizeof(CeedScalar)); CeedChk(ierr);
70       impl->own_ = 1;
71 
72       if (array != NULL)
73         magma_setvector(vec->length, sizeof(array[0]),
74                         array, 1, impl->darray, 1);
75       break;
76     case CEED_OWN_POINTER:
77       ierr = magma_malloc( (void**)&impl->darray,
78                            vec->length * sizeof(CeedScalar)); CeedChk(ierr);
79       // TODO: possible problem here is if we are passed non-pinned memory;
80       //       (as we own it, lter in destroy, we use free for pinned memory).
81       impl->array = array;
82       impl->own_ = 1;
83 
84       if (array != NULL)
85         magma_setvector(vec->length, sizeof(array[0]),
86                         array, 1, impl->darray, 1);
87       break;
88     case CEED_USE_POINTER:
89       ierr = magma_malloc( (void**)&impl->darray,
90                            vec->length * sizeof(CeedScalar)); CeedChk(ierr);
91       magma_setvector(vec->length, sizeof(array[0]),
92                       array, 1, impl->darray, 1);
93       impl->array  = array;
94     }
95   } else if (mtype == CEED_MEM_DEVICE) {
96     // memory is on the device; own = 0
97     switch (cmode) {
98     case CEED_COPY_VALUES:
99       ierr = magma_malloc( (void**)&impl->darray,
100                            vec->length * sizeof(CeedScalar)); CeedChk(ierr);
101       ierr = magma_malloc_pinned( (void**)&impl->array,
102                                   vec->length * sizeof(CeedScalar)); CeedChk(ierr);
103       impl->own_ = 1;
104 
105       if (array)
106         magma_copyvector(vec->length, sizeof(array[0]),
107                          array, 1, impl->darray, 1);
108       break;
109     case CEED_OWN_POINTER:
110       impl->darray = array;
111       ierr = magma_malloc_pinned( (void**)&impl->array,
112                                   vec->length * sizeof(CeedScalar)); CeedChk(ierr);
113       impl->own_ = 1;
114 
115       break;
116     case CEED_USE_POINTER:
117       impl->darray = array;
118       impl->array  = NULL;
119     }
120 
121   } else
122     return CeedError(vec->ceed, 1, "Only MemType = HOST or DEVICE supported");
123 
124   return 0;
125 }
126 
127 // *****************************************************************************
128 // * Give data pointer from vector vec to array (on HOST or DEVICE)
129 // *****************************************************************************
130 static int CeedVectorGetArray_Magma(CeedVector vec, CeedMemType mtype,
131                                     CeedScalar **array) {
132   CeedVector_Magma *impl = vec->data;
133   int ierr;
134 
135   if (mtype == CEED_MEM_HOST) {
136     if (impl->own_) {
137       // data is owned so GPU had the most up-to-date version; copy it
138       magma_getvector(vec->length, sizeof(*array[0]),
139                       impl->darray, 1, impl->array, 1);
140     } else if (impl->array == NULL) {
141       // Vector doesn't own the data and was set on GPU
142       if (impl->darray == NULL) {
143         // call was made just to allocate memory
144         ierr = CeedVectorSetArray(vec, mtype, CEED_COPY_VALUES, NULL);
145         CeedChk(ierr);
146       } else
147         return CeedError(vec->ceed, 1, "Can not access DEVICE vector on HOST");
148     }
149     *array = impl->array;
150   } else if (mtype == CEED_MEM_DEVICE) {
151     if (impl->darray == NULL) {
152       // Vector doesn't own the data and was set on the CPU
153       if (impl->array == NULL) {
154         // call was made just to allocate memory
155         ierr = CeedVectorSetArray(vec, mtype, CEED_COPY_VALUES, NULL);
156         CeedChk(ierr);
157       } else
158         return CeedError(vec->ceed, 1, "Can not access HOST vector on DEVICE");
159     }
160     *array = impl->darray;
161   } else
162     return CeedError(vec->ceed, 1, "Can only provide to HOST or DEVICE memory");
163 
164   return 0;
165 }
166 
167 // *****************************************************************************
168 // * Give data pointer from vector vec to array (on HOST or DEVICE) to read it
169 // *****************************************************************************
static int CeedVectorGetArrayRead_Magma(CeedVector vec, CeedMemType mtype,
                                        const CeedScalar **array) {
  CeedVector_Magma *impl = vec->data;
  int ierr;

  if (mtype == CEED_MEM_HOST) {
    if (impl->own_) {
      // data is owned so GPU had the most up-to-date version; copy it down
      magma_getvector(vec->length, sizeof(*array[0]),
                      impl->darray, 1, impl->array, 1);
    } else if (impl->array == NULL) {
      // Vector doesn't own the data and was set on GPU
      if (impl->darray == NULL) {
        // neither pointer is set: the call was made just to allocate memory
        ierr = CeedVectorSetArray(vec, mtype, CEED_COPY_VALUES, NULL);
        CeedChk(ierr);
      } else
        return CeedError(vec->ceed, 1, "Can not access DEVICE vector on HOST");
    }
    *array = impl->array;
  } else if (mtype == CEED_MEM_DEVICE) {
    if (impl->darray == NULL) {
      // Vector doesn't own the data and was set on the CPU
      if (impl->array == NULL) {
        // neither pointer is set: the call was made just to allocate memory
        ierr = CeedVectorSetArray(vec, mtype, CEED_COPY_VALUES, NULL);
        CeedChk(ierr);
      } else
        return CeedError(vec->ceed, 1, "Can not access HOST vector on DEVICE");
    }
    *array = impl->darray;
  } else
    return CeedError(vec->ceed, 1, "Can only provide to HOST or DEVICE memory");

  return 0;
}
206 
207 // *****************************************************************************
208 // * There is no mtype here for array so it is not clear if we restore from HOST
209 // * memory or from DEVICE memory. We assume that it is CPU memory because if
210 // * it was GPU memory we would not call this routine at all.
211 // * Restore vector vec with values from array, where array received its values
212 // * from vec and possibly modified them.
213 // *****************************************************************************
static int CeedVectorRestoreArray_Magma(CeedVector vec, CeedScalar **array) {
  CeedVector_Magma *impl = vec->data;

  // Check whether the returned pointer is the host-side array
  if (*array == impl->array) {
    // Host values may have been modified; push them up to the device copy,
    // if a device copy exists
    if (impl->darray != NULL) {
      magma_setvector(vec->length, sizeof(*array[0]),
                      *array, 1, impl->darray, 1);
    } else {
      // nothing to do (CPU use-pointer case with no device mirror)
    }

  } else if (impl->down_) {
    // The pointer was the device array and we mirror a user-provided host
    // pointer (down_ == 1): bring the device values back down to the host
    magma_getvector(vec->length, sizeof(*array[0]),
                    impl->darray, 1, impl->array, 1);
  }

  *array = NULL;
  return 0;
}
236 
237 // *****************************************************************************
238 // * There is no mtype here for array so it is not clear if we restore from HOST
239 // * memory or from DEVICE memory. We assume that it is CPU memory because if
240 // * it was GPU memory we would not call this routine at all.
241 // * Restore vector vec with values from array, where array received its values
242 // * from vec to only read them; in this case vec may have been modified meanwhile
243 // * and needs to be restored here.
244 // *****************************************************************************
245 static int CeedVectorRestoreArrayRead_Magma(CeedVector vec,
246     const CeedScalar **array) {
247   CeedVector_Magma *impl = vec->data;
248 
249   // Check if the array is a CPU pointer
250   if (*array == impl->array) {
251     // Update device, if the device pointer is not NULL
252     if (impl->darray != NULL) {
253       magma_setvector(vec->length, sizeof(*array[0]),
254                       *array, 1, impl->darray, 1);
255     } else {
256       // nothing to do (case of CPU use pointer)
257     }
258 
259   } else if (impl->down_) {
260     // nothing to do if array is on GPU, except if down_=1(case CPU use pointer)
261     magma_getvector(vec->length, sizeof(*array[0]),
262                     impl->darray, 1, impl->array, 1);
263   }
264 
265   *array = NULL;
266   return 0;
267 }
268 
269 static int CeedVectorDestroy_Magma(CeedVector vec) {
270   CeedVector_Magma *impl = vec->data;
271   int ierr;
272 
273   // Free if we own the data
274   if (impl->own_) {
275     ierr = magma_free_pinned(impl->array); CeedChk(ierr);
276     ierr = magma_free(impl->darray);       CeedChk(ierr);
277   } else if (impl->down_) {
278     ierr = magma_free(impl->darray);       CeedChk(ierr);
279   }
280   ierr = CeedFree(&vec->data); CeedChk(ierr);
281   return 0;
282 }
283 
284 // *****************************************************************************
285 // * Create vector vec of size n
286 // *****************************************************************************
287 static int CeedVectorCreate_Magma(Ceed ceed, CeedInt n, CeedVector vec) {
288   CeedVector_Magma *impl;
289   int ierr;
290 
291   vec->SetArray = CeedVectorSetArray_Magma;
292   vec->GetArray = CeedVectorGetArray_Magma;
293   vec->GetArrayRead = CeedVectorGetArrayRead_Magma;
294   vec->RestoreArray = CeedVectorRestoreArray_Magma;
295   vec->RestoreArrayRead = CeedVectorRestoreArrayRead_Magma;
296   vec->Destroy = CeedVectorDestroy_Magma;
297   ierr = CeedCalloc(1,&impl); CeedChk(ierr);
298   impl->darray = NULL;
299   impl->array  = NULL;
300   impl->own_ = 0;
301   impl->down_= 0;
302   vec->data = impl;
303   return 0;
304 }
305 
306 // *****************************************************************************
307 // * Apply restriction operator r to u: v = r(rmode) u
308 // *****************************************************************************
309 static int CeedElemRestrictionApply_Magma(CeedElemRestriction r,
310     CeedTransposeMode tmode, CeedInt ncomp,
311     CeedTransposeMode lmode, CeedVector u,
312     CeedVector v, CeedRequest *request) {
313   CeedElemRestriction_Magma *impl = r->data;
314   int ierr;
315   const CeedScalar *uu;
316   CeedScalar *vv;
317   CeedInt nelem = r->nelem, elemsize = r->elemsize, ndof = r->ndof;
318   CeedInt esize = nelem * elemsize;
319   // CeedInt *indices = impl->indices;
320   CeedInt *dindices = impl->dindices;
321 
322   //ierr = CeedVectorGetArrayRead(u, CEED_MEM_HOST, &uu); CeedChk(ierr);
323   //ierr = CeedVectorGetArray(v, CEED_MEM_HOST, &vv); CeedChk(ierr);
324 
325   // Get pointers on the device
326   ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &uu); CeedChk(ierr);
327   ierr = CeedVectorGetArray(v, CEED_MEM_DEVICE, &vv); CeedChk(ierr);
328 
329   if (tmode == CEED_NOTRANSPOSE) {
330     // Perform: v = r * u
331     if (ncomp == 1) {
332       //for (CeedInt i=0; i<esize; i++) vv[i] = uu[indices[i]];
333 
334 magma_template<<i=0:esize>>
335       (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices) {
336         vv[i] = uu[dindices[i]];
337       }
338     } else {
339       // vv is (elemsize x ncomp x nelem), column-major
340       if (lmode == CEED_NOTRANSPOSE) { // u is (ndof x ncomp), column-major
341         /*
342         for (CeedInt e = 0; e < nelem; e++)
343             for (CeedInt d = 0; d < ncomp; d++)
344                 for (CeedInt i=0; i < elemsize; i++) {
345                     vv[i + elemsize*(d+ncomp*e)] =
346                         uu[indices[i+elemsize*e]+ndof*d];
347                 }
348         */
349 magma_template<<e=0:nelem, d=0:ncomp, i=0:elemsize>>
350         (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices, int ndof) {
351           vv[i + iend*(d+dend*e)] = uu[dindices[i+iend*e]+ndof*d];
352         }
353       } else { // u is (ncomp x ndof), column-major
354         /*
355         for (CeedInt e = 0; e < nelem; e++)
356             for (CeedInt d = 0; d < ncomp; d++)
357                 for (CeedInt i=0; i< elemsize; i++) {
358                     vv[i + elemsize*(d+ncomp*e)] =
359                         uu[d+ncomp*indices[i+elemsize*e]];
360                 }
361         */
362 magma_template<<e=0:nelem, d=0:ncomp, i=0:elemsize>>
363         (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices) {
364           vv[i + iend*(d+dend*e)] = uu[d+dend*dindices[i + iend*e]];
365         }
366       }
367     }
368   } else {
369     // Note: in transpose mode, we perform: v += r^t * u
370     if (ncomp == 1) {
371       // fprintf(stderr,"3 ---------\n");
372       // for (CeedInt i=0; i<esize; i++) vv[indices[i]] += uu[i];
373 magma_template<<i=0:esize>>
374       (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices) {
375         magmablas_datomic_add( &vv[dindices[i]], uu[i]);
376       }
377     } else { // u is (elemsize x ncomp x nelem)
378       fprintf(stderr,"2 ---------\n");
379 
380       if (lmode == CEED_NOTRANSPOSE) { // vv is (ndof x ncomp), column-major
381         /*
382         for (CeedInt e = 0; e < nelem; e++)
383             for (CeedInt d = 0; d < ncomp; d++)
384                 for (CeedInt i=0; i < elemsize; i++) {
385                     vv[indices[i + elemsize*e]+ndof*d] +=
386                         uu[i + elemsize*(d+e*ncomp)];
387                 }
388         */
389 magma_template<<e=0:nelem, d=0:ncomp, i=0:elemsize>>
390         (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices, CeedInt ndof) {
391           magmablas_datomic_add( &vv[dindices[i+iend*e]+ndof*d],
392                                  uu[i+iend*(d+e*dend)]);
393         }
394       } else { // vv is (ncomp x ndof), column-major
395         /*
396         for (CeedInt e = 0; e < nelem; e++)
397             for (CeedInt d = 0; d < ncomp; d++)
398                 for (CeedInt i=0; i < elemsize; i++) {
399                     vv[d+ncomp*indices[i + elemsize*e]] +=
400                         uu[i + elemsize*(d+e*ncomp)];
401                 }
402         */
403 magma_template<<e=0:nelem, d=0:ncomp, i=0:elemsize>>
404         (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices) {
405           magmablas_datomic_add( &vv[d+dend*dindices[i + iend*e]],
406                                  uu[i+iend*(d+e*dend)]);
407         }
408       }
409     }
410   }
411   ierr = CeedVectorRestoreArrayRead(u, &uu); CeedChk(ierr);
412   ierr = CeedVectorRestoreArray(v, &vv); CeedChk(ierr);
413 
414   if (request != CEED_REQUEST_IMMEDIATE && request != CEED_REQUEST_ORDERED)
415     *request = NULL;
416   return 0;
417 }
418 
419 static int CeedElemRestrictionDestroy_Magma(CeedElemRestriction r) {
420   CeedElemRestriction_Magma *impl = r->data;
421   int ierr;
422 
423   // Free if we own the data
424   if (impl->own_) {
425     ierr = magma_free_pinned(impl->indices); CeedChk(ierr);
426     ierr = magma_free(impl->dindices);       CeedChk(ierr);
427   } else if (impl->down_) {
428     ierr = magma_free(impl->dindices);       CeedChk(ierr);
429   }
430   ierr = CeedFree(&r->data); CeedChk(ierr);
431   return 0;
432 }
433 
// Create a MAGMA element restriction; the index list is always mirrored on
// the device (dindices) so that Apply can run there, with ownership recorded
// per copy mode (see CeedVectorSetArray_Magma for the analogous vector logic)
static int CeedElemRestrictionCreate_Magma(CeedElemRestriction r,
    CeedMemType mtype,
    CeedCopyMode cmode,
    const CeedInt *indices) {
  int ierr, size = r->nelem*r->elemsize;
  CeedElemRestriction_Magma *impl;

  // Allocate memory for the MAGMA Restriction and initialize pointers to NULL
  ierr = CeedCalloc(1,&impl); CeedChk(ierr);
  impl->dindices = NULL;
  impl->indices  = NULL;
  impl->own_ = 0;
  impl->down_= 0;

  if (mtype == CEED_MEM_HOST) {
    switch (cmode) {
    case CEED_COPY_VALUES:
      // Allocate device indices plus a pinned host copy; upload the values
      ierr = magma_malloc( (void**)&impl->dindices,
                           size * sizeof(CeedInt)); CeedChk(ierr);
      ierr = magma_malloc_pinned( (void**)&impl->indices,
                                  size * sizeof(CeedInt)); CeedChk(ierr);
      impl->own_ = 1;

      if (indices != NULL)
        magma_setvector(size, sizeof(CeedInt),
                        indices, 1, impl->dindices, 1);
      break;
    case CEED_OWN_POINTER:
      // Take ownership of the host pointer; allocate a device copy
      ierr = magma_malloc( (void**)&impl->dindices,
                           size * sizeof(CeedInt)); CeedChk(ierr);
      // TODO: possible problem here is if we are passed non-pinned memory;
      //       (as we own it, later in destroy, we use free for pinned memory).
      impl->indices = (CeedInt *)indices;
      impl->own_ = 1;

      if (indices != NULL)
        magma_setvector(size, sizeof(CeedInt),
                        indices, 1, impl->dindices, 1);
      break;
    case CEED_USE_POINTER:
      // Borrow the host pointer; we own only the device copy (down_ = 1)
      ierr = magma_malloc( (void**)&impl->dindices,
                           size * sizeof(CeedInt)); CeedChk(ierr);
      magma_setvector(size, sizeof(CeedInt),
                      indices, 1, impl->dindices, 1);
      impl->down_ = 1;
      impl->indices  = (CeedInt *)indices;
    }
  } else if (mtype == CEED_MEM_DEVICE) {
    switch (cmode) {
    case CEED_COPY_VALUES:
      // Allocate both copies; copy the indices device-to-device
      ierr = magma_malloc( (void**)&impl->dindices,
                           size * sizeof(CeedInt)); CeedChk(ierr);
      ierr = magma_malloc_pinned( (void**)&impl->indices,
                                  size * sizeof(CeedInt)); CeedChk(ierr);
      impl->own_ = 1;

      if (indices)
        magma_copyvector(size, sizeof(CeedInt),
                         indices, 1, impl->dindices, 1);
      break;
    case CEED_OWN_POINTER:
      // Take ownership of the device pointer; allocate a pinned host mirror
      impl->dindices = (CeedInt *)indices;
      ierr = magma_malloc_pinned( (void**)&impl->indices,
                                  size * sizeof(CeedInt)); CeedChk(ierr);
      impl->own_ = 1;

      break;
    case CEED_USE_POINTER:
      // Borrow the device pointer; no host mirror is created
      impl->dindices = (CeedInt *)indices;
      impl->indices  = NULL;
    }

  } else
    return CeedError(r->ceed, 1, "Only MemType = HOST or DEVICE supported");

  r->data    = impl;
  r->Apply   = CeedElemRestrictionApply_Magma;
  r->Destroy = CeedElemRestrictionDestroy_Magma;

  return 0;
}
517 
518 // Contracts on the middle index
519 // NOTRANSPOSE: V_ajc = T_jb U_abc
520 // TRANSPOSE:   V_ajc = T_bj U_abc
521 // If Add != 0, "=" is replaced by "+="
522 static int CeedTensorContract_Magma(Ceed ceed,
523                                     CeedInt A, CeedInt B, CeedInt C, CeedInt J,
524                                     const CeedScalar *t, CeedTransposeMode tmode,
525                                     const CeedInt Add,
526                                     const CeedScalar *u, CeedScalar *v) {
527 #ifdef USE_MAGMA_BATCH
528   magma_dtensor_contract(ceed, A, B, C, J, t, tmode, Add, u, v);
529 #else
530   CeedInt tstride0 = B, tstride1 = 1;
531   if (tmode == CEED_TRANSPOSE) {
532     tstride0 = 1; tstride1 = J;
533   }
534   CeedDebug("\033[31m[CeedTensorContract] A=%d, J=%d, C=%d, B=%d: %d %d %d",
535             A,J,C,B,A*J*B*C, C*J*A, C*B*A);
536   for (CeedInt a=0; a<A; a++) {
537     for (CeedInt j=0; j<J; j++) {
538       if (!Add) {
539         for (CeedInt c=0; c<C; c++)
540           v[(a*J+j)*C+c] = 0;
541       }
542       for (CeedInt b=0; b<B; b++) {
543         for (CeedInt c=0; c<C; c++) {
544           v[(a*J+j)*C+c] += t[j*tstride0 + b*tstride1] * u[(a*B+b)*C+c];
545         }
546       }
547     }
548   }
549 #endif
550   return 0;
551 }
552 
// Apply the tensor-product basis: interpolation, gradients, and/or quadrature
// weights, depending on emode; tmode == CEED_TRANSPOSE accumulates into v
static int CeedBasisApply_Magma(CeedBasis basis, CeedTransposeMode tmode,
                                CeedEvalMode emode,
                                const CeedScalar *u, CeedScalar *v) {
  int ierr;
  const CeedInt dim = basis->dim;
  const CeedInt ndof = basis->ndof;
  // Size of one eval-mode section (used to advance u/v between modes)
  const CeedInt nqpt = ndof*CeedPowInt(basis->Q1d, dim);
  // In transpose mode the final contraction accumulates into v
  const CeedInt add = (tmode == CEED_TRANSPOSE);

  CeedDebug("\033[01m[CeedBasisApply_Magma] vsize=%d",
            ndof*CeedPowInt(basis->P1d, dim));

  // Transpose mode accumulates, so v must start from zero
  if (tmode == CEED_TRANSPOSE) {
    const CeedInt vsize = ndof*CeedPowInt(basis->P1d, dim);
    for (CeedInt i = 0; i < vsize; i++)
      v[i] = (CeedScalar) 0;
  }
  if (emode & CEED_EVAL_INTERP) {
    // Interpolation: contract with interp1d along each tensor direction,
    // ping-ponging intermediate results between tmp[0] and tmp[1]
    CeedInt P = basis->P1d, Q = basis->Q1d;
    if (tmode == CEED_TRANSPOSE) {
      P = basis->Q1d; Q = basis->P1d;
    }
    CeedInt pre = ndof*CeedPowInt(P, dim-1), post = 1;
    CeedScalar tmp[2][ndof*Q*CeedPowInt(P>Q?P:Q, dim-1)];
    CeedDebug("\033[01m[CeedBasisApply_Magma] tmpsize = %d",
              ndof*Q*CeedPowInt(P>Q?P:Q, dim-1));
    for (CeedInt d=0; d<dim; d++) {
      ierr = CeedTensorContract_Magma(basis->ceed, pre, P, post, Q, basis->interp1d,
                                      tmode, add&&(d==dim-1),
                                      d==0?u:tmp[d%2], d==dim-1?v:tmp[(d+1)%2]);
      CeedChk(ierr);
      pre /= P;
      post *= Q;
    }
    // Advance to the next eval-mode section of the output (or input)
    if (tmode == CEED_NOTRANSPOSE) {
      v += nqpt;
    } else {
      u += nqpt;
    }
  }
  if (emode & CEED_EVAL_GRAD) {
    CeedInt P = basis->P1d, Q = basis->Q1d;
    // In CEED_NOTRANSPOSE mode:
    // u is (P^dim x nc), column-major layout (nc = ndof)
    // v is (Q^dim x nc x dim), column-major layout (nc = ndof)
    // In CEED_TRANSPOSE mode, the sizes of u and v are switched.
    if (tmode == CEED_TRANSPOSE) {
      P = basis->Q1d, Q = basis->P1d;
    }
    CeedScalar tmp[2][ndof*Q*CeedPowInt(P>Q?P:Q, dim-1)];
    CeedDebug("\033[01m[CeedBasisApply_Magma] tmpsize = %d",
              ndof*Q*CeedPowInt(P>Q?P:Q, dim-1));
    // For the derivative in direction p, contract with grad1d in that
    // direction and with interp1d in every other direction
    for (CeedInt p = 0; p < dim; p++) {
      CeedInt pre = ndof*CeedPowInt(P, dim-1), post = 1;
      for (CeedInt d=0; d<dim; d++) {
        ierr = CeedTensorContract_Magma(basis->ceed, pre, P, post, Q,
                                        (p==d)?basis->grad1d:basis->interp1d,
                                        tmode, add&&(d==dim-1),
                                        d==0?u:tmp[d%2], d==dim-1?v:tmp[(d+1)%2]);
        CeedChk(ierr);
        pre /= P;
        post *= Q;
      }
      if (tmode == CEED_NOTRANSPOSE) {
        v += nqpt;
      } else {
        u += nqpt;
      }
    }
  }
  if (emode & CEED_EVAL_WEIGHT) {
    if (tmode == CEED_TRANSPOSE)
      return CeedError(basis->ceed, 1,
                       "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE");
    // Quadrature weights: tensor product of the 1D weights in each direction,
    // built up by multiplying into v one direction at a time
    CeedInt Q = basis->Q1d;
    for (CeedInt d=0; d<dim; d++) {
      CeedInt pre = CeedPowInt(Q, dim-d-1), post = CeedPowInt(Q, d);
      for (CeedInt i=0; i<pre; i++) {
        for (CeedInt j=0; j<Q; j++) {
          for (CeedInt k=0; k<post; k++) {
            v[(i*Q + j)*post + k] = basis->qweight1d[j]
                                    * (d == 0 ? 1 : v[(i*Q + j)*post + k]);
          }
        }
      }
    }
  }
  return 0;
}
642 
// Nothing to free: this backend attaches no per-basis data
static int CeedBasisDestroy_Magma(CeedBasis basis) {
  return 0;
}
646 
// Set up a tensor-product H1 basis: only the Apply/Destroy callbacks are
// installed here; the 1D tabulations (interp1d, grad1d, qweight1d) are read
// from `basis` itself in CeedBasisApply_Magma, so no backend data is stored
static int CeedBasisCreateTensorH1_Magma(Ceed ceed, CeedInt dim, CeedInt P1d,
    CeedInt Q1d, const CeedScalar *interp1d,
    const CeedScalar *grad1d,
    const CeedScalar *qref1d,
    const CeedScalar *qweight1d,
    CeedBasis basis) {
  basis->Apply = CeedBasisApply_Magma;
  basis->Destroy = CeedBasisDestroy_Magma;
  return 0;
}
657 
// Apply the Q-function by invoking the user-supplied callback on Q points
static int CeedQFunctionApply_Magma(CeedQFunction qf, void *qdata, CeedInt Q,
                                    const CeedScalar *const *u,
                                    CeedScalar *const *v) {
  int ierr;
  ierr = qf->function(qf->ctx, qdata, Q, u, v); CeedChk(ierr);
  return 0;
}
665 
// Nothing to free: this backend attaches no per-Q-function data
static int CeedQFunctionDestroy_Magma(CeedQFunction qf) {
  return 0;
}
669 
// Install the MAGMA Q-function callbacks
static int CeedQFunctionCreate_Magma(CeedQFunction qf) {
  qf->Apply = CeedQFunctionApply_Magma;
  qf->Destroy = CeedQFunctionDestroy_Magma;
  return 0;
}
675 
676 static int CeedOperatorDestroy_Magma(CeedOperator op) {
677   CeedOperator_Magma *impl = op->data;
678   int ierr;
679 
680   ierr = CeedVectorDestroy(&impl->etmp); CeedChk(ierr);
681   ierr = CeedVectorDestroy(&impl->qdata); CeedChk(ierr);
682   ierr = CeedFree(&op->data); CeedChk(ierr);
683   return 0;
684 }
685 
686 static int CeedOperatorApply_Magma(CeedOperator op, CeedVector qdata,
687                                    CeedVector ustate,
688                                    CeedVector residual, CeedRequest *request) {
689   CeedOperator_Magma *impl = op->data;
690   CeedVector etmp;
691   CeedInt Q;
692   const CeedInt nc = op->basis->ndof, dim = op->basis->dim;
693   CeedScalar *Eu;
694   char *qd;
695   int ierr;
696   CeedTransposeMode lmode = CEED_NOTRANSPOSE;
697 
698   if (!impl->etmp) {
699     ierr = CeedVectorCreate(op->ceed,
700                             nc * op->Erestrict->nelem * op->Erestrict->elemsize,
701                             &impl->etmp); CeedChk(ierr);
702     // etmp is allocated when CeedVectorGetArray is called below
703   }
704   etmp = impl->etmp;
705   if (op->qf->inmode & ~CEED_EVAL_WEIGHT) {
706     ierr = CeedElemRestrictionApply(op->Erestrict, CEED_NOTRANSPOSE,
707                                     nc, lmode, ustate, etmp,
708                                     CEED_REQUEST_IMMEDIATE); CeedChk(ierr);
709   }
710   ierr = CeedBasisGetNumQuadraturePoints(op->basis, &Q); CeedChk(ierr);
711   ierr = CeedVectorGetArray(etmp, CEED_MEM_HOST, &Eu); CeedChk(ierr);
712   ierr = CeedVectorGetArray(qdata, CEED_MEM_HOST, (CeedScalar**)&qd);
713   CeedChk(ierr);
714 
715   for (CeedInt e=0; e<op->Erestrict->nelem; e++) {
716     CeedScalar BEu[Q*nc*(dim+2)], BEv[Q*nc*(dim+2)], *out[5] = {0,0,0,0,0};
717     const CeedScalar *in[5] = {0,0,0,0,0};
718     // TODO: quadrature weights can be computed just once
719     CeedDebug("\033[11m[CeedOperatorApply_Magma] e=%d: Eu+%d, %d",
720               e, e*op->Erestrict->elemsize*nc, Q*nc*(dim+2));
721     ierr = CeedBasisApply(op->basis, CEED_NOTRANSPOSE, op->qf->inmode,
722                           &Eu[e*op->Erestrict->elemsize*nc], BEu);
723     CeedChk(ierr);
724     CeedScalar *u_ptr = BEu, *v_ptr = BEv;
725     if (op->qf->inmode & CEED_EVAL_INTERP) { in[0] = u_ptr; u_ptr += Q*nc; }
726     if (op->qf->inmode & CEED_EVAL_GRAD) { in[1] = u_ptr; u_ptr += Q*nc*dim; }
727     if (op->qf->inmode & CEED_EVAL_WEIGHT) { in[4] = u_ptr; u_ptr += Q; }
728     if (op->qf->outmode & CEED_EVAL_INTERP) { out[0] = v_ptr; v_ptr += Q*nc; }
729     if (op->qf->outmode & CEED_EVAL_GRAD) { out[1] = v_ptr; v_ptr += Q*nc*dim; }
730     ierr = CeedQFunctionApply(op->qf, &qd[e*Q*op->qf->qdatasize], Q, in, out);
731     CeedChk(ierr);
732     CeedDebug("\033[31m[CeedOperatorApply_Magma] e=%d: ",e);
733     ierr = CeedBasisApply(op->basis, CEED_TRANSPOSE, op->qf->outmode, BEv,
734                           &Eu[e*op->Erestrict->elemsize*nc]);
735     CeedChk(ierr);
736   }
737   ierr = CeedVectorRestoreArray(etmp, &Eu); CeedChk(ierr);
738   if (residual) {
739     CeedScalar *res;
740     CeedVectorGetArray(residual, CEED_MEM_HOST, &res);
741     for (int i = 0; i < residual->length; i++)
742       res[i] = (CeedScalar)0;
743     ierr = CeedElemRestrictionApply(op->Erestrict, CEED_TRANSPOSE,
744                                     nc, lmode, etmp, residual,
745                                     CEED_REQUEST_IMMEDIATE); CeedChk(ierr);
746   }
747   if (request != CEED_REQUEST_IMMEDIATE && request != CEED_REQUEST_ORDERED)
748     *request = NULL;
749   return 0;
750 }
751 
752 static int CeedOperatorGetQData_Magma(CeedOperator op, CeedVector *qdata) {
753   CeedOperator_Magma *impl = op->data;
754   int ierr;
755 
756   if (!impl->qdata) {
757     CeedInt Q;
758     ierr = CeedBasisGetNumQuadraturePoints(op->basis, &Q); CeedChk(ierr);
759     ierr = CeedVectorCreate(op->ceed,
760                             op->Erestrict->nelem * Q
761                             * op->qf->qdatasize / sizeof(CeedScalar),
762                             &impl->qdata); CeedChk(ierr);
763   }
764   *qdata = impl->qdata;
765   return 0;
766 }
767 
768 static int CeedOperatorCreate_Magma(CeedOperator op) {
769   CeedOperator_Magma *impl;
770   int ierr;
771 
772   ierr = CeedCalloc(1, &impl); CeedChk(ierr);
773   op->data = impl;
774   op->Destroy  = CeedOperatorDestroy_Magma;
775   op->Apply    = CeedOperatorApply_Magma;
776   op->GetQData = CeedOperatorGetQData_Magma;
777   return 0;
778 }
779 
780 // *****************************************************************************
781 // * INIT
782 // *****************************************************************************
783 static int CeedInit_Magma(const char *resource, Ceed ceed) {
784   if (strcmp(resource, "/cpu/magma")
785       && strcmp(resource, "/gpu/magma"))
786     return CeedError(ceed, 1, "MAGMA backend cannot use resource: %s", resource);
787 
788   magma_init();
789   //magma_print_environment();
790 
791   ceed->VecCreate = CeedVectorCreate_Magma;
792   ceed->BasisCreateTensorH1 = CeedBasisCreateTensorH1_Magma;
793   ceed->ElemRestrictionCreate = CeedElemRestrictionCreate_Magma;
794   ceed->QFunctionCreate = CeedQFunctionCreate_Magma;
795   ceed->OperatorCreate = CeedOperatorCreate_Magma;
796   return 0;
797 }
798 
799 // *****************************************************************************
800 // * REGISTER
801 // *****************************************************************************
// Register the MAGMA backend under both resource strings; the constructor
// attribute runs this at library load time, before main()
__attribute__((constructor))
static void Register(void) {
  CeedRegister("/cpu/magma", CeedInit_Magma);
  CeedRegister("/gpu/magma", CeedInit_Magma);
}
807