xref: /libCEED/rust/libceed-sys/c-src/backends/magma/ceed-magma.c (revision c119dad6925d1c62a7e8353ac8ac74008d2175a8)
1 // Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at
2 // the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights
3 // reserved. See files LICENSE and NOTICE for details.
4 //
5 // This file is part of CEED, a collection of benchmarks, miniapps, software
6 // libraries and APIs for efficient high-order finite element and spectral
7 // element discretizations for exascale applications. For more information and
8 // source code availability see http://github.com/ceed.
9 //
10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11 // a collaborative effort of two U.S. Department of Energy organizations (Office
12 // of Science and the National Nuclear Security Administration) responsible for
13 // the planning and preparation of a capable exascale ecosystem, including
14 // software, applications, hardware, advanced system engineering and early
15 // testbed platforms, in support of the nation's exascale computing imperative.
16 
17 #include <ceed-impl.h>
18 #include <string.h>
19 #include "magma.h"
20 
// Backend data for a CeedVector: a pinned host buffer plus a device mirror.
typedef struct {
    CeedScalar *array;   // pinned host buffer (allocated with magma_malloc_pinned when owned)
    CeedScalar *darray;  // device buffer (allocated with magma_malloc when owned)
    int  own_;           // nonzero: backend owns BOTH array and darray
    int down_;           // nonzero: backend owns ONLY the device mirror darray
} CeedVector_Magma;
27 
// Backend data for an element restriction: index lists on host and device.
typedef struct {
    CeedInt *indices;      // pinned host copy of the element indices (may be user-owned)
    CeedInt *dindices;     // device copy of the element indices
    int  own_;             // nonzero: backend owns BOTH indices and dindices
    int down_;             // cover a case where we own Device memory
} CeedElemRestriction_Magma;
34 
// Backend data for an operator; both vectors are created lazily.
typedef struct {
    CeedVector etmp;   // element scratch vector, created on first Apply
    CeedVector qdata;  // quadrature data vector, created in GetQData
} CeedOperator_Magma;
39 
40 // *****************************************************************************
41 // * Initialize vector vec (after free mem) with values from array based on cmode
42 // *   CEED_COPY_VALUES: memory is allocated in vec->array_allocated, made equal
43 // *                     to array, and data is copied (not store passed pointer)
44 // *   CEED_OWN_POINTER: vec->data->array_allocated and vec->data->array = array
45 // *   CEED_USE_POINTER: vec->data->array = array (can modify; no ownership)
46 // * mtype: CEED_MEM_HOST or CEED_MEM_DEVICE
47 // *****************************************************************************
48 static int CeedVectorSetArray_Magma(CeedVector vec, CeedMemType mtype,
49                                     CeedCopyMode cmode, CeedScalar *array) {
50     CeedVector_Magma *impl = vec->data;
51     int ierr;
52 
53     // If own data, free the "old" data, e.g., as it may be of different size
54     if (impl->own_){
55         magma_free( impl->darray );
56         magma_free_pinned( impl->array );
57         impl->darray = NULL;
58         impl->array  = NULL;
59         impl->own_ = 0;
60         impl->down_= 0;
61     }
62 
63     if (mtype == CEED_MEM_HOST) {
64         // memory is on the host; own_ = 0
65         switch (cmode) {
66         case CEED_COPY_VALUES:
67             ierr = magma_malloc( (void**)&impl->darray,
68                                  vec->length * sizeof(CeedScalar)); CeedChk(ierr);
69             ierr = magma_malloc_pinned( (void**)&impl->array,
70                                         vec->length * sizeof(CeedScalar)); CeedChk(ierr);
71             impl->own_ = 1;
72 
73             if (array != NULL)
74                 magma_setvector(vec->length, sizeof(array[0]),
75                                 array, 1, impl->darray, 1);
76             break;
77         case CEED_OWN_POINTER:
78             ierr = magma_malloc( (void**)&impl->darray,
79                                  vec->length * sizeof(CeedScalar)); CeedChk(ierr);
80             // TODO: possible problem here is if we are passed non-pinned memory;
81             //       (as we own it, lter in destroy, we use free for pinned memory).
82             impl->array = array;
83             impl->own_ = 1;
84 
85             if (array != NULL)
86                 magma_setvector(vec->length, sizeof(array[0]),
87                                 array, 1, impl->darray, 1);
88             break;
89         case CEED_USE_POINTER:
90             ierr = magma_malloc( (void**)&impl->darray,
91                                  vec->length * sizeof(CeedScalar)); CeedChk(ierr);
92             magma_setvector(vec->length, sizeof(array[0]),
93                             array, 1, impl->darray, 1);
94             impl->array  = array;
95         }
96     } else if (mtype == CEED_MEM_DEVICE) {
97         // memory is on the device; own = 0
98         switch (cmode) {
99         case CEED_COPY_VALUES:
100             ierr = magma_malloc( (void**)&impl->darray,
101                                 vec->length * sizeof(CeedScalar)); CeedChk(ierr);
102             ierr = magma_malloc_pinned( (void**)&impl->array,
103                                         vec->length * sizeof(CeedScalar)); CeedChk(ierr);
104             impl->own_ = 1;
105 
106             if (array)
107                 magma_copyvector(vec->length, sizeof(array[0]),
108                                  array, 1, impl->darray, 1);
109             break;
110         case CEED_OWN_POINTER:
111             impl->darray = array;
112             ierr = magma_malloc_pinned( (void**)&impl->array,
113                                         vec->length * sizeof(CeedScalar)); CeedChk(ierr);
114             impl->own_ = 1;
115 
116             break;
117         case CEED_USE_POINTER:
118             impl->darray = array;
119             impl->array  = NULL;
120         }
121 
122     } else
123         return CeedError(vec->ceed, 1, "Only MemType = HOST or DEVICE supported");
124 
125     return 0;
126 }
127 
// *****************************************************************************
// * Give data pointer from vector vec to array (on HOST or DEVICE)
// * Lazily allocates storage (via SetArray with a NULL source) when the vector
// * has no data yet; errors when asked for data on the "wrong" side.
// *****************************************************************************
static int CeedVectorGetArray_Magma(CeedVector vec, CeedMemType mtype,
                                    CeedScalar **array) {
    CeedVector_Magma *impl = vec->data;
    int ierr;

    if (mtype == CEED_MEM_HOST) {
        if (impl->own_) {
            // data is owned so GPU had the most up-to-date version; copy it
            // down into the pinned host buffer before handing it out
            magma_getvector(vec->length, sizeof(*array[0]),
                            impl->darray, 1, impl->array, 1);
        } else if (impl->array == NULL) {
            // Vector doesn't own the data and was set on GPU
            if (impl->darray == NULL) {
                // call was made just to allocate memory
                ierr = CeedVectorSetArray(vec, mtype, CEED_COPY_VALUES, NULL);
                CeedChk(ierr);
            } else
                return CeedError(vec->ceed, 1, "Can not access DEVICE vector on HOST");
        }
        *array = impl->array;
    } else if (mtype == CEED_MEM_DEVICE) {
        if (impl->darray == NULL){
            // Vector doesn't own the data and was set on the CPU
            if (impl->array == NULL) {
                // call was made just to allocate memory
                ierr = CeedVectorSetArray(vec, mtype, CEED_COPY_VALUES, NULL);
                CeedChk(ierr);
            } else
                return CeedError(vec->ceed, 1, "Can not access HOST vector on DEVICE");
        }
        *array = impl->darray;
    } else
        return CeedError(vec->ceed, 1, "Can only provide to HOST or DEVICE memory");

    return 0;
}
167 
// *****************************************************************************
// * Give data pointer from vector vec to array (on HOST or DEVICE) to read it
// * Same logic as CeedVectorGetArray_Magma; kept separate because the
// * interface hands out a const pointer here.
// *****************************************************************************
static int CeedVectorGetArrayRead_Magma(CeedVector vec, CeedMemType mtype,
                                        const CeedScalar **array) {
    CeedVector_Magma *impl = vec->data;
    int ierr;

    if (mtype == CEED_MEM_HOST) {
        if (impl->own_) {
            // data is owned so GPU had the most up-to-date version; copy it
            magma_getvector(vec->length, sizeof(*array[0]),
                            impl->darray, 1, impl->array, 1);
        } else if (impl->array == NULL) {
            // Vector doesn't own the data and was set on GPU
            if (impl->darray == NULL) {
                // call was made just to allocate memory
                ierr = CeedVectorSetArray(vec, mtype, CEED_COPY_VALUES, NULL);
                CeedChk(ierr);
            } else
                return CeedError(vec->ceed, 1, "Can not access DEVICE vector on HOST");
        }
        *array = impl->array;
    } else if (mtype == CEED_MEM_DEVICE) {
        if (impl->darray == NULL){
            // Vector doesn't own the data and was set on the CPU
            if (impl->array == NULL) {
                // call was made just to allocate memory
                ierr = CeedVectorSetArray(vec, mtype, CEED_COPY_VALUES, NULL);
                CeedChk(ierr);
            } else
                return CeedError(vec->ceed, 1, "Can not access HOST vector on DEVICE");
        }
        *array = impl->darray;
    } else
        return CeedError(vec->ceed, 1, "Can only provide to HOST or DEVICE memory");

    return 0;
}
207 
// *****************************************************************************
// * There is no mtype here for array so it is not clear if we restore from HOST
// * memory or from DEVICE memory. We assume that it is CPU memory because if
// * it was GPU memory we would not call this routine at all.
// * Restore vector vec with values from array, where array received its values
// * from vec and possibly modified them.
// *****************************************************************************
static int CeedVectorRestoreArray_Magma(CeedVector vec, CeedScalar **array) {
    CeedVector_Magma *impl = vec->data;

    // Check if the array is a CPU pointer
    if (*array == impl->array) {
        // Update device, if the device pointer is not NULL
        if (impl->darray != NULL) {
            magma_setvector(vec->length, sizeof(*array[0]),
                            *array, 1, impl->darray, 1);
        } else
            ; // nothing to do (case of CPU use pointer)

    } else if (impl->down_) {
        // nothing to do if array is on GPU, except if down_=1(case CPU use pointer)
        // NOTE(review): this copies device -> host into impl->array, yet the
        // branch is reached only when *array != impl->array; confirm the
        // intended direction/target of this transfer.
        magma_getvector(vec->length, sizeof(*array[0]),
                        impl->darray, 1, impl->array, 1);
    }

    // Invalidate the caller's pointer: access must go through Get/Restore.
    *array = NULL;
    return 0;
}
236 
// *****************************************************************************
// * There is no mtype here for array so it is not clear if we restore from HOST
// * memory or from DEVICE memory. We assume that it is CPU memory because if
// * it was GPU memory we would not call this routine at all.
// * Restore vector vec with values from array, where array received its values
// * from vec to only read them; in this case vec may have been modified meanwhile
// * and needs to be restored here.
// *****************************************************************************
static int CeedVectorRestoreArrayRead_Magma(CeedVector vec,
                                            const CeedScalar **array) {
    CeedVector_Magma *impl = vec->data;

    // Check if the array is a CPU pointer
    if (*array == impl->array) {
        // Update device, if the device pointer is not NULL
        if (impl->darray != NULL)
            magma_setvector(vec->length, sizeof(*array[0]),
                            *array, 1, impl->darray, 1);
        else
            ; // nothing to do (case of CPU use pointer)

    } else if (impl->down_) {
        // nothing to do if array is on GPU, except if down_=1(case CPU use pointer)
        // NOTE(review): same questionable transfer direction as in
        // CeedVectorRestoreArray_Magma; confirm intent.
        magma_getvector(vec->length, sizeof(*array[0]),
                        impl->darray, 1, impl->array, 1);
    }

    // Invalidate the caller's pointer: access must go through Get/Restore.
    *array = NULL;
    return 0;
}
267 
268 static int CeedVectorDestroy_Magma(CeedVector vec) {
269     CeedVector_Magma *impl = vec->data;
270     int ierr;
271 
272     // Free if we own the data
273     if (impl->own_){
274         ierr = magma_free_pinned(impl->array); CeedChk(ierr);
275         ierr = magma_free(impl->darray);       CeedChk(ierr);
276     } else if (impl->down_){
277         ierr = magma_free(impl->darray);       CeedChk(ierr);
278     }
279     ierr = CeedFree(&vec->data); CeedChk(ierr);
280     return 0;
281 }
282 
283 // *****************************************************************************
284 // * Create vector vec of size n
285 // *****************************************************************************
286 static int CeedVectorCreate_Magma(Ceed ceed, CeedInt n, CeedVector vec) {
287     CeedVector_Magma *impl;
288     int ierr;
289 
290     vec->SetArray = CeedVectorSetArray_Magma;
291     vec->GetArray = CeedVectorGetArray_Magma;
292     vec->GetArrayRead = CeedVectorGetArrayRead_Magma;
293     vec->RestoreArray = CeedVectorRestoreArray_Magma;
294     vec->RestoreArrayRead = CeedVectorRestoreArrayRead_Magma;
295     vec->Destroy = CeedVectorDestroy_Magma;
296     ierr = CeedCalloc(1,&impl); CeedChk(ierr);
297     impl->darray = NULL;
298     impl->array  = NULL;
299     impl->own_ = 0;
300     impl->down_= 0;
301     vec->data = impl;
302     return 0;
303 }
304 
// *****************************************************************************
// * Apply restriction operator r to u: v = r(rmode) u
// * NOTE: the magma_template<<...>> constructs below are a MAGMA source
// * code-generation DSL (expanded by a preprocessing step), not plain C; the
// * commented-out loops above each one show the equivalent CPU computation.
// * In the template bodies, iend/dend are the upper bounds of the i/d ranges.
// *****************************************************************************
static int CeedElemRestrictionApply_Magma(CeedElemRestriction r,
                                          CeedTransposeMode tmode, CeedInt ncomp,
                                          CeedTransposeMode lmode, CeedVector u,
                                          CeedVector v, CeedRequest *request) {
    CeedElemRestriction_Magma *impl = r->data;
    int ierr;
    const CeedScalar *uu;
    CeedScalar *vv;
    CeedInt nelem = r->nelem, elemsize = r->elemsize, ndof = r->ndof;
    CeedInt esize = nelem * elemsize;
    // indices is kept only for the commented-out CPU reference code below
    CeedInt *indices = impl->indices, *dindices = impl->dindices;

    //ierr = CeedVectorGetArrayRead(u, CEED_MEM_HOST, &uu); CeedChk(ierr);
    //ierr = CeedVectorGetArray(v, CEED_MEM_HOST, &vv); CeedChk(ierr);

    // Get pointers on the device
    ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &uu); CeedChk(ierr);
    ierr = CeedVectorGetArray(v, CEED_MEM_DEVICE, &vv); CeedChk(ierr);

    if (tmode == CEED_NOTRANSPOSE) {
        // Perform: v = r * u
        if (ncomp == 1) {
            //for (CeedInt i=0; i<esize; i++) vv[i] = uu[indices[i]];

            magma_template<<i=0:esize>>
                (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices){
                vv[i] = uu[dindices[i]];
            }
        } else {
            // vv is (elemsize x ncomp x nelem), column-major
            if (lmode == CEED_NOTRANSPOSE) { // u is (ndof x ncomp), column-major
                /*
                for (CeedInt e = 0; e < nelem; e++)
                    for (CeedInt d = 0; d < ncomp; d++)
                        for (CeedInt i=0; i < elemsize; i++) {
                            vv[i + elemsize*(d+ncomp*e)] =
                                uu[indices[i+elemsize*e]+ndof*d];
                        }
                */
                magma_template<<e=0:nelem, d=0:ncomp, i=0:elemsize>>
                    (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices, int ndof) {
                    vv[i + iend*(d+dend*e)] = uu[dindices[i+iend*e]+ndof*d];
                }
            } else { // u is (ncomp x ndof), column-major
                /*
                for (CeedInt e = 0; e < nelem; e++)
                    for (CeedInt d = 0; d < ncomp; d++)
                        for (CeedInt i=0; i< elemsize; i++) {
                            vv[i + elemsize*(d+ncomp*e)] =
                                uu[d+ncomp*indices[i+elemsize*e]];
                        }
                */
                magma_template<<e=0:nelem, d=0:ncomp, i=0:elemsize>>
                    (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices) {
                    vv[i + iend*(d+dend*e)] = uu[d+dend*dindices[i + iend*e]];
                }
            }
        }
    } else {
        // Note: in transpose mode, we perform: v += r^t * u
        // (atomic adds are needed because multiple elements may scatter into
        //  the same destination degree of freedom)
        if (ncomp == 1) {
            // fprintf(stderr,"3 ---------\n");
            // for (CeedInt i=0; i<esize; i++) vv[indices[i]] += uu[i];
            magma_template<<i=0:esize>>
                (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices) {
                magmablas_datomic_add( &vv[dindices[i]], uu[i]);
            }
        } else { // u is (elemsize x ncomp x nelem)
            // NOTE(review): leftover debug print -- consider removing
            fprintf(stderr,"2 ---------\n");

            if (lmode == CEED_NOTRANSPOSE) { // vv is (ndof x ncomp), column-major
                /*
                for (CeedInt e = 0; e < nelem; e++)
                    for (CeedInt d = 0; d < ncomp; d++)
                        for (CeedInt i=0; i < elemsize; i++) {
                            vv[indices[i + elemsize*e]+ndof*d] +=
                                uu[i + elemsize*(d+e*ncomp)];
                        }
                */
                magma_template<<e=0:nelem, d=0:ncomp, i=0:elemsize>>
                    (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices, CeedInt ndof) {
                    magmablas_datomic_add( &vv[dindices[i+iend*e]+ndof*d],
                                           uu[i+iend*(d+e*dend)]);
                }
            } else { // vv is (ncomp x ndof), column-major
                /*
                for (CeedInt e = 0; e < nelem; e++)
                    for (CeedInt d = 0; d < ncomp; d++)
                        for (CeedInt i=0; i < elemsize; i++) {
                            vv[d+ncomp*indices[i + elemsize*e]] +=
                                uu[i + elemsize*(d+e*ncomp)];
                        }
                */
                magma_template<<e=0:nelem, d=0:ncomp, i=0:elemsize>>
                    (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices) {
                    magmablas_datomic_add( &vv[d+dend*dindices[i + iend*e]],
                                           uu[i+iend*(d+e*dend)]);
                }
            }
        }
    }
    ierr = CeedVectorRestoreArrayRead(u, &uu); CeedChk(ierr);
    ierr = CeedVectorRestoreArray(v, &vv); CeedChk(ierr);

    if (request != CEED_REQUEST_IMMEDIATE && request != CEED_REQUEST_ORDERED)
        *request = NULL;
    return 0;
}
416 
417 static int CeedElemRestrictionDestroy_Magma(CeedElemRestriction r) {
418   CeedElemRestriction_Magma *impl = r->data;
419   int ierr;
420 
421   // Free if we own the data
422   if (impl->own_){
423       ierr = magma_free_pinned(impl->indices); CeedChk(ierr);
424       ierr = magma_free(impl->dindices);       CeedChk(ierr);
425   }
426   else if (impl->down_){
427       ierr = magma_free(impl->dindices);       CeedChk(ierr);
428   }
429   ierr = CeedFree(&r->data); CeedChk(ierr);
430   return 0;
431 }
432 
433 static int CeedElemRestrictionCreate_Magma(CeedElemRestriction r,
434                                            CeedMemType mtype,
435                                            CeedCopyMode cmode,
436                                            const CeedInt *indices) {
437     int ierr, size = r->nelem*r->elemsize;
438     CeedElemRestriction_Magma *impl;
439 
440     // Allocate memory for the MAGMA Restricton and initializa pointers to NULL
441     ierr = CeedCalloc(1,&impl); CeedChk(ierr);
442     impl->dindices = NULL;
443     impl->indices  = NULL;
444     impl->own_ = 0;
445     impl->down_= 0;
446 
447     if (mtype == CEED_MEM_HOST) {
448         // memory is on the host; own_ = 0
449         switch (cmode) {
450         case CEED_COPY_VALUES:
451             ierr = magma_malloc( (void**)&impl->dindices,
452                                   size * sizeof(CeedInt)); CeedChk(ierr);
453             ierr = magma_malloc_pinned( (void**)&impl->indices,
454                                         size * sizeof(CeedInt)); CeedChk(ierr);
455             impl->own_ = 1;
456 
457             if (indices != NULL)
458                 magma_setvector(size, sizeof(CeedInt),
459                                 indices, 1, impl->dindices, 1);
460             break;
461         case CEED_OWN_POINTER:
462             ierr = magma_malloc( (void**)&impl->dindices,
463                                  size * sizeof(CeedInt)); CeedChk(ierr);
464             // TODO: possible problem here is if we are passed non-pinned memory;
465             //       (as we own it, lter in destroy, we use free for pinned memory).
466             impl->indices = indices;
467             impl->own_ = 1;
468 
469             if (indices != NULL)
470                 magma_setvector(size, sizeof(CeedInt),
471                                 indices, 1, impl->dindices, 1);
472             break;
473         case CEED_USE_POINTER:
474             ierr = magma_malloc( (void**)&impl->dindices,
475                                  size * sizeof(CeedInt)); CeedChk(ierr);
476             magma_setvector(size, sizeof(CeedInt),
477                             indices, 1, impl->dindices, 1);
478             impl->down_ = 1;
479             impl->indices  = indices;
480         }
481     } else if (mtype == CEED_MEM_DEVICE) {
482         // memory is on the device; own = 0
483         switch (cmode) {
484         case CEED_COPY_VALUES:
485             ierr = magma_malloc( (void**)&impl->dindices,
486                                  size * sizeof(CeedInt)); CeedChk(ierr);
487             ierr = magma_malloc_pinned( (void**)&impl->indices,
488                                         size * sizeof(CeedInt)); CeedChk(ierr);
489             impl->own_ = 1;
490 
491             if (indices)
492                 magma_copyvector(size, sizeof(CeedInt),
493                                  indices, 1, impl->dindices, 1);
494             break;
495         case CEED_OWN_POINTER:
496             impl->dindices = indices;
497             ierr = magma_malloc_pinned( (void**)&impl->indices,
498                                         size * sizeof(CeedInt)); CeedChk(ierr);
499             impl->own_ = 1;
500 
501             break;
502         case CEED_USE_POINTER:
503             impl->dindices = indices;
504             impl->indices  = NULL;
505         }
506 
507     } else
508         return CeedError(r->ceed, 1, "Only MemType = HOST or DEVICE supported");
509 
510     r->data    = impl;
511     r->Apply   = CeedElemRestrictionApply_Magma;
512     r->Destroy = CeedElemRestrictionDestroy_Magma;
513 
514     return 0;
515 }
516 
517 // Contracts on the middle index
518 // NOTRANSPOSE: V_ajc = T_jb U_abc
519 // TRANSPOSE:   V_ajc = T_bj U_abc
520 // If Add != 0, "=" is replaced by "+="
521 static int CeedTensorContract_Magma(Ceed ceed,
522                                     CeedInt A, CeedInt B, CeedInt C, CeedInt J,
523                                     const CeedScalar *t, CeedTransposeMode tmode,
524                                     const CeedInt Add,
525                                     const CeedScalar *u, CeedScalar *v) {
526     CeedInt tstride0 = B, tstride1 = 1;
527     if (tmode == CEED_TRANSPOSE) {
528         tstride0 = 1; tstride1 = J;
529     }
530 
531     for (CeedInt a=0; a<A; a++) {
532         for (CeedInt j=0; j<J; j++) {
533             if (!Add) {
534                 for (CeedInt c=0; c<C; c++)
535                     v[(a*J+j)*C+c] = 0;
536             }
537             for (CeedInt b=0; b<B; b++) {
538                 for (CeedInt c=0; c<C; c++) {
539                     v[(a*J+j)*C+c] += t[j*tstride0 + b*tstride1] * u[(a*B+b)*C+c];
540                 }
541             }
542         }
543     }
544     return 0;
545 }
546 
// Apply the tensor-product basis: interpolation, gradients, and/or quadrature
// weights, selected by emode bits. In CEED_TRANSPOSE mode v is zeroed first
// and contributions are accumulated (add=1 on the last contraction).
// The dim 1-D contractions ping-pong between two stack buffers (tmp[0/1]).
static int CeedBasisApply_Magma(CeedBasis basis, CeedTransposeMode tmode,
                                CeedEvalMode emode,
                                const CeedScalar *u, CeedScalar *v) {
    int ierr;
    const CeedInt dim = basis->dim;
    const CeedInt ndof = basis->ndof;
    const CeedInt nqpt = ndof*CeedPowInt(basis->Q1d, dim);
    const CeedInt add = (tmode == CEED_TRANSPOSE);

    if (tmode == CEED_TRANSPOSE) {
        // Transpose mode accumulates into v, so start from zero.
        const CeedInt vsize = ndof*CeedPowInt(basis->P1d, dim);
        for (CeedInt i = 0; i < vsize; i++)
            v[i] = (CeedScalar) 0;
    }
    if (emode & CEED_EVAL_INTERP) {
        // P/Q swap under transpose: we contract from quadrature back to dofs.
        CeedInt P = basis->P1d, Q = basis->Q1d;
        if (tmode == CEED_TRANSPOSE) {
            P = basis->Q1d; Q = basis->P1d;
        }
        CeedInt pre = ndof*CeedPowInt(P, dim-1), post = 1;
        // VLA scratch sized for the largest intermediate of the dim passes
        CeedScalar tmp[2][ndof*Q*CeedPowInt(P>Q?P:Q, dim-1)];
        for (CeedInt d=0; d<dim; d++) {
            // First pass reads u, last pass writes v; middle passes ping-pong.
            ierr = CeedTensorContract_Magma(basis->ceed, pre, P, post, Q, basis->interp1d,
                                            tmode, add&&(d==dim-1),
                                            d==0?u:tmp[d%2], d==dim-1?v:tmp[(d+1)%2]);
            CeedChk(ierr);
            pre /= P;
            post *= Q;
        }
        // Advance past the interp section of the quadrature-side buffer.
        if (tmode == CEED_NOTRANSPOSE) {
            v += nqpt;
        } else {
            u += nqpt;
        }
    }
    if (emode & CEED_EVAL_GRAD) {
        CeedInt P = basis->P1d, Q = basis->Q1d;
        // In CEED_NOTRANSPOSE mode:
        // u is (P^dim x nc), column-major layout (nc = ndof)
        // v is (Q^dim x nc x dim), column-major layout (nc = ndof)
        // In CEED_TRANSPOSE mode, the sizes of u and v are switched.
        if (tmode == CEED_TRANSPOSE) {
            P = basis->Q1d, Q = basis->P1d;
        }
        CeedScalar tmp[2][ndof*Q*CeedPowInt(P>Q?P:Q, dim-1)];
        for (CeedInt p = 0; p < dim; p++) {
            // Derivative direction p uses grad1d on pass p, interp1d elsewhere.
            CeedInt pre = ndof*CeedPowInt(P, dim-1), post = 1;
            for (CeedInt d=0; d<dim; d++) {
                ierr = CeedTensorContract_Magma(basis->ceed, pre, P, post, Q,
                                                (p==d)?basis->grad1d:basis->interp1d,
                                                tmode, add&&(d==dim-1),
                                                d==0?u:tmp[d%2], d==dim-1?v:tmp[(d+1)%2]);
                CeedChk(ierr);
                pre /= P;
                post *= Q;
            }
            if (tmode == CEED_NOTRANSPOSE) {
                v += nqpt;
            } else {
                u += nqpt;
            }
        }
    }
    if (emode & CEED_EVAL_WEIGHT) {
        if (tmode == CEED_TRANSPOSE)
            return CeedError(basis->ceed, 1,
                             "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE");
        // Build the tensor-product quadrature weights in place: each pass d
        // multiplies the partial product by the 1-D weights along that axis.
        CeedInt Q = basis->Q1d;
        for (CeedInt d=0; d<dim; d++) {
            CeedInt pre = CeedPowInt(Q, dim-d-1), post = CeedPowInt(Q, d);
            for (CeedInt i=0; i<pre; i++) {
                for (CeedInt j=0; j<Q; j++) {
                    for (CeedInt k=0; k<post; k++) {
                        v[(i*Q + j)*post + k] = basis->qweight1d[j]
                            * (d == 0 ? 1 : v[(i*Q + j)*post + k]);
                    }
                }
            }
        }
    }
    return 0;
}
629 
// Destroy basis: nothing to free -- CeedBasisCreateTensorH1_Magma attaches
// no backend data to the basis object.
static int CeedBasisDestroy_Magma(CeedBasis basis) {
    return 0;
}
633 
// Create a tensor-product H1 basis: only installs the backend entry points;
// the 1-D operators (interp1d, grad1d, qweight1d, ...) are read later through
// the basis object itself in CeedBasisApply_Magma.
static int CeedBasisCreateTensorH1_Magma(Ceed ceed, CeedInt dim, CeedInt P1d,
                                         CeedInt Q1d, const CeedScalar *interp1d,
                                         const CeedScalar *grad1d,
                                         const CeedScalar *qref1d,
                                         const CeedScalar *qweight1d,
                                         CeedBasis basis) {
    basis->Apply = CeedBasisApply_Magma;
    basis->Destroy = CeedBasisDestroy_Magma;
    return 0;
}
644 
// Apply the Q-function: delegate directly to the user-provided callback with
// its stored context, for Q quadrature points.
static int CeedQFunctionApply_Magma(CeedQFunction qf, void *qdata, CeedInt Q,
                                    const CeedScalar *const *u,
                                    CeedScalar *const *v) {
    int ierr;
    ierr = qf->function(qf->ctx, qdata, Q, u, v); CeedChk(ierr);
    return 0;
}
652 
// Destroy Q-function: nothing to free -- CeedQFunctionCreate_Magma attaches
// no backend data to the Q-function object.
static int CeedQFunctionDestroy_Magma(CeedQFunction qf) {
    return 0;
}
656 
// Create Q-function: install the backend entry points only.
static int CeedQFunctionCreate_Magma(CeedQFunction qf) {
    qf->Apply = CeedQFunctionApply_Magma;
    qf->Destroy = CeedQFunctionDestroy_Magma;
    return 0;
}
662 
// Destroy the operator's backend data: release the lazily-created scratch
// vectors (CeedVectorDestroy handles NULL/uncreated vectors) and the struct.
static int CeedOperatorDestroy_Magma(CeedOperator op) {
    CeedOperator_Magma *impl = op->data;
    int ierr;

    ierr = CeedVectorDestroy(&impl->etmp); CeedChk(ierr);
    ierr = CeedVectorDestroy(&impl->qdata); CeedChk(ierr);
    ierr = CeedFree(&op->data); CeedChk(ierr);
    return 0;
}
672 
673 static int CeedOperatorApply_Magma(CeedOperator op, CeedVector qdata,
674                                    CeedVector ustate,
675                                    CeedVector residual, CeedRequest *request) {
676     CeedOperator_Magma *impl = op->data;
677     CeedVector etmp;
678     CeedInt Q;
679     const CeedInt nc = op->basis->ndof, dim = op->basis->dim;
680     CeedScalar *Eu;
681     char *qd;
682     int ierr;
683     CeedTransposeMode lmode = CEED_NOTRANSPOSE;
684 
685     if (!impl->etmp) {
686         ierr = CeedVectorCreate(op->ceed,
687                                 nc * op->Erestrict->nelem * op->Erestrict->elemsize,
688                                 &impl->etmp); CeedChk(ierr);
689         // etmp is allocated when CeedVectorGetArray is called below
690     }
691     etmp = impl->etmp;
692     if (op->qf->inmode & ~CEED_EVAL_WEIGHT) {
693         ierr = CeedElemRestrictionApply(op->Erestrict, CEED_NOTRANSPOSE,
694                                         nc, lmode, ustate, etmp,
695                                         CEED_REQUEST_IMMEDIATE); CeedChk(ierr);
696     }
697     ierr = CeedBasisGetNumQuadraturePoints(op->basis, &Q); CeedChk(ierr);
698     ierr = CeedVectorGetArray(etmp, CEED_MEM_HOST, &Eu); CeedChk(ierr);
699     ierr = CeedVectorGetArray(qdata, CEED_MEM_HOST, (CeedScalar**)&qd);
700     CeedChk(ierr);
701     for (CeedInt e=0; e<op->Erestrict->nelem; e++) {
702         CeedScalar BEu[Q*nc*(dim+2)], BEv[Q*nc*(dim+2)], *out[5] = {0,0,0,0,0};
703         const CeedScalar *in[5] = {0,0,0,0,0};
704         // TODO: quadrature weights can be computed just once
705         ierr = CeedBasisApply(op->basis, CEED_NOTRANSPOSE, op->qf->inmode,
706                               &Eu[e*op->Erestrict->elemsize*nc], BEu);
707         CeedChk(ierr);
708         CeedScalar *u_ptr = BEu, *v_ptr = BEv;
709         if (op->qf->inmode & CEED_EVAL_INTERP) { in[0] = u_ptr; u_ptr += Q*nc; }
710         if (op->qf->inmode & CEED_EVAL_GRAD) { in[1] = u_ptr; u_ptr += Q*nc*dim; }
711         if (op->qf->inmode & CEED_EVAL_WEIGHT) { in[4] = u_ptr; u_ptr += Q; }
712         if (op->qf->outmode & CEED_EVAL_INTERP) { out[0] = v_ptr; v_ptr += Q*nc; }
713         if (op->qf->outmode & CEED_EVAL_GRAD) { out[1] = v_ptr; v_ptr += Q*nc*dim; }
714         ierr = CeedQFunctionApply(op->qf, &qd[e*Q*op->qf->qdatasize], Q, in, out);
715         CeedChk(ierr);
716         ierr = CeedBasisApply(op->basis, CEED_TRANSPOSE, op->qf->outmode, BEv,
717                               &Eu[e*op->Erestrict->elemsize*nc]);
718         CeedChk(ierr);
719     }
720     ierr = CeedVectorRestoreArray(etmp, &Eu); CeedChk(ierr);
721     if (residual) {
722         CeedScalar *res;
723         CeedVectorGetArray(residual, CEED_MEM_HOST, &res);
724         for (int i = 0; i < residual->length; i++)
725             res[i] = (CeedScalar)0;
726         ierr = CeedElemRestrictionApply(op->Erestrict, CEED_TRANSPOSE,
727                                         nc, lmode, etmp, residual,
728                                         CEED_REQUEST_IMMEDIATE); CeedChk(ierr);
729     }
730     if (request != CEED_REQUEST_IMMEDIATE && request != CEED_REQUEST_ORDERED)
731         *request = NULL;
732     return 0;
733 }
734 
// Return the operator's quadrature-data vector, creating it on first use.
// Sized as nelem * Q * qdatasize / sizeof(CeedScalar) scalars.
// NOTE(review): assumes qdatasize is a multiple of sizeof(CeedScalar) --
// otherwise the integer division truncates; confirm against callers.
static int CeedOperatorGetQData_Magma(CeedOperator op, CeedVector *qdata) {
    CeedOperator_Magma *impl = op->data;
    int ierr;

    if (!impl->qdata) {
        CeedInt Q;
        ierr = CeedBasisGetNumQuadraturePoints(op->basis, &Q); CeedChk(ierr);
        ierr = CeedVectorCreate(op->ceed,
                                op->Erestrict->nelem * Q
                                * op->qf->qdatasize / sizeof(CeedScalar),
                                &impl->qdata); CeedChk(ierr);
    }
    *qdata = impl->qdata;
    return 0;
}
750 
// Create operator: attach zero-initialized backend data (etmp/qdata created
// lazily) and install the backend entry points.
static int CeedOperatorCreate_Magma(CeedOperator op) {
    CeedOperator_Magma *impl;
    int ierr;

    ierr = CeedCalloc(1, &impl); CeedChk(ierr);
    op->data = impl;
    op->Destroy = CeedOperatorDestroy_Magma;
    op->Apply = CeedOperatorApply_Magma;
    op->GetQData = CeedOperatorGetQData_Magma;
    return 0;
}
762 
763 // *****************************************************************************
764 // * INIT
765 // *****************************************************************************
766 static int CeedInit_Magma(const char *resource, Ceed ceed) {
767     if (strcmp(resource, "/cpu/magma")
768         && strcmp(resource, "/gpu/magma"))
769         return CeedError(ceed, 1, "MAGMA backend cannot use resource: %s", resource);
770 
771     magma_init();
772     //magma_print_environment();
773 
774     ceed->VecCreate = CeedVectorCreate_Magma;
775     ceed->BasisCreateTensorH1 = CeedBasisCreateTensorH1_Magma;
776     ceed->ElemRestrictionCreate = CeedElemRestrictionCreate_Magma;
777     ceed->QFunctionCreate = CeedQFunctionCreate_Magma;
778     ceed->OperatorCreate = CeedOperatorCreate_Magma;
779     return 0;
780 }
781 
// *****************************************************************************
// * REGISTER
// * Runs automatically at library load (GCC/Clang constructor attribute) and
// * registers both MAGMA resource strings with the CEED framework.
// *****************************************************************************
__attribute__((constructor))
static void Register(void) {
    CeedRegister("/cpu/magma", CeedInit_Magma);
    CeedRegister("/gpu/magma", CeedInit_Magma);
}
790