xref: /libCEED/rust/libceed-sys/c-src/backends/magma/ceed-magma.c (revision e2900954d97d770389bd849c0b8ea2ae8cc42900)
1 // Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at
2 // the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights
3 // reserved. See files LICENSE and NOTICE for details.
4 //
5 // This file is part of CEED, a collection of benchmarks, miniapps, software
6 // libraries and APIs for efficient high-order finite element and spectral
7 // element discretizations for exascale applications. For more information and
8 // source code availability see http://github.com/ceed.
9 //
10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11 // a collaborative effort of two U.S. Department of Energy organizations (Office
12 // of Science and the National Nuclear Security Administration) responsible for
13 // the planning and preparation of a capable exascale ecosystem, including
14 // software, applications, hardware, advanced system engineering and early
15 // testbed platforms, in support of the nation's exascale computing imperative.
16 
17 #include <ceed-impl.h>
18 #include <string.h>
19 #include "magma.h"
20 
// Backend-private data of a CeedVector.
typedef struct {
    CeedScalar *array;   // host buffer (pinned via magma_malloc_pinned when owned)
    CeedScalar *darray;  // device buffer (allocated via magma_malloc when owned)
    int own_;            // nonzero when this backend allocated array/darray
} CeedVector_Magma;
26 
// Backend-private data of a CeedElemRestriction.
typedef struct {
    const CeedInt *indices;      // element-to-dof index map used by Apply
    CeedInt *indices_allocated;  // owned copy to free (NULL for USE_POINTER)
} CeedElemRestriction_Magma;
31 
// Backend-private data of a CeedOperator.
typedef struct {
    CeedVector etmp;   // element-local work vector, created lazily in Apply
    CeedVector qdata;  // quadrature data vector, created lazily in GetQData
} CeedOperator_Magma;
36 
37 // *****************************************************************************
38 // * Initialize vector vec (after free mem) with values from array based on cmode
39 // *   CEED_COPY_VALUES: memory is allocated in vec->array_allocated, made equal
40 // *                     to array, and data is copied (not store passed pointer)
41 // *   CEED_OWN_POINTER: vec->data->array_allocated and vec->data->array = array
42 // *   CEED_USE_POINTER: vec->data->array = array (can modify; no ownership)
43 // * mtype: CEED_MEM_HOST or CEED_MEM_DEVICE
44 // *****************************************************************************
45 static int CeedVectorSetArray_Magma(CeedVector vec, CeedMemType mtype,
46                                     CeedCopyMode cmode, CeedScalar *array) {
47     CeedVector_Magma *impl = vec->data;
48     int ierr;
49 
50     // If own data, free that "old" data, e.g., as it may be of different size
51     if (impl->own_){
52         magma_free( impl->darray );
53         magma_free_pinned( impl->array );
54         impl->darray = NULL;
55         impl->array  = NULL;
56         impl->own_ = 0;
57     }
58 
59     if (mtype == CEED_MEM_HOST) {
60         // memory is on the host; own_ = 0
61         switch (cmode) {
62         case CEED_COPY_VALUES:
63             ierr = magma_malloc( (void**)&impl->darray,
64                                  vec->length * sizeof(CeedScalar)); CeedChk(ierr);
65             ierr = magma_malloc_pinned( (void**)&impl->array,
66                                         vec->length * sizeof(CeedScalar)); CeedChk(ierr);
67             impl->own_ = 1;
68 
69             if (array)
70                 magma_setvector(vec->length, sizeof(array[0]),
71                                 array, 1, impl->darray, 1);
72             break;
73         case CEED_OWN_POINTER:
74             ierr = magma_malloc( (void**)&impl->darray,
75                                  vec->length * sizeof(CeedScalar)); CeedChk(ierr);
76             impl->array = array;
77             impl->own_ = 1;
78 
79             if (array)
80                 magma_setvector(vec->length, sizeof(array[0]),
81                                 array, 1, impl->darray, 1);
82             break;
83         case CEED_USE_POINTER:
84             impl->darray = NULL;
85             impl->array  = array;
86         }
87     } else if (mtype == CEED_MEM_DEVICE) {
88         // memory is on the device; own = 0
89         switch (cmode) {
90         case CEED_COPY_VALUES:
91             ierr = magma_malloc( (void**)&impl->darray,
92                                 vec->length * sizeof(CeedScalar)); CeedChk(ierr);
93             ierr = magma_malloc_pinned( (void**)&impl->array,
94                                         vec->length * sizeof(CeedScalar)); CeedChk(ierr);
95             impl->own_ = 1;
96 
97             if (array)
98                 magma_copyvector(vec->length, sizeof(array[0]),
99                                  array, 1, impl->darray, 1);
100             break;
101         case CEED_OWN_POINTER:
102             impl->darray = array;
103             ierr = magma_malloc_pinned( (void**)&impl->array,
104                                         vec->length * sizeof(CeedScalar)); CeedChk(ierr);
105             impl->own_ = 1;
106 
107             break;
108         case CEED_USE_POINTER:
109             impl->darray = array;
110             impl->array  = NULL;
111         }
112 
113     } else
114         return CeedError(vec->ceed, 1, "Only MemType = HOST or DEVICE supported");
115 
116     return 0;
117 }
118 
119 // *****************************************************************************
120 // * Give data pointer from vector vec to array (on HOST or DEVICE)
121 // *****************************************************************************
122 static int CeedVectorGetArray_Magma(CeedVector vec, CeedMemType mtype,
123                                     CeedScalar **array) {
124     CeedVector_Magma *impl = vec->data;
125     int ierr;
126 
127     if (mtype == CEED_MEM_HOST) {
128         if (!impl->array){
129             // Allocate data if array is NULL
130             ierr = CeedVectorSetArray(vec, mtype, CEED_COPY_VALUES, NULL);
131             CeedChk(ierr);
132         } else if (impl->own_) {
133             // data is owned so GPU had the most up-to-date version; copy it
134             magma_getvector(vec->length, sizeof(*array[0]),
135                             impl->darray, 1, impl->array, 1);
136         }
137         *array = impl->array;
138     } else if (mtype == CEED_MEM_DEVICE) {
139         if (!impl->darray){
140             // Allocate data if darray is NULL
141             ierr = CeedVectorSetArray(vec, mtype, CEED_COPY_VALUES, NULL);
142             CeedChk(ierr);
143         }
144         *array = impl->darray;
145     } else
146         return CeedError(vec->ceed, 1, "Can only provide to HOST or DEVICE memory");
147 
148     return 0;
149 }
150 
151 // *****************************************************************************
152 // * Give data pointer from vector vec to array (on HOST or DEVICE) to read it
153 // *****************************************************************************
154 static int CeedVectorGetArrayRead_Magma(CeedVector vec, CeedMemType mtype,
155                                       const CeedScalar **array) {
156     CeedVector_Magma *impl = vec->data;
157     int ierr;
158 
159     if (mtype == CEED_MEM_HOST) {
160         if (!impl->array){
161             // Allocate data if array is NULL
162             ierr = CeedVectorSetArray(vec, mtype, CEED_COPY_VALUES, NULL);
163             CeedChk(ierr);
164         } else if (impl->own_) {
165             // data is owned so GPU had the most up-to-date version; copy it
166             magma_getvector(vec->length, sizeof(*array[0]),
167                             impl->darray, 1, impl->array, 1);
168         }
169         *array = impl->array;
170     } else if (mtype == CEED_MEM_DEVICE) {
171         if (!impl->darray){
172             // Allocate data if darray is NULL
173             ierr = CeedVectorSetArray(vec, mtype, CEED_COPY_VALUES, NULL);
174             CeedChk(ierr);
175         }
176         *array = impl->darray;
177     } else
178         return CeedError(vec->ceed, 1, "Can only provide to HOST or DEVICE memory");
179 
180     return 0;
181 }
182 
183 // *****************************************************************************
184 // * There is no mtype here for array so it is not clear if we restore from HOST
185 // * memory or from DEVICE memory. We assume that it is CPU memory because if
186 // * it was GPU memory we would not call this routine at all.
187 // * Restore vector vec with values from array, where array received its values
188 // * from vec and possibly modified them.
189 // *****************************************************************************
190 static int CeedVectorRestoreArray_Magma(CeedVector vec, CeedScalar **array) {
191     CeedVector_Magma *impl = vec->data;
192 
193     if (impl->darray!=NULL)
194         magma_setvector(vec->length, sizeof(*array[0]),
195                         *array, 1, impl->darray, 1);
196 
197     *array = NULL;
198     return 0;
199 }
200 
201 // *****************************************************************************
202 // * There is no mtype here for array so it is not clear if we restore from HOST
203 // * memory or from DEVICE memory. We assume that it is CPU memory because if
204 // * it was GPU memory we would not call this routine at all.
205 // * Restore vector vec with values from array, where array received its values
206 // * from vec to only read them; in this case vec may have been modified meanwhile
207 // * and needs to be restored here.
208 // *****************************************************************************
209 static int CeedVectorRestoreArrayRead_Magma(CeedVector vec,
210                                             const CeedScalar **array) {
211     CeedVector_Magma *impl = vec->data;
212 
213     if (impl->darray!=NULL)
214         magma_setvector(vec->length, sizeof(*array[0]),
215                         *array, 1, impl->darray, 1);
216 
217     *array = NULL;
218     return 0;
219 }
220 
221 static int CeedVectorDestroy_Magma(CeedVector vec) {
222     CeedVector_Magma *impl = vec->data;
223     int ierr;
224 
225     // Free if we own the data
226     if (impl->own_){
227         ierr = magma_free_pinned(impl->array); CeedChk(ierr);
228         ierr = magma_free(impl->darray);       CeedChk(ierr);
229     }
230     ierr = CeedFree(&vec->data); CeedChk(ierr);
231     return 0;
232 }
233 
234 // *****************************************************************************
235 // * Create vector vec of size n
236 // *****************************************************************************
237 static int CeedVectorCreate_Magma(Ceed ceed, CeedInt n, CeedVector vec) {
238     CeedVector_Magma *impl;
239     int ierr;
240 
241     vec->SetArray = CeedVectorSetArray_Magma;
242     vec->GetArray = CeedVectorGetArray_Magma;
243     vec->GetArrayRead = CeedVectorGetArrayRead_Magma;
244     vec->RestoreArray = CeedVectorRestoreArray_Magma;
245     vec->RestoreArrayRead = CeedVectorRestoreArrayRead_Magma;
246     vec->Destroy = CeedVectorDestroy_Magma;
247     ierr = CeedCalloc(1,&impl); CeedChk(ierr);
248     impl->darray = NULL;
249     impl->array  = NULL;
250     impl->own_ = 0;
251     vec->data = impl;
252     return 0;
253 }
254 
255 // *****************************************************************************
256 // * Apply restriction operator r to u: v = r(rmode) u
257 // *****************************************************************************
258 static int CeedElemRestrictionApply_Magma(CeedElemRestriction r,
259                                         CeedTransposeMode tmode, CeedInt ncomp,
260                                         CeedTransposeMode lmode, CeedVector u,
261                                         CeedVector v, CeedRequest *request) {
262     CeedElemRestriction_Magma *impl = r->data;
263     int ierr;
264     const CeedScalar *uu;
265     CeedScalar *vv;
266     CeedInt esize = r->nelem*r->elemsize;
267     //printf("HELLOOOOOOOOO=======================\n");
268 
269     ierr = CeedVectorGetArrayRead(u, CEED_MEM_HOST, &uu); CeedChk(ierr);
270     ierr = CeedVectorGetArray(v, CEED_MEM_HOST, &vv); CeedChk(ierr);
271     if (tmode == CEED_NOTRANSPOSE) {
272         // Perform: v = r * u
273         if (ncomp == 1) {
274             for (CeedInt i=0; i<esize; i++) vv[i] = uu[impl->indices[i]];
275 
276             /*
277             // Works - in t05 x has to be with CEED_COPY_VALUES
278             CeedVector_Magma *uimpl = u->data, *vimpl = v->data;
279             uu = uimpl->darray;
280             vv = vimpl->darray;
281             CeedInt *indices;// = (int *)impl->indices;
282             magma_malloc( (void**)&indices, esize * sizeof(CeedInt));
283             magma_setvector(esize, sizeof(CeedInt),
284                             (int *)impl->indices, 1, indices, 1);
285             magma_template<<i=0:esize>>(const CeedScalar *uu, CeedScalar *vv, CeedInt *indices)
286               {
287                  vv[i] = uu[indices[i]];
288               }
289 
290             // printf("HELLOOOOOOOOO....... size = %d\n", esize);
291             //
292 
293             if (request != CEED_REQUEST_IMMEDIATE && request != CEED_REQUEST_ORDERED)
294                 *request = NULL;
295             return 0;
296             */
297 
298         } else {
299             //printf("HELLOOOOOOOOO-------------\n");
300             // vv is (elemsize x ncomp x nelem), column-major
301             if (lmode == CEED_NOTRANSPOSE) { // u is (ndof x ncomp), column-major
302                 for (CeedInt e = 0; e < r->nelem; e++)
303                     for (CeedInt d = 0; d < ncomp; d++)
304                         for (CeedInt i=0; i<r->elemsize; i++) {
305                             vv[i+r->elemsize*(d+ncomp*e)] =
306                                 uu[impl->indices[i+r->elemsize*e]+r->ndof*d];
307                         }
308             } else { // u is (ncomp x ndof), column-major
309                 for (CeedInt e = 0; e < r->nelem; e++)
310                     for (CeedInt d = 0; d < ncomp; d++)
311                         for (CeedInt i=0; i<r->elemsize; i++) {
312                             vv[i+r->elemsize*(d+ncomp*e)] =
313                                 uu[d+ncomp*impl->indices[i+r->elemsize*e]];
314                         }
315             }
316         }
317     } else {
318         // Note: in transpose mode, we perform: v += r^t * u
319         if (ncomp == 1) {
320             for (CeedInt i=0; i<esize; i++) vv[impl->indices[i]] += uu[i];
321         } else {
322             //printf("HELLOOOOOOOOO+++++++++++++\n");
323             // u is (elemsize x ncomp x nelem)
324             if (lmode == CEED_NOTRANSPOSE) { // vv is (ndof x ncomp), column-major
325                 for (CeedInt e = 0; e < r->nelem; e++)
326                     for (CeedInt d = 0; d < ncomp; d++)
327                         for (CeedInt i=0; i<r->elemsize; i++) {
328                             vv[impl->indices[i+r->elemsize*e]+r->ndof*d] +=
329                                 uu[i+r->elemsize*(d+e*ncomp)];
330                         }
331             } else { // vv is (ncomp x ndof), column-major
332                 for (CeedInt e = 0; e < r->nelem; e++)
333                     for (CeedInt d = 0; d < ncomp; d++)
334                         for (CeedInt i=0; i<r->elemsize; i++) {
335                             vv[d+ncomp*impl->indices[i+r->elemsize*e]] +=
336                                 uu[i+r->elemsize*(d+e*ncomp)];
337                         }
338             }
339         }
340     }
341     ierr = CeedVectorRestoreArrayRead(u, &uu); CeedChk(ierr);
342     ierr = CeedVectorRestoreArray(v, &vv); CeedChk(ierr);
343     if (request != CEED_REQUEST_IMMEDIATE && request != CEED_REQUEST_ORDERED)
344         *request = NULL;
345     return 0;
346 }
347 
348 static int CeedElemRestrictionDestroy_Magma(CeedElemRestriction r) {
349   CeedElemRestriction_Magma *impl = r->data;
350   int ierr;
351 
352   ierr = CeedFree(&impl->indices_allocated); CeedChk(ierr);
353   ierr = CeedFree(&r->data); CeedChk(ierr);
354   return 0;
355 }
356 
357 static int CeedElemRestrictionCreate_Magma(CeedElemRestriction r,
358                                            CeedMemType mtype,
359                                            CeedCopyMode cmode,
360                                            const CeedInt *indices) {
361     int ierr;
362     CeedElemRestriction_Magma *impl;
363 
364     if (mtype != CEED_MEM_HOST)
365         return CeedError(r->ceed, 1, "Only MemType = HOST supported");
366     ierr = CeedCalloc(1,&impl); CeedChk(ierr);
367     switch (cmode) {
368     case CEED_COPY_VALUES:
369         //printf("CeedElemRestriction_Magma COPY\n");
370         ierr = CeedMalloc(r->nelem*r->elemsize, &impl->indices_allocated);
371         CeedChk(ierr);
372         memcpy(impl->indices_allocated, indices,
373                r->nelem * r->elemsize * sizeof(indices[0]));
374         impl->indices = impl->indices_allocated;
375         break;
376     case CEED_OWN_POINTER:
377         //printf("CeedElemRestriction_Magma OWN\n");
378         impl->indices_allocated = (CeedInt *)indices;
379         impl->indices = impl->indices_allocated;
380         break;
381     case CEED_USE_POINTER:
382         //printf("CeedElemRestriction_Magma USE\n");
383         impl->indices = indices;
384     }
385     r->data = impl;
386     r->Apply = CeedElemRestrictionApply_Magma;
387     r->Destroy = CeedElemRestrictionDestroy_Magma;
388     return 0;
389 }
390 
391 // Contracts on the middle index
392 // NOTRANSPOSE: V_ajc = T_jb U_abc
393 // TRANSPOSE:   V_ajc = T_bj U_abc
394 // If Add != 0, "=" is replaced by "+="
395 static int CeedTensorContract_Magma(Ceed ceed,
396                                     CeedInt A, CeedInt B, CeedInt C, CeedInt J,
397                                     const CeedScalar *t, CeedTransposeMode tmode,
398                                     const CeedInt Add,
399                                     const CeedScalar *u, CeedScalar *v) {
400     CeedInt tstride0 = B, tstride1 = 1;
401     if (tmode == CEED_TRANSPOSE) {
402         tstride0 = 1; tstride1 = J;
403     }
404 
405     for (CeedInt a=0; a<A; a++) {
406         for (CeedInt j=0; j<J; j++) {
407             if (!Add) {
408                 for (CeedInt c=0; c<C; c++)
409                     v[(a*J+j)*C+c] = 0;
410             }
411             for (CeedInt b=0; b<B; b++) {
412                 for (CeedInt c=0; c<C; c++) {
413                     v[(a*J+j)*C+c] += t[j*tstride0 + b*tstride1] * u[(a*B+b)*C+c];
414                 }
415             }
416         }
417     }
418     return 0;
419 }
420 
// Apply the tensor-product basis: interpolation, gradients, and/or quadrature
// weights, selected by the emode bitmask. In CEED_TRANSPOSE mode the output v
// is first zeroed and contributions are accumulated (add=1 on the final
// dimension pass). Operates dimension-by-dimension via CeedTensorContract_Magma,
// ping-ponging between two temporary buffers.
static int CeedBasisApply_Magma(CeedBasis basis, CeedTransposeMode tmode,
                                CeedEvalMode emode,
                                const CeedScalar *u, CeedScalar *v) {
    int ierr;
    const CeedInt dim = basis->dim;
    const CeedInt ndof = basis->ndof;
    // NOTE(review): despite the name, nqpt includes the ndof factor
    // (ndof * Q1d^dim); it is used as the per-eval-mode stride into u/v.
    const CeedInt nqpt = ndof*CeedPowInt(basis->Q1d, dim);
    // Accumulate (+=) only in transpose mode
    const CeedInt add = (tmode == CEED_TRANSPOSE);

    if (tmode == CEED_TRANSPOSE) {
        // Transpose application accumulates into v, so clear it first
        const CeedInt vsize = ndof*CeedPowInt(basis->P1d, dim);
        for (CeedInt i = 0; i < vsize; i++)
            v[i] = (CeedScalar) 0;
    }
    if (emode & CEED_EVAL_INTERP) {
        // Swap P and Q for the transposed contraction
        CeedInt P = basis->P1d, Q = basis->Q1d;
        if (tmode == CEED_TRANSPOSE) {
            P = basis->Q1d; Q = basis->P1d;
        }
        CeedInt pre = ndof*CeedPowInt(P, dim-1), post = 1;
        // VLA ping-pong buffers, sized for the largest intermediate
        CeedScalar tmp[2][ndof*Q*CeedPowInt(P>Q?P:Q, dim-1)];
        for (CeedInt d=0; d<dim; d++) {
            // First pass reads u, last pass writes v; add only on the last pass
            ierr = CeedTensorContract_Magma(basis->ceed, pre, P, post, Q, basis->interp1d,
                                            tmode, add&&(d==dim-1),
                                            d==0?u:tmp[d%2], d==dim-1?v:tmp[(d+1)%2]);
            CeedChk(ierr);
            pre /= P;
            post *= Q;
        }
        // Advance to the next eval-mode segment of the composite u/v buffer
        if (tmode == CEED_NOTRANSPOSE) {
            v += nqpt;
        } else {
            u += nqpt;
        }
    }
    if (emode & CEED_EVAL_GRAD) {
        CeedInt P = basis->P1d, Q = basis->Q1d;
        // In CEED_NOTRANSPOSE mode:
        // u is (P^dim x nc), column-major layout (nc = ndof)
        // v is (Q^dim x nc x dim), column-major layout (nc = ndof)
        // In CEED_TRANSPOSE mode, the sizes of u and v are switched.
        if (tmode == CEED_TRANSPOSE) {
            P = basis->Q1d, Q = basis->P1d;
        }
        CeedScalar tmp[2][ndof*Q*CeedPowInt(P>Q?P:Q, dim-1)];
        // One full dim-pass per gradient direction p: grad1d on dimension p,
        // interp1d on all others
        for (CeedInt p = 0; p < dim; p++) {
            CeedInt pre = ndof*CeedPowInt(P, dim-1), post = 1;
            for (CeedInt d=0; d<dim; d++) {
                ierr = CeedTensorContract_Magma(basis->ceed, pre, P, post, Q,
                                                (p==d)?basis->grad1d:basis->interp1d,
                                                tmode, add&&(d==dim-1),
                                                d==0?u:tmp[d%2], d==dim-1?v:tmp[(d+1)%2]);
                CeedChk(ierr);
                pre /= P;
                post *= Q;
            }
            if (tmode == CEED_NOTRANSPOSE) {
                v += nqpt;
            } else {
                u += nqpt;
            }
        }
    }
    if (emode & CEED_EVAL_WEIGHT) {
        if (tmode == CEED_TRANSPOSE)
            return CeedError(basis->ceed, 1,
                             "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE");
        CeedInt Q = basis->Q1d;
        // Build the tensor-product quadrature weights in place: each dimension
        // multiplies the partial product from the previous dimensions
        for (CeedInt d=0; d<dim; d++) {
            CeedInt pre = CeedPowInt(Q, dim-d-1), post = CeedPowInt(Q, d);
            for (CeedInt i=0; i<pre; i++) {
                for (CeedInt j=0; j<Q; j++) {
                    for (CeedInt k=0; k<post; k++) {
                        v[(i*Q + j)*post + k] = basis->qweight1d[j]
                            * (d == 0 ? 1 : v[(i*Q + j)*post + k]);
                    }
                }
            }
        }
    }
    return 0;
}
503 
// Destroy basis: nothing to free, this backend keeps no basis-private data.
static int CeedBasisDestroy_Magma(CeedBasis basis) {
    return 0;
}
507 
// Create a tensor-product H1 basis: only installs the Apply/Destroy callbacks.
// The 1D matrices (interp1d, grad1d, qref1d, qweight1d) are not copied here;
// Apply reads them through the basis object fields set by the core library.
static int CeedBasisCreateTensorH1_Magma(Ceed ceed, CeedInt dim, CeedInt P1d,
                                         CeedInt Q1d, const CeedScalar *interp1d,
                                         const CeedScalar *grad1d,
                                         const CeedScalar *qref1d,
                                         const CeedScalar *qweight1d,
                                         CeedBasis basis) {
    basis->Apply = CeedBasisApply_Magma;
    basis->Destroy = CeedBasisDestroy_Magma;
    return 0;
}
518 
519 static int CeedQFunctionApply_Magma(CeedQFunction qf, void *qdata, CeedInt Q,
520                                     const CeedScalar *const *u,
521                                     CeedScalar *const *v) {
522     int ierr;
523     ierr = qf->function(qf->ctx, qdata, Q, u, v); CeedChk(ierr);
524     return 0;
525 }
526 
// Destroy qf: nothing to free, this backend keeps no qfunction-private data.
static int CeedQFunctionDestroy_Magma(CeedQFunction qf) {
    return 0;
}
530 
531 static int CeedQFunctionCreate_Magma(CeedQFunction qf) {
532     qf->Apply = CeedQFunctionApply_Magma;
533     qf->Destroy = CeedQFunctionDestroy_Magma;
534     return 0;
535 }
536 
537 static int CeedOperatorDestroy_Magma(CeedOperator op) {
538     CeedOperator_Magma *impl = op->data;
539     int ierr;
540 
541     ierr = CeedVectorDestroy(&impl->etmp); CeedChk(ierr);
542     ierr = CeedVectorDestroy(&impl->qdata); CeedChk(ierr);
543     ierr = CeedFree(&op->data); CeedChk(ierr);
544     return 0;
545 }
546 
547 static int CeedOperatorApply_Magma(CeedOperator op, CeedVector qdata,
548                                    CeedVector ustate,
549                                    CeedVector residual, CeedRequest *request) {
550     CeedOperator_Magma *impl = op->data;
551     CeedVector etmp;
552     CeedInt Q;
553     const CeedInt nc = op->basis->ndof, dim = op->basis->dim;
554     CeedScalar *Eu;
555     char *qd;
556     int ierr;
557     CeedTransposeMode lmode = CEED_NOTRANSPOSE;
558 
559     if (!impl->etmp) {
560         ierr = CeedVectorCreate(op->ceed,
561                                 nc * op->Erestrict->nelem * op->Erestrict->elemsize,
562                                 &impl->etmp); CeedChk(ierr);
563         // etmp is allocated when CeedVectorGetArray is called below
564     }
565     etmp = impl->etmp;
566     if (op->qf->inmode & ~CEED_EVAL_WEIGHT) {
567         ierr = CeedElemRestrictionApply(op->Erestrict, CEED_NOTRANSPOSE,
568                                         nc, lmode, ustate, etmp,
569                                         CEED_REQUEST_IMMEDIATE); CeedChk(ierr);
570     }
571     ierr = CeedBasisGetNumQuadraturePoints(op->basis, &Q); CeedChk(ierr);
572     ierr = CeedVectorGetArray(etmp, CEED_MEM_HOST, &Eu); CeedChk(ierr);
573     ierr = CeedVectorGetArray(qdata, CEED_MEM_HOST, (CeedScalar**)&qd);
574     CeedChk(ierr);
575     for (CeedInt e=0; e<op->Erestrict->nelem; e++) {
576         CeedScalar BEu[Q*nc*(dim+2)], BEv[Q*nc*(dim+2)], *out[5] = {0,0,0,0,0};
577         const CeedScalar *in[5] = {0,0,0,0,0};
578         // TODO: quadrature weights can be computed just once
579         ierr = CeedBasisApply(op->basis, CEED_NOTRANSPOSE, op->qf->inmode,
580                               &Eu[e*op->Erestrict->elemsize*nc], BEu);
581         CeedChk(ierr);
582         CeedScalar *u_ptr = BEu, *v_ptr = BEv;
583         if (op->qf->inmode & CEED_EVAL_INTERP) { in[0] = u_ptr; u_ptr += Q*nc; }
584         if (op->qf->inmode & CEED_EVAL_GRAD) { in[1] = u_ptr; u_ptr += Q*nc*dim; }
585         if (op->qf->inmode & CEED_EVAL_WEIGHT) { in[4] = u_ptr; u_ptr += Q; }
586         if (op->qf->outmode & CEED_EVAL_INTERP) { out[0] = v_ptr; v_ptr += Q*nc; }
587         if (op->qf->outmode & CEED_EVAL_GRAD) { out[1] = v_ptr; v_ptr += Q*nc*dim; }
588         ierr = CeedQFunctionApply(op->qf, &qd[e*Q*op->qf->qdatasize], Q, in, out);
589         CeedChk(ierr);
590         ierr = CeedBasisApply(op->basis, CEED_TRANSPOSE, op->qf->outmode, BEv,
591                               &Eu[e*op->Erestrict->elemsize*nc]);
592         CeedChk(ierr);
593     }
594     ierr = CeedVectorRestoreArray(etmp, &Eu); CeedChk(ierr);
595     if (residual) {
596         CeedScalar *res;
597         CeedVectorGetArray(residual, CEED_MEM_HOST, &res);
598         for (int i = 0; i < residual->length; i++)
599             res[i] = (CeedScalar)0;
600         ierr = CeedElemRestrictionApply(op->Erestrict, CEED_TRANSPOSE,
601                                         nc, lmode, etmp, residual,
602                                         CEED_REQUEST_IMMEDIATE); CeedChk(ierr);
603     }
604     if (request != CEED_REQUEST_IMMEDIATE && request != CEED_REQUEST_ORDERED)
605         *request = NULL;
606     return 0;
607 }
608 
609 static int CeedOperatorGetQData_Magma(CeedOperator op, CeedVector *qdata) {
610     CeedOperator_Magma *impl = op->data;
611     int ierr;
612 
613     if (!impl->qdata) {
614         CeedInt Q;
615         ierr = CeedBasisGetNumQuadraturePoints(op->basis, &Q); CeedChk(ierr);
616         ierr = CeedVectorCreate(op->ceed,
617                                 op->Erestrict->nelem * Q
618                                 * op->qf->qdatasize / sizeof(CeedScalar),
619                                 &impl->qdata); CeedChk(ierr);
620     }
621     *qdata = impl->qdata;
622     return 0;
623 }
624 
625 static int CeedOperatorCreate_Magma(CeedOperator op) {
626     CeedOperator_Magma *impl;
627     int ierr;
628 
629     ierr = CeedCalloc(1, &impl); CeedChk(ierr);
630     op->data = impl;
631     op->Destroy = CeedOperatorDestroy_Magma;
632     op->Apply = CeedOperatorApply_Magma;
633     op->GetQData = CeedOperatorGetQData_Magma;
634     return 0;
635 }
636 
637 // *****************************************************************************
638 // * INIT
639 // *****************************************************************************
640 static int CeedInit_Magma(const char *resource, Ceed ceed) {
641     if (strcmp(resource, "/cpu/magma")
642         && strcmp(resource, "/gpu/magma"))
643         return CeedError(ceed, 1, "MAGMA backend cannot use resource: %s", resource);
644 
645     magma_init();
646     //magma_print_environment();
647 
648     ceed->VecCreate = CeedVectorCreate_Magma;
649     ceed->BasisCreateTensorH1 = CeedBasisCreateTensorH1_Magma;
650     ceed->ElemRestrictionCreate = CeedElemRestrictionCreate_Magma;
651     ceed->QFunctionCreate = CeedQFunctionCreate_Magma;
652     ceed->OperatorCreate = CeedOperatorCreate_Magma;
653     return 0;
654 }
655 
// *****************************************************************************
// * REGISTER
// * Runs automatically at library load (GCC/Clang constructor attribute) and
// * registers both resource strings against the same init function.
// *****************************************************************************
__attribute__((constructor))
static void Register(void) {
    CeedRegister("/cpu/magma", CeedInit_Magma);
    CeedRegister("/gpu/magma", CeedInit_Magma);
}
664