xref: /libCEED/rust/libceed-sys/c-src/backends/magma/ceed-magma.c (revision 2a8473592283e03ded13afe54b22c2648d2eb0c8)
1 // Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at
2 // the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights
3 // reserved. See files LICENSE and NOTICE for details.
4 //
5 // This file is part of CEED, a collection of benchmarks, miniapps, software
6 // libraries and APIs for efficient high-order finite element and spectral
7 // element discretizations for exascale applications. For more information and
8 // source code availability see http://github.com/ceed.
9 //
10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11 // a collaborative effort of two U.S. Department of Energy organizations (Office
12 // of Science and the National Nuclear Security Administration) responsible for
13 // the planning and preparation of a capable exascale ecosystem, including
14 // software, applications, hardware, advanced system engineering and early
15 // testbed platforms, in support of the nation's exascale computing imperative.
16 
17 #include "ceed-magma.h"
18 #include <string.h>
19 
// Backend data for a CeedVector: the data lives as a host/device mirror pair.
typedef struct {
  CeedScalar *array;    // host copy of the data (pinned memory when owned)
  CeedScalar *darray;   // device (GPU) copy of the data
  int  own_;            // 1 if we own both array and darray and must free them
  int down_;            // 1 if we own only darray (host CEED_USE_POINTER case)
} CeedVector_Magma;
26 
// Backend data for a CeedElemRestriction: indices as a host/device mirror pair.
typedef struct {
  CeedInt *indices;     // host copy of the element indices (pinned when owned)
  CeedInt *dindices;    // device (GPU) copy of the element indices
  int  own_;            // 1 if we own both indices and dindices
  int down_;            // cover a case where we own Device memory only
                        // (host CEED_USE_POINTER case)
} CeedElemRestriction_Magma;
33 
// Backend data for a CeedOperator: work vectors created lazily on first use.
typedef struct {
  CeedVector etmp;   // E-vector scratch, created on first Apply
  CeedVector qdata;  // quadrature data vector, created on first GetQData
} CeedOperator_Magma;
38 
39 // *****************************************************************************
40 // * Initialize vector vec (after free mem) with values from array based on cmode
41 // *   CEED_COPY_VALUES: memory is allocated in vec->array_allocated, made equal
42 // *                     to array, and data is copied (not store passed pointer)
43 // *   CEED_OWN_POINTER: vec->data->array_allocated and vec->data->array = array
44 // *   CEED_USE_POINTER: vec->data->array = array (can modify; no ownership)
45 // * mtype: CEED_MEM_HOST or CEED_MEM_DEVICE
46 // *****************************************************************************
47 static int CeedVectorSetArray_Magma(CeedVector vec, CeedMemType mtype,
48                                     CeedCopyMode cmode, CeedScalar *array) {
49   CeedVector_Magma *impl = vec->data;
50   int ierr;
51 
52   // If own data, free the "old" data, e.g., as it may be of different size
53   if (impl->own_) {
54     magma_free( impl->darray );
55     magma_free_pinned( impl->array );
56     impl->darray = NULL;
57     impl->array  = NULL;
58     impl->own_ = 0;
59     impl->down_= 0;
60   }
61 
62   if (mtype == CEED_MEM_HOST) {
63     // memory is on the host; own_ = 0
64     switch (cmode) {
65     case CEED_COPY_VALUES:
66       ierr = magma_malloc( (void**)&impl->darray,
67                            vec->length * sizeof(CeedScalar)); CeedChk(ierr);
68       ierr = magma_malloc_pinned( (void**)&impl->array,
69                                   vec->length * sizeof(CeedScalar)); CeedChk(ierr);
70       impl->own_ = 1;
71 
72       if (array != NULL)
73         magma_setvector(vec->length, sizeof(array[0]),
74                         array, 1, impl->darray, 1);
75       break;
76     case CEED_OWN_POINTER:
77       ierr = magma_malloc( (void**)&impl->darray,
78                            vec->length * sizeof(CeedScalar)); CeedChk(ierr);
79       // TODO: possible problem here is if we are passed non-pinned memory;
80       //       (as we own it, lter in destroy, we use free for pinned memory).
81       impl->array = array;
82       impl->own_ = 1;
83 
84       if (array != NULL)
85         magma_setvector(vec->length, sizeof(array[0]),
86                         array, 1, impl->darray, 1);
87       break;
88     case CEED_USE_POINTER:
89       ierr = magma_malloc( (void**)&impl->darray,
90                            vec->length * sizeof(CeedScalar)); CeedChk(ierr);
91       magma_setvector(vec->length, sizeof(array[0]),
92                       array, 1, impl->darray, 1);
93 
94       impl->down_  = 1;
95       impl->array  = array;
96     }
97   } else if (mtype == CEED_MEM_DEVICE) {
98     // memory is on the device; own = 0
99     switch (cmode) {
100     case CEED_COPY_VALUES:
101       ierr = magma_malloc( (void**)&impl->darray,
102                            vec->length * sizeof(CeedScalar)); CeedChk(ierr);
103       ierr = magma_malloc_pinned( (void**)&impl->array,
104                                   vec->length * sizeof(CeedScalar)); CeedChk(ierr);
105       impl->own_ = 1;
106 
107       if (array)
108           magma_copyvector(vec->length, sizeof(array[0]),
109                            array, 1, impl->darray, 1);
110       else
111           // t30 assumes allocation initializes with 0s
112           magma_setvector(vec->length, sizeof(array[0]),
113                           impl->array, 1, impl->darray, 1);
114       break;
115     case CEED_OWN_POINTER:
116       impl->darray = array;
117       ierr = magma_malloc_pinned( (void**)&impl->array,
118                                   vec->length * sizeof(CeedScalar)); CeedChk(ierr);
119       impl->own_ = 1;
120 
121       break;
122     case CEED_USE_POINTER:
123       impl->darray = array;
124       impl->array  = NULL;
125     }
126 
127   } else
128     return CeedError(vec->ceed, 1, "Only MemType = HOST or DEVICE supported");
129 
130   return 0;
131 }
132 
// *****************************************************************************
// * Give data pointer from vector vec to array (on HOST or DEVICE)
// * Allocates storage on first use (array == NULL && darray == NULL);
// * errors if the vector only borrows memory in the other memory space.
// *****************************************************************************
static int CeedVectorGetArray_Magma(CeedVector vec, CeedMemType mtype,
                                    CeedScalar **array) {
  CeedVector_Magma *impl = vec->data;
  int ierr;

  if (mtype == CEED_MEM_HOST) {
    if (impl->own_) {
        // data is owned so GPU had the most up-to-date version; copy it down
        // into the pinned host buffer before handing the host pointer out
        // TTT - apparently it doesn't have most up to date data
        magma_getvector(vec->length, sizeof(*array[0]),
                        impl->darray, 1, impl->array, 1);
        CeedDebug("\033[31m[CeedVectorGetArray_Magma]");
    } else if (impl->array == NULL) {
      // Vector doesn't own the data and was set on GPU
      if (impl->darray == NULL) {
        // Neither mirror exists: call was made just to allocate memory
        ierr = CeedVectorSetArray(vec, mtype, CEED_COPY_VALUES, NULL);
        CeedChk(ierr);
      } else
        return CeedError(vec->ceed, 1, "Can not access DEVICE vector on HOST");
    }
    *array = impl->array;
  } else if (mtype == CEED_MEM_DEVICE) {
    if (impl->darray == NULL) {
      // Vector doesn't own the data and was set on the CPU
      if (impl->array == NULL) {
        // Neither mirror exists: call was made just to allocate memory
        ierr = CeedVectorSetArray(vec, mtype, CEED_COPY_VALUES, NULL);
        CeedChk(ierr);
      } else
        return CeedError(vec->ceed, 1, "Can not access HOST vector on DEVICE");
    }
    *array = impl->darray;
  } else
    return CeedError(vec->ceed, 1, "Can only provide to HOST or DEVICE memory");

  return 0;
}
175 
// *****************************************************************************
// * Give data pointer from vector vec to array (on HOST or DEVICE) to read it
// * Read-only variant of CeedVectorGetArray_Magma; same allocation-on-demand
// * and cross-memory-space error behavior.
// *****************************************************************************
static int CeedVectorGetArrayRead_Magma(CeedVector vec, CeedMemType mtype,
                                        const CeedScalar **array) {
  CeedVector_Magma *impl = vec->data;
  int ierr;

  if (mtype == CEED_MEM_HOST) {
    if (impl->own_) {
      // data is owned so GPU had the most up-to-date version; copy it down
      // into the pinned host buffer before handing the host pointer out
      magma_getvector(vec->length, sizeof(*array[0]),
                      impl->darray, 1, impl->array, 1);
    } else if (impl->array == NULL) {
      // Vector doesn't own the data and was set on GPU
      if (impl->darray == NULL) {
        // Neither mirror exists: call was made just to allocate memory
        ierr = CeedVectorSetArray(vec, mtype, CEED_COPY_VALUES, NULL);
        CeedChk(ierr);
      } else
        return CeedError(vec->ceed, 1, "Can not access DEVICE vector on HOST");
    }
    *array = impl->array;
  } else if (mtype == CEED_MEM_DEVICE) {
    if (impl->darray == NULL) {
      // Vector doesn't own the data and was set on the CPU
      if (impl->array == NULL) {
        // Neither mirror exists: call was made just to allocate memory
        ierr = CeedVectorSetArray(vec, mtype, CEED_COPY_VALUES, NULL);
        CeedChk(ierr);
      } else
        return CeedError(vec->ceed, 1, "Can not access HOST vector on DEVICE");
    }
    *array = impl->darray;
  } else
    return CeedError(vec->ceed, 1, "Can only provide to HOST or DEVICE memory");

  return 0;
}
215 
// *****************************************************************************
// * There is no mtype here for array so it is not clear if we restore from HOST
// * memory or from DEVICE memory. We assume that it is CPU memory because if
// * it was GPU memory we would not call this routine at all.
// * Restore vector vec with values from array, where array received its values
// * from vec and possibly modified them.
// *****************************************************************************
static int CeedVectorRestoreArray_Magma(CeedVector vec, CeedScalar **array) {
  CeedVector_Magma *impl = vec->data;

  // Check if the array is a CPU pointer (the host mirror we handed out)
  if (*array == impl->array) {
    // Push the possibly-modified host values back to the device mirror,
    // if the device pointer is not NULL
    if (impl->darray != NULL) {
      magma_setvector(vec->length, sizeof(*array[0]),
                      *array, 1, impl->darray, 1);
    } else {
      // nothing to do (case of CPU use pointer with no device mirror)
    }

  } else if (impl->down_) {
    // array was the GPU pointer; when down_=1 (host CEED_USE_POINTER case)
    // refresh the caller's host buffer from the device copy
    magma_getvector(vec->length, sizeof(*array[0]),
                    impl->darray, 1, impl->array, 1);
  }

  // Invalidate the caller's pointer
  *array = NULL;
  return 0;
}
245 
// *****************************************************************************
// * There is no mtype here for array so it is not clear if we restore from HOST
// * memory or from DEVICE memory. We assume that it is CPU memory because if
// * it was GPU memory we would not call this routine at all.
// * Restore vector vec with values from array, where array received its values
// * from vec to only read them; in this case vec may have been modified meanwhile
// * and needs to be restored here.
// *****************************************************************************
static int CeedVectorRestoreArrayRead_Magma(CeedVector vec,
    const CeedScalar **array) {
  CeedVector_Magma *impl = vec->data;

  // Check if the array is a CPU pointer (the host mirror we handed out)
  if (*array == impl->array) {
    // Sync the host values back to the device mirror,
    // if the device pointer is not NULL
    if (impl->darray != NULL) {
      magma_setvector(vec->length, sizeof(*array[0]),
                      *array, 1, impl->darray, 1);
    } else {
      // nothing to do (case of CPU use pointer with no device mirror)
    }

  } else if (impl->down_) {
    // array was the GPU pointer; when down_=1 (host CEED_USE_POINTER case)
    // refresh the caller's host buffer from the device copy
    magma_getvector(vec->length, sizeof(*array[0]),
                    impl->darray, 1, impl->array, 1);
  }

  // Invalidate the caller's pointer
  *array = NULL;
  return 0;
}
277 
278 static int CeedVectorDestroy_Magma(CeedVector vec) {
279   CeedVector_Magma *impl = vec->data;
280   int ierr;
281 
282   // Free if we own the data
283   if (impl->own_) {
284     ierr = magma_free_pinned(impl->array); CeedChk(ierr);
285     ierr = magma_free(impl->darray);       CeedChk(ierr);
286   } else if (impl->down_) {
287     ierr = magma_free(impl->darray);       CeedChk(ierr);
288   }
289   ierr = CeedFree(&vec->data); CeedChk(ierr);
290   return 0;
291 }
292 
293 // *****************************************************************************
294 // * Create vector vec of size n
295 // *****************************************************************************
296 static int CeedVectorCreate_Magma(Ceed ceed, CeedInt n, CeedVector vec) {
297   CeedVector_Magma *impl;
298   int ierr;
299 
300   vec->SetArray = CeedVectorSetArray_Magma;
301   vec->GetArray = CeedVectorGetArray_Magma;
302   vec->GetArrayRead = CeedVectorGetArrayRead_Magma;
303   vec->RestoreArray = CeedVectorRestoreArray_Magma;
304   vec->RestoreArrayRead = CeedVectorRestoreArrayRead_Magma;
305   vec->Destroy = CeedVectorDestroy_Magma;
306   ierr = CeedCalloc(1,&impl); CeedChk(ierr);
307   impl->darray = NULL;
308   impl->array  = NULL;
309   impl->own_ = 0;
310   impl->down_= 0;
311   vec->data = impl;
312   return 0;
313 }
314 
315 // *****************************************************************************
316 // * Apply restriction operator r to u: v = r(rmode) u
317 // *****************************************************************************
318 static int CeedElemRestrictionApply_Magma(CeedElemRestriction r,
319     CeedTransposeMode tmode, CeedInt ncomp,
320     CeedTransposeMode lmode, CeedVector u,
321     CeedVector v, CeedRequest *request) {
322   CeedElemRestriction_Magma *impl = r->data;
323   int ierr;
324   const CeedScalar *uu;
325   CeedScalar *vv;
326   CeedInt nelem = r->nelem, elemsize = r->elemsize, ndof = r->ndof;
327   CeedInt esize = nelem * elemsize;
328 
329   #ifdef USE_MAGMA_BATCH2
330   CeedInt *dindices = impl->dindices;
331   // Get pointers on the device
332   ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &uu); CeedChk(ierr);
333   ierr = CeedVectorGetArray(v, CEED_MEM_DEVICE, &vv); CeedChk(ierr);
334   #else
335   CeedInt *indices = impl->indices;
336   ierr = CeedVectorGetArrayRead(u, CEED_MEM_HOST, &uu); CeedChk(ierr);
337   ierr = CeedVectorGetArray(v, CEED_MEM_HOST, &vv); CeedChk(ierr);
338   #endif
339 
340   if (tmode == CEED_NOTRANSPOSE) {
341     // Perform: v = r * u
342     if (ncomp == 1) {
343         #ifdef USE_MAGMA_BATCH2
344             magma_template<<i=0:esize>>
345                 (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices) {
346                     vv[i] = uu[dindices[i]];
347             }
348         #else
349            for (CeedInt i=0; i<esize; i++) vv[i] = uu[indices[i]];
350         #endif
351     } else {
352       // vv is (elemsize x ncomp x nelem), column-major
353       if (lmode == CEED_NOTRANSPOSE) { // u is (ndof x ncomp), column-major
354           #ifdef USE_MAGMA_BATCH2
355           magma_template<<e=0:nelem, d=0:ncomp, i=0:elemsize>>
356               (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices, int ndof) {
357               vv[i + iend*(d+dend*e)] = uu[dindices[i+iend*e]+ndof*d];
358           }
359           #else
360           for (CeedInt e = 0; e < nelem; e++)
361               for (CeedInt d = 0; d < ncomp; d++)
362                   for (CeedInt i=0; i < elemsize; i++) {
363                       vv[i + elemsize*(d+ncomp*e)] =
364                           uu[indices[i+elemsize*e]+ndof*d];
365                   }
366           #endif
367       } else { // u is (ncomp x ndof), column-major
368           #ifdef USE_MAGMA_BATCH2
369           magma_template<<e=0:nelem, d=0:ncomp, i=0:elemsize>>
370               (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices) {
371               vv[i + iend*(d+dend*e)] = uu[d+dend*dindices[i + iend*e]];
372           }
373           #else
374           for (CeedInt e = 0; e < nelem; e++)
375               for (CeedInt d = 0; d < ncomp; d++)
376                   for (CeedInt i=0; i< elemsize; i++) {
377                       vv[i + elemsize*(d+ncomp*e)] =
378                           uu[d+ncomp*indices[i+elemsize*e]];
379                   }
380           #endif
381       }
382     }
383   } else {
384     // Note: in transpose mode, we perform: v += r^t * u
385     if (ncomp == 1) {
386         // fprintf(stderr,"3 ---------\n");
387         #ifdef USE_MAGMA_BATCH2
388         magma_template<<i=0:esize>>
389             (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices) {
390             magmablas_datomic_add( &vv[dindices[i]], uu[i]);
391         }
392         #else
393         for (CeedInt i=0; i<esize; i++) vv[indices[i]] += uu[i];
394         #endif
395     } else { // u is (elemsize x ncomp x nelem)
396         fprintf(stderr,"2 ---------\n");
397 
398         if (lmode == CEED_NOTRANSPOSE) { // vv is (ndof x ncomp), column-major
399             #ifdef USE_MAGMA_BATCH2
400             magma_template<<e=0:nelem, d=0:ncomp, i=0:elemsize>>
401                 (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices, CeedInt ndof) {
402                 magmablas_datomic_add( &vv[dindices[i+iend*e]+ndof*d],
403                                        uu[i+iend*(d+e*dend)]);
404             }
405             #else
406             for (CeedInt e = 0; e < nelem; e++)
407                 for (CeedInt d = 0; d < ncomp; d++)
408                     for (CeedInt i=0; i < elemsize; i++) {
409                         vv[indices[i + elemsize*e]+ndof*d] +=
410                             uu[i + elemsize*(d+e*ncomp)];
411                     }
412             #endif
413         } else { // vv is (ncomp x ndof), column-major
414             #ifdef USE_MAGMA_BATCH2
415             magma_template<<e=0:nelem, d=0:ncomp, i=0:elemsize>>
416                 (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices) {
417                 magmablas_datomic_add( &vv[d+dend*dindices[i + iend*e]],
418                                        uu[i+iend*(d+e*dend)]);
419             }
420             #else
421             for (CeedInt e = 0; e < nelem; e++)
422                 for (CeedInt d = 0; d < ncomp; d++)
423                     for (CeedInt i=0; i < elemsize; i++) {
424                         vv[d+ncomp*indices[i + elemsize*e]] +=
425                             uu[i + elemsize*(d+e*ncomp)];
426                     }
427             #endif
428         }
429     }
430   }
431 
432   ierr = CeedVectorRestoreArrayRead(u, &uu); CeedChk(ierr);
433   ierr = CeedVectorRestoreArray(v, &vv); CeedChk(ierr);
434 
435   if (request != CEED_REQUEST_IMMEDIATE && request != CEED_REQUEST_ORDERED)
436     *request = NULL;
437   return 0;
438 }
439 
440 static int CeedElemRestrictionDestroy_Magma(CeedElemRestriction r) {
441   CeedElemRestriction_Magma *impl = r->data;
442   int ierr;
443 
444   // Free if we own the data
445   if (impl->own_) {
446     ierr = magma_free_pinned(impl->indices); CeedChk(ierr);
447     ierr = magma_free(impl->dindices);       CeedChk(ierr);
448   } else if (impl->down_) {
449     ierr = magma_free(impl->dindices);       CeedChk(ierr);
450   }
451   ierr = CeedFree(&r->data); CeedChk(ierr);
452   return 0;
453 }
454 
// *****************************************************************************
// * Create element restriction r from indices, mirroring the copy-mode
// * semantics of CeedVectorSetArray_Magma (see the ownership flags own_/down_).
// *****************************************************************************
static int CeedElemRestrictionCreate_Magma(CeedElemRestriction r,
    CeedMemType mtype,
    CeedCopyMode cmode,
    const CeedInt *indices) {
  int ierr, size = r->nelem*r->elemsize;
  CeedElemRestriction_Magma *impl;

  // Allocate memory for the MAGMA Restriction and initialize pointers to NULL
  ierr = CeedCalloc(1,&impl); CeedChk(ierr);
  impl->dindices = NULL;
  impl->indices  = NULL;
  impl->own_ = 0;
  impl->down_= 0;

  if (mtype == CEED_MEM_HOST) {
    // memory is on the host
    switch (cmode) {
    case CEED_COPY_VALUES:
      // Allocate both mirrors; copy the indices to each if provided
      ierr = magma_malloc( (void**)&impl->dindices,
                           size * sizeof(CeedInt)); CeedChk(ierr);
      ierr = magma_malloc_pinned( (void**)&impl->indices,
                                  size * sizeof(CeedInt)); CeedChk(ierr);
      impl->own_ = 1;

      if (indices != NULL) {
          memcpy(impl->indices, indices, size * sizeof(indices[0]));
          magma_setvector(size, sizeof(CeedInt),
                          impl->indices, 1, impl->dindices, 1);
      }
      break;
    case CEED_OWN_POINTER:
      ierr = magma_malloc( (void**)&impl->dindices,
                           size * sizeof(CeedInt)); CeedChk(ierr);
      // TODO: possible problem here is if we are passed non-pinned memory;
      //       (as we own it, later in destroy, we use free for pinned memory).
      impl->indices = (CeedInt *)indices;
      impl->own_ = 1;

      if (indices != NULL)
        magma_setvector(size, sizeof(CeedInt),
                        indices, 1, impl->dindices, 1);
      break;
    case CEED_USE_POINTER:
      // We own only the device mirror (down_ = 1); host pointer is borrowed
      ierr = magma_malloc( (void**)&impl->dindices,
                           size * sizeof(CeedInt)); CeedChk(ierr);
      magma_setvector(size, sizeof(CeedInt),
                      indices, 1, impl->dindices, 1);
      impl->down_ = 1;
      impl->indices  = (CeedInt *)indices;
    }
  } else if (mtype == CEED_MEM_DEVICE) {
    // memory is on the device
    switch (cmode) {
    case CEED_COPY_VALUES:
      ierr = magma_malloc( (void**)&impl->dindices,
                           size * sizeof(CeedInt)); CeedChk(ierr);
      ierr = magma_malloc_pinned( (void**)&impl->indices,
                                  size * sizeof(CeedInt)); CeedChk(ierr);
      impl->own_ = 1;

      if (indices)
        magma_copyvector(size, sizeof(CeedInt),
                         indices, 1, impl->dindices, 1);
      break;
    case CEED_OWN_POINTER:
      // Take ownership of the device pointer; keep a pinned host mirror
      impl->dindices = (CeedInt *)indices;
      ierr = magma_malloc_pinned( (void**)&impl->indices,
                                  size * sizeof(CeedInt)); CeedChk(ierr);
      impl->own_ = 1;

      break;
    case CEED_USE_POINTER:
      // Borrow the device pointer only; no host mirror
      impl->dindices = (CeedInt *)indices;
      impl->indices  = NULL;
    }

  } else
    return CeedError(r->ceed, 1, "Only MemType = HOST or DEVICE supported");

  r->data    = impl;
  r->Apply   = CeedElemRestrictionApply_Magma;
  r->Destroy = CeedElemRestrictionDestroy_Magma;

  return 0;
}
540 
541 // Contracts on the middle index
542 // NOTRANSPOSE: V_ajc = T_jb U_abc
543 // TRANSPOSE:   V_ajc = T_bj U_abc
544 // If Add != 0, "=" is replaced by "+="
545 static int CeedTensorContract_Magma(Ceed ceed,
546                                     CeedInt A, CeedInt B, CeedInt C, CeedInt J,
547                                     const CeedScalar *t, CeedTransposeMode tmode,
548                                     const CeedInt Add,
549                                     const CeedScalar *u, CeedScalar *v) {
550 #ifdef USE_MAGMA_BATCH
551   magma_dtensor_contract(ceed, A, B, C, J, t, tmode, Add, u, v);
552 #else
553   CeedInt tstride0 = B, tstride1 = 1;
554   if (tmode == CEED_TRANSPOSE) {
555     tstride0 = 1; tstride1 = J;
556   }
557   CeedDebug("\033[31m[CeedTensorContract] A=%d, J=%d, C=%d, B=%d: %d %d %d",
558             A,J,C,B,A*J*B*C, C*J*A, C*B*A);
559   for (CeedInt a=0; a<A; a++) {
560     for (CeedInt j=0; j<J; j++) {
561       if (!Add) {
562         for (CeedInt c=0; c<C; c++)
563           v[(a*J+j)*C+c] = 0;
564       }
565       for (CeedInt b=0; b<B; b++) {
566         for (CeedInt c=0; c<C; c++) {
567           v[(a*J+j)*C+c] += t[j*tstride0 + b*tstride1] * u[(a*B+b)*C+c];
568         }
569       }
570     }
571   }
572 #endif
573   return 0;
574 }
575 
// Apply the tensor-product basis (interp / grad / quadrature weights) to u,
// writing v. In CEED_TRANSPOSE mode results accumulate into v (after zeroing),
// and u/v swap roles. The dim-dimensional operator is applied as dim successive
// 1-D contractions via CeedTensorContract_Magma.
static int CeedBasisApply_Magma(CeedBasis basis, CeedTransposeMode tmode,
                                CeedEvalMode emode,
                                const CeedScalar *u, CeedScalar *v) {
  int ierr;
  const CeedInt dim = basis->dim;
  const CeedInt ndof = basis->ndof;
  const CeedInt nqpt = ndof*CeedPowInt(basis->Q1d, dim);  // Q-vector block size
  const CeedInt add = (tmode == CEED_TRANSPOSE);          // accumulate on last contraction

  CeedDebug("\033[01m[CeedBasisApply_Magma] vsize=%d",
            ndof*CeedPowInt(basis->P1d, dim));

  if (tmode == CEED_TRANSPOSE) {
    // Transpose mode accumulates, so the output must start from zero
    const CeedInt vsize = ndof*CeedPowInt(basis->P1d, dim);
    for (CeedInt i = 0; i < vsize; i++)
      v[i] = (CeedScalar) 0;
  }
  if (emode & CEED_EVAL_INTERP) {
    // P/Q are swapped in transpose mode (quadrature -> dofs)
    CeedInt P = basis->P1d, Q = basis->Q1d;
    if (tmode == CEED_TRANSPOSE) {
      P = basis->Q1d; Q = basis->P1d;
    }
    CeedInt pre = ndof*CeedPowInt(P, dim-1), post = 1;
    // Double buffer for intermediate contraction results (ping-pong via d%2)
    CeedScalar tmp[2][ndof*Q*CeedPowInt(P>Q?P:Q, dim-1)];
    CeedDebug("\033[01m[CeedBasisApply_Magma] tmpsize = %d",
              ndof*Q*CeedPowInt(P>Q?P:Q, dim-1));
    for (CeedInt d=0; d<dim; d++) {
      ierr = CeedTensorContract_Magma(basis->ceed, pre, P, post, Q, basis->interp1d,
                                      tmode, add&&(d==dim-1),
                                      d==0?u:tmp[d%2], d==dim-1?v:tmp[(d+1)%2]);
      CeedChk(ierr);
      pre /= P;
      post *= Q;
    }
    // Advance past the interp block so a following GRAD eval writes/reads
    // the next section of the Q-vector
    if (tmode == CEED_NOTRANSPOSE) {
      v += nqpt;
    } else {
      u += nqpt;
    }
  }
  if (emode & CEED_EVAL_GRAD) {
    CeedInt P = basis->P1d, Q = basis->Q1d;
    // In CEED_NOTRANSPOSE mode:
    // u is (P^dim x nc), column-major layout (nc = ndof)
    // v is (Q^dim x nc x dim), column-major layout (nc = ndof)
    // In CEED_TRANSPOSE mode, the sizes of u and v are switched.
    if (tmode == CEED_TRANSPOSE) {
      P = basis->Q1d, Q = basis->P1d;
    }
    CeedScalar tmp[2][ndof*Q*CeedPowInt(P>Q?P:Q, dim-1)];
    CeedDebug("\033[01m[CeedBasisApply_Magma] tmpsize = %d",
              ndof*Q*CeedPowInt(P>Q?P:Q, dim-1));
    // One pass per derivative direction p: grad1d along p, interp1d elsewhere
    for (CeedInt p = 0; p < dim; p++) {
      CeedInt pre = ndof*CeedPowInt(P, dim-1), post = 1;
      for (CeedInt d=0; d<dim; d++) {
        ierr = CeedTensorContract_Magma(basis->ceed, pre, P, post, Q,
                                        (p==d)?basis->grad1d:basis->interp1d,
                                        tmode, add&&(d==dim-1),
                                        d==0?u:tmp[d%2], d==dim-1?v:tmp[(d+1)%2]);
        CeedChk(ierr);
        pre /= P;
        post *= Q;
      }
      if (tmode == CEED_NOTRANSPOSE) {
        v += nqpt;
      } else {
        u += nqpt;
      }
    }
  }
  if (emode & CEED_EVAL_WEIGHT) {
    if (tmode == CEED_TRANSPOSE)
      return CeedError(basis->ceed, 1,
                       "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE");
    CeedInt Q = basis->Q1d;
    // Build the dim-dimensional quadrature weights as an outer product of the
    // 1-D weights, one direction at a time (d==0 seeds, later d's multiply in)
    for (CeedInt d=0; d<dim; d++) {
      CeedInt pre = CeedPowInt(Q, dim-d-1), post = CeedPowInt(Q, d);
      for (CeedInt i=0; i<pre; i++) {
        for (CeedInt j=0; j<Q; j++) {
          for (CeedInt k=0; k<post; k++) {
            v[(i*Q + j)*post + k] = basis->qweight1d[j]
                                    * (d == 0 ? 1 : v[(i*Q + j)*post + k]);
          }
        }
      }
    }
  }
  return 0;
}
665 
// Destroy basis: nothing to free — this backend allocates no basis data.
static int CeedBasisDestroy_Magma(CeedBasis basis) {
  return 0;
}
669 
// Create a tensor-product H1 basis: only installs the function table.
// The tensor data parameters (interp1d, grad1d, qref1d, qweight1d) are unused
// here; CeedBasisApply_Magma reads them back via basis->interp1d etc., so the
// core library presumably stores them — NOTE(review): confirm against caller.
static int CeedBasisCreateTensorH1_Magma(Ceed ceed, CeedInt dim, CeedInt P1d,
    CeedInt Q1d, const CeedScalar *interp1d,
    const CeedScalar *grad1d,
    const CeedScalar *qref1d,
    const CeedScalar *qweight1d,
    CeedBasis basis) {
  basis->Apply = CeedBasisApply_Magma;
  basis->Destroy = CeedBasisDestroy_Magma;
  return 0;
}
680 
681 static int CeedQFunctionApply_Magma(CeedQFunction qf, void *qdata, CeedInt Q,
682                                     const CeedScalar *const *u,
683                                     CeedScalar *const *v) {
684   int ierr;
685   ierr = qf->function(qf->ctx, qdata, Q, u, v); CeedChk(ierr);
686   return 0;
687 }
688 
// Destroy Q-function: nothing backend-specific to free.
static int CeedQFunctionDestroy_Magma(CeedQFunction qf) {
  return 0;
}
692 
693 static int CeedQFunctionCreate_Magma(CeedQFunction qf) {
694   qf->Apply = CeedQFunctionApply_Magma;
695   qf->Destroy = CeedQFunctionDestroy_Magma;
696   return 0;
697 }
698 
699 static int CeedOperatorDestroy_Magma(CeedOperator op) {
700   CeedOperator_Magma *impl = op->data;
701   int ierr;
702 
703   ierr = CeedVectorDestroy(&impl->etmp); CeedChk(ierr);
704   ierr = CeedVectorDestroy(&impl->qdata); CeedChk(ierr);
705   ierr = CeedFree(&op->data); CeedChk(ierr);
706   return 0;
707 }
708 
// Apply the operator: restrict ustate to element form, apply the basis and
// Q-function element by element on the host, apply the transpose basis, and
// (if residual is given) zero it and scatter the element results back into it.
static int CeedOperatorApply_Magma(CeedOperator op, CeedVector qdata,
                                   CeedVector ustate,
                                   CeedVector residual, CeedRequest *request) {
  CeedOperator_Magma *impl = op->data;
  CeedVector etmp;
  CeedInt Q;
  const CeedInt nc = op->basis->ndof, dim = op->basis->dim;
  CeedScalar *Eu;
  char *qd;   // raw bytes: qdata is addressed in qdatasize-byte strides
  int ierr;
  CeedTransposeMode lmode = CEED_NOTRANSPOSE;

  if (!impl->etmp) {
    // Lazily create the E-vector scratch (nc values per element node)
    ierr = CeedVectorCreate(op->ceed,
                            nc * op->Erestrict->nelem * op->Erestrict->elemsize,
                            &impl->etmp); CeedChk(ierr);
    // etmp is allocated when CeedVectorGetArray is called below
  }
  etmp = impl->etmp;
  if (op->qf->inmode & ~CEED_EVAL_WEIGHT) {
    // Gather dof values into element ordering (weights need no input data)
    ierr = CeedElemRestrictionApply(op->Erestrict, CEED_NOTRANSPOSE,
                                    nc, lmode, ustate, etmp,
                                    CEED_REQUEST_IMMEDIATE); CeedChk(ierr);
  }
  ierr = CeedBasisGetNumQuadraturePoints(op->basis, &Q); CeedChk(ierr);
  ierr = CeedVectorGetArray(etmp, CEED_MEM_HOST, &Eu); CeedChk(ierr);
  ierr = CeedVectorGetArray(qdata, CEED_MEM_HOST, (CeedScalar**)&qd);
  CeedChk(ierr);

  for (CeedInt e=0; e<op->Erestrict->nelem; e++) {
    // Per-element scratch: room for interp (Q*nc), grad (Q*nc*dim) and
    // weights (Q) sections; in/out slots follow the CEED eval-mode layout
    // (0 = interp, 1 = grad, 4 = weights)
    CeedScalar BEu[Q*nc*(dim+2)], BEv[Q*nc*(dim+2)], *out[5] = {0,0,0,0,0};
    const CeedScalar *in[5] = {0,0,0,0,0};
    // TODO: quadrature weights can be computed just once
    CeedDebug("\033[11m[CeedOperatorApply_Magma] e=%d: Eu+%d, %d",
              e, e*op->Erestrict->elemsize*nc, Q*nc*(dim+2));
    ierr = CeedBasisApply(op->basis, CEED_NOTRANSPOSE, op->qf->inmode,
                          &Eu[e*op->Erestrict->elemsize*nc], BEu);
    CeedChk(ierr);
    // Carve BEu/BEv into the sections the Q-function expects
    CeedScalar *u_ptr = BEu, *v_ptr = BEv;
    if (op->qf->inmode & CEED_EVAL_INTERP) { in[0] = u_ptr; u_ptr += Q*nc; }
    if (op->qf->inmode & CEED_EVAL_GRAD) { in[1] = u_ptr; u_ptr += Q*nc*dim; }
    if (op->qf->inmode & CEED_EVAL_WEIGHT) { in[4] = u_ptr; u_ptr += Q; }
    if (op->qf->outmode & CEED_EVAL_INTERP) { out[0] = v_ptr; v_ptr += Q*nc; }
    if (op->qf->outmode & CEED_EVAL_GRAD) { out[1] = v_ptr; v_ptr += Q*nc*dim; }
    ierr = CeedQFunctionApply(op->qf, &qd[e*Q*op->qf->qdatasize], Q, in, out);
    CeedChk(ierr);
    CeedDebug("\033[31m[CeedOperatorApply_Magma] e=%d: ",e);
    // Map Q-function output back to element dofs (overwrites this element's
    // slice of Eu)
    ierr = CeedBasisApply(op->basis, CEED_TRANSPOSE, op->qf->outmode, BEv,
                          &Eu[e*op->Erestrict->elemsize*nc]);
    CeedChk(ierr);
  }
  ierr = CeedVectorRestoreArray(etmp, &Eu); CeedChk(ierr);
  // qdata must be restored
  ierr = CeedVectorRestoreArray(qdata, (CeedScalar**)&qd); CeedChk(ierr);
  if (residual) {
    // Zero the residual, then accumulate the element contributions into it
    CeedScalar *res;
    CeedVectorGetArray(residual, CEED_MEM_HOST, &res);
    for (int i = 0; i < residual->length; i++)
      res[i] = (CeedScalar)0;
    // residual must be restored
    CeedVectorRestoreArray(residual, &res);
    ierr = CeedElemRestrictionApply(op->Erestrict, CEED_TRANSPOSE,
                                    nc, lmode, etmp, residual,
                                    CEED_REQUEST_IMMEDIATE); CeedChk(ierr);
  }
  if (request != CEED_REQUEST_IMMEDIATE && request != CEED_REQUEST_ORDERED)
    *request = NULL;
  return 0;
}
778 
// Lazily create (on first call) and return the operator's qdata vector, sized
// for qdatasize bytes per quadrature point over all elements.
static int CeedOperatorGetQData_Magma(CeedOperator op, CeedVector *qdata) {
  CeedOperator_Magma *impl = op->data;
  int ierr;

  if (!impl->qdata) {
    CeedInt Q;
    ierr = CeedBasisGetNumQuadraturePoints(op->basis, &Q); CeedChk(ierr);
    // Vector length is in CeedScalar units; qdatasize is in bytes, hence the
    // division by sizeof(CeedScalar)
    ierr = CeedVectorCreate(op->ceed,
                            op->Erestrict->nelem * Q
                            * op->qf->qdatasize / sizeof(CeedScalar),
                            &impl->qdata); CeedChk(ierr);
  }
  *qdata = impl->qdata;
  return 0;
}
794 
795 static int CeedOperatorCreate_Magma(CeedOperator op) {
796   CeedOperator_Magma *impl;
797   int ierr;
798 
799   ierr = CeedCalloc(1, &impl); CeedChk(ierr);
800   op->data = impl;
801   op->Destroy  = CeedOperatorDestroy_Magma;
802   op->Apply    = CeedOperatorApply_Magma;
803   op->GetQData = CeedOperatorGetQData_Magma;
804   return 0;
805 }
806 
807 // *****************************************************************************
808 // * INIT
809 // *****************************************************************************
810 static int CeedInit_Magma(const char *resource, Ceed ceed) {
811   if (strcmp(resource, "/gpu/magma"))
812     return CeedError(ceed, 1, "MAGMA backend cannot use resource: %s", resource);
813 
814   magma_init();
815   //magma_print_environment();
816 
817   ceed->VecCreate = CeedVectorCreate_Magma;
818   ceed->BasisCreateTensorH1 = CeedBasisCreateTensorH1_Magma;
819   ceed->ElemRestrictionCreate = CeedElemRestrictionCreate_Magma;
820   ceed->QFunctionCreate = CeedQFunctionCreate_Magma;
821   ceed->OperatorCreate = CeedOperatorCreate_Magma;
822   return 0;
823 }
824 
// *****************************************************************************
// * REGISTER
// *****************************************************************************
// Runs at library load time (GCC/Clang constructor attribute) so the
// "/gpu/magma" resource is available to CeedInit() without explicit setup.
__attribute__((constructor))
static void Register(void) {
  CeedRegister("/gpu/magma", CeedInit_Magma);
}
832