xref: /libCEED/backends/hip-ref/ceed-hip-ref-operator.c (revision 1d3409f0cc39a9d8faa73ea0544bcc2cb6d52870) !
1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
3 // All Rights reserved. See files LICENSE and NOTICE for details.
4 //
5 // This file is part of CEED, a collection of benchmarks, miniapps, software
6 // libraries and APIs for efficient high-order finite element and spectral
7 // element discretizations for exascale applications. For more information and
8 // source code availability see http://github.com/ceed.
9 //
10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11 // a collaborative effort of two U.S. Department of Energy organizations (Office
12 // of Science and the National Nuclear Security Administration) responsible for
13 // the planning and preparation of a capable exascale ecosystem, including
14 // software, applications, hardware, advanced system engineering and early
15 // testbed platforms, in support of the nation's exascale computing imperative.
16 
17 #include <ceed/ceed.h>
18 #include <ceed/backend.h>
19 #include <hip/hip_runtime.h>
20 #include <assert.h>
21 #include <stdbool.h>
22 #include <string.h>
23 #include "ceed-hip-ref.h"
24 #include "../hip/ceed-hip-compile.h"
25 
26 //------------------------------------------------------------------------------
27 // Destroy operator
28 //------------------------------------------------------------------------------
29 static int CeedOperatorDestroy_Hip(CeedOperator op) {
30   int ierr;
31   CeedOperator_Hip *impl;
32   ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr);
33 
34   // Apply data
35   for (CeedInt i = 0; i < impl->numein + impl->numeout; i++) {
36     ierr = CeedVectorDestroy(&impl->evecs[i]); CeedChkBackend(ierr);
37   }
38   ierr = CeedFree(&impl->evecs); CeedChkBackend(ierr);
39 
40   for (CeedInt i = 0; i < impl->numein; i++) {
41     ierr = CeedVectorDestroy(&impl->qvecsin[i]); CeedChkBackend(ierr);
42   }
43   ierr = CeedFree(&impl->qvecsin); CeedChkBackend(ierr);
44 
45   for (CeedInt i = 0; i < impl->numeout; i++) {
46     ierr = CeedVectorDestroy(&impl->qvecsout[i]); CeedChkBackend(ierr);
47   }
48   ierr = CeedFree(&impl->qvecsout); CeedChkBackend(ierr);
49 
50   // QFunction diagonal assembly data
51   for (CeedInt i=0; i<impl->qfnumactivein; i++) {
52     ierr = CeedVectorDestroy(&impl->qfactivein[i]); CeedChkBackend(ierr);
53   }
54   ierr = CeedFree(&impl->qfactivein); CeedChkBackend(ierr);
55 
56   // Diag data
57   if (impl->diag) {
58     Ceed ceed;
59     ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
60     CeedChk_Hip(ceed, hipModuleUnload(impl->diag->module));
61     ierr = CeedFree(&impl->diag->h_emodein); CeedChkBackend(ierr);
62     ierr = CeedFree(&impl->diag->h_emodeout); CeedChkBackend(ierr);
63     ierr = hipFree(impl->diag->d_emodein); CeedChk_Hip(ceed, ierr);
64     ierr = hipFree(impl->diag->d_emodeout); CeedChk_Hip(ceed, ierr);
65     ierr = hipFree(impl->diag->d_identity); CeedChk_Hip(ceed, ierr);
66     ierr = hipFree(impl->diag->d_interpin); CeedChk_Hip(ceed, ierr);
67     ierr = hipFree(impl->diag->d_interpout); CeedChk_Hip(ceed, ierr);
68     ierr = hipFree(impl->diag->d_gradin); CeedChk_Hip(ceed, ierr);
69     ierr = hipFree(impl->diag->d_gradout); CeedChk_Hip(ceed, ierr);
70     ierr = CeedElemRestrictionDestroy(&impl->diag->pbdiagrstr);
71     CeedChkBackend(ierr);
72     ierr = CeedVectorDestroy(&impl->diag->elemdiag); CeedChkBackend(ierr);
73     ierr = CeedVectorDestroy(&impl->diag->pbelemdiag); CeedChkBackend(ierr);
74   }
75   ierr = CeedFree(&impl->diag); CeedChkBackend(ierr);
76 
77   if (impl->asmb) {
78     Ceed ceed;
79     ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
80     CeedChk_Hip(ceed, hipModuleUnload(impl->asmb->module));
81     ierr = hipFree(impl->asmb->d_B_in); CeedChk_Hip(ceed, ierr);
82     ierr = hipFree(impl->asmb->d_B_out); CeedChk_Hip(ceed, ierr);
83   }
84   ierr = CeedFree(&impl->asmb); CeedChkBackend(ierr);
85 
86   ierr = CeedFree(&impl); CeedChkBackend(ierr);
87   return CEED_ERROR_SUCCESS;
88 }
89 
90 //------------------------------------------------------------------------------
91 // Setup infields or outfields
92 //------------------------------------------------------------------------------
93 static int CeedOperatorSetupFields_Hip(CeedQFunction qf, CeedOperator op,
94                                        bool isinput, CeedVector *evecs,
95                                        CeedVector *qvecs, CeedInt starte,
96                                        CeedInt numfields, CeedInt Q,
97                                        CeedInt numelements) {
98   CeedInt dim, ierr, size;
99   Ceed ceed;
100   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
101   CeedBasis basis;
102   CeedElemRestriction Erestrict;
103   CeedOperatorField *opfields;
104   CeedQFunctionField *qffields;
105   CeedVector fieldvec;
106   bool strided;
107   bool skiprestrict;
108 
109   if (isinput) {
110     ierr = CeedOperatorGetFields(op, NULL, &opfields, NULL, NULL);
111     CeedChkBackend(ierr);
112     ierr = CeedQFunctionGetFields(qf, NULL, &qffields, NULL, NULL);
113     CeedChkBackend(ierr);
114   } else {
115     ierr = CeedOperatorGetFields(op, NULL, NULL, NULL, &opfields);
116     CeedChkBackend(ierr);
117     ierr = CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qffields);
118     CeedChkBackend(ierr);
119   }
120 
121   // Loop over fields
122   for (CeedInt i = 0; i < numfields; i++) {
123     CeedEvalMode emode;
124     ierr = CeedQFunctionFieldGetEvalMode(qffields[i], &emode); CeedChkBackend(ierr);
125 
126     strided = false;
127     skiprestrict = false;
128     if (emode != CEED_EVAL_WEIGHT) {
129       ierr = CeedOperatorFieldGetElemRestriction(opfields[i], &Erestrict);
130       CeedChkBackend(ierr);
131 
132       // Check whether this field can skip the element restriction:
133       // must be passive input, with emode NONE, and have a strided restriction with
134       // CEED_STRIDES_BACKEND.
135 
136       // First, check whether the field is input or output:
137       if (isinput) {
138         // Check for passive input:
139         ierr = CeedOperatorFieldGetVector(opfields[i], &fieldvec); CeedChkBackend(ierr);
140         if (fieldvec != CEED_VECTOR_ACTIVE) {
141           // Check emode
142           if (emode == CEED_EVAL_NONE) {
143             // Check for strided restriction
144             ierr = CeedElemRestrictionIsStrided(Erestrict, &strided);
145             CeedChkBackend(ierr);
146             if (strided) {
147               // Check if vector is already in preferred backend ordering
148               ierr = CeedElemRestrictionHasBackendStrides(Erestrict,
149                      &skiprestrict); CeedChkBackend(ierr);
150             }
151           }
152         }
153       }
154       if (skiprestrict) {
155         // We do not need an E-Vector, but will use the input field vector's data
156         // directly in the operator application.
157         evecs[i + starte] = NULL;
158       } else {
159         ierr = CeedElemRestrictionCreateVector(Erestrict, NULL,
160                                                &evecs[i + starte]);
161         CeedChkBackend(ierr);
162       }
163     }
164 
165     switch (emode) {
166     case CEED_EVAL_NONE:
167       ierr = CeedQFunctionFieldGetSize(qffields[i], &size); CeedChkBackend(ierr);
168       ierr = CeedVectorCreate(ceed, numelements * Q * size, &qvecs[i]);
169       CeedChkBackend(ierr);
170       break;
171     case CEED_EVAL_INTERP:
172       ierr = CeedQFunctionFieldGetSize(qffields[i], &size); CeedChkBackend(ierr);
173       ierr = CeedVectorCreate(ceed, numelements * Q * size, &qvecs[i]);
174       CeedChkBackend(ierr);
175       break;
176     case CEED_EVAL_GRAD:
177       ierr = CeedOperatorFieldGetBasis(opfields[i], &basis); CeedChkBackend(ierr);
178       ierr = CeedQFunctionFieldGetSize(qffields[i], &size); CeedChkBackend(ierr);
179       ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
180       ierr = CeedVectorCreate(ceed, numelements * Q * size, &qvecs[i]);
181       CeedChkBackend(ierr);
182       break;
183     case CEED_EVAL_WEIGHT: // Only on input fields
184       ierr = CeedOperatorFieldGetBasis(opfields[i], &basis); CeedChkBackend(ierr);
185       ierr = CeedVectorCreate(ceed, numelements * Q, &qvecs[i]); CeedChkBackend(ierr);
186       ierr = CeedBasisApply(basis, numelements, CEED_NOTRANSPOSE,
187                             CEED_EVAL_WEIGHT, NULL, qvecs[i]); CeedChkBackend(ierr);
188       break;
189     case CEED_EVAL_DIV:
190       break; // TODO: Not implemented
191     case CEED_EVAL_CURL:
192       break; // TODO: Not implemented
193     }
194   }
195   return CEED_ERROR_SUCCESS;
196 }
197 
198 //------------------------------------------------------------------------------
199 // CeedOperator needs to connect all the named fields (be they active or passive)
200 //   to the named inputs and outputs of its CeedQFunction.
201 //------------------------------------------------------------------------------
202 static int CeedOperatorSetup_Hip(CeedOperator op) {
203   int ierr;
204   bool setupdone;
205   ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChkBackend(ierr);
206   if (setupdone)
207     return CEED_ERROR_SUCCESS;
208   Ceed ceed;
209   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
210   CeedOperator_Hip *impl;
211   ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr);
212   CeedQFunction qf;
213   ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr);
214   CeedInt Q, numelements, numinputfields, numoutputfields;
215   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr);
216   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr);
217   CeedOperatorField *opinputfields, *opoutputfields;
218   ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields,
219                                &numoutputfields, &opoutputfields);
220   CeedChkBackend(ierr);
221   CeedQFunctionField *qfinputfields, *qfoutputfields;
222   ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields);
223   CeedChkBackend(ierr);
224 
225   // Allocate
226   ierr = CeedCalloc(numinputfields + numoutputfields, &impl->evecs);
227   CeedChkBackend(ierr);
228 
229   ierr = CeedCalloc(CEED_FIELD_MAX, &impl->qvecsin); CeedChkBackend(ierr);
230   ierr = CeedCalloc(CEED_FIELD_MAX, &impl->qvecsout); CeedChkBackend(ierr);
231 
232   impl->numein = numinputfields; impl->numeout = numoutputfields;
233 
234   // Set up infield and outfield evecs and qvecs
235   // Infields
236   ierr = CeedOperatorSetupFields_Hip(qf, op, true,
237                                      impl->evecs, impl->qvecsin, 0,
238                                      numinputfields, Q, numelements);
239   CeedChkBackend(ierr);
240 
241   // Outfields
242   ierr = CeedOperatorSetupFields_Hip(qf, op, false,
243                                      impl->evecs, impl->qvecsout,
244                                      numinputfields, numoutputfields, Q,
245                                      numelements); CeedChkBackend(ierr);
246 
247   ierr = CeedOperatorSetSetupDone(op); CeedChkBackend(ierr);
248   return CEED_ERROR_SUCCESS;
249 }
250 
251 //------------------------------------------------------------------------------
252 // Setup Operator Inputs
253 //------------------------------------------------------------------------------
254 static inline int CeedOperatorSetupInputs_Hip(CeedInt numinputfields,
255     CeedQFunctionField *qfinputfields, CeedOperatorField *opinputfields,
256     CeedVector invec, const bool skipactive, CeedScalar *edata[2*CEED_FIELD_MAX],
257     CeedOperator_Hip *impl, CeedRequest *request) {
258   CeedInt ierr;
259   CeedEvalMode emode;
260   CeedVector vec;
261   CeedElemRestriction Erestrict;
262 
263   for (CeedInt i = 0; i < numinputfields; i++) {
264     // Get input vector
265     ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr);
266     if (vec == CEED_VECTOR_ACTIVE) {
267       if (skipactive)
268         continue;
269       else
270         vec = invec;
271     }
272 
273     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
274     CeedChkBackend(ierr);
275     if (emode == CEED_EVAL_WEIGHT) { // Skip
276     } else {
277       // Get input vector
278       ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr);
279       // Get input element restriction
280       ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
281       CeedChkBackend(ierr);
282       if (vec == CEED_VECTOR_ACTIVE)
283         vec = invec;
284       // Restrict, if necessary
285       if (!impl->evecs[i]) {
286         // No restriction for this field; read data directly from vec.
287         ierr = CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE,
288                                       (const CeedScalar **) &edata[i]);
289         CeedChkBackend(ierr);
290       } else {
291         ierr = CeedElemRestrictionApply(Erestrict, CEED_NOTRANSPOSE, vec,
292                                         impl->evecs[i], request); CeedChkBackend(ierr);
293         // Get evec
294         ierr = CeedVectorGetArrayRead(impl->evecs[i], CEED_MEM_DEVICE,
295                                       (const CeedScalar **) &edata[i]);
296         CeedChkBackend(ierr);
297       }
298     }
299   }
300   return CEED_ERROR_SUCCESS;
301 }
302 
303 //------------------------------------------------------------------------------
304 // Input Basis Action
305 //------------------------------------------------------------------------------
306 static inline int CeedOperatorInputBasis_Hip(CeedInt numelements,
307     CeedQFunctionField *qfinputfields, CeedOperatorField *opinputfields,
308     CeedInt numinputfields, const bool skipactive,
309     CeedScalar *edata[2*CEED_FIELD_MAX], CeedOperator_Hip *impl) {
310   CeedInt ierr;
311   CeedInt elemsize, size;
312   CeedElemRestriction Erestrict;
313   CeedEvalMode emode;
314   CeedBasis basis;
315 
316   for (CeedInt i=0; i<numinputfields; i++) {
317     // Skip active input
318     if (skipactive) {
319       CeedVector vec;
320       ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr);
321       if (vec == CEED_VECTOR_ACTIVE)
322         continue;
323     }
324     // Get elemsize, emode, size
325     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
326     CeedChkBackend(ierr);
327     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
328     CeedChkBackend(ierr);
329     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
330     CeedChkBackend(ierr);
331     ierr = CeedQFunctionFieldGetSize(qfinputfields[i], &size); CeedChkBackend(ierr);
332     // Basis action
333     switch (emode) {
334     case CEED_EVAL_NONE:
335       ierr = CeedVectorSetArray(impl->qvecsin[i], CEED_MEM_DEVICE,
336                                 CEED_USE_POINTER, edata[i]); CeedChkBackend(ierr);
337       break;
338     case CEED_EVAL_INTERP:
339       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis);
340       CeedChkBackend(ierr);
341       ierr = CeedBasisApply(basis, numelements, CEED_NOTRANSPOSE,
342                             CEED_EVAL_INTERP, impl->evecs[i],
343                             impl->qvecsin[i]); CeedChkBackend(ierr);
344       break;
345     case CEED_EVAL_GRAD:
346       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis);
347       CeedChkBackend(ierr);
348       ierr = CeedBasisApply(basis, numelements, CEED_NOTRANSPOSE,
349                             CEED_EVAL_GRAD, impl->evecs[i],
350                             impl->qvecsin[i]); CeedChkBackend(ierr);
351       break;
352     case CEED_EVAL_WEIGHT:
353       break; // No action
354     case CEED_EVAL_DIV:
355       break; // TODO: Not implemented
356     case CEED_EVAL_CURL:
357       break; // TODO: Not implemented
358     }
359   }
360   return CEED_ERROR_SUCCESS;
361 }
362 
363 //------------------------------------------------------------------------------
364 // Restore Input Vectors
365 //------------------------------------------------------------------------------
366 static inline int CeedOperatorRestoreInputs_Hip(CeedInt numinputfields,
367     CeedQFunctionField *qfinputfields, CeedOperatorField *opinputfields,
368     const bool skipactive, CeedScalar *edata[2*CEED_FIELD_MAX],
369     CeedOperator_Hip *impl) {
370   CeedInt ierr;
371   CeedEvalMode emode;
372   CeedVector vec;
373 
374   for (CeedInt i = 0; i < numinputfields; i++) {
375     // Skip active input
376     if (skipactive) {
377       ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr);
378       if (vec == CEED_VECTOR_ACTIVE)
379         continue;
380     }
381     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
382     CeedChkBackend(ierr);
383     if (emode == CEED_EVAL_WEIGHT) { // Skip
384     } else {
385       if (!impl->evecs[i]) {  // This was a skiprestrict case
386         ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr);
387         ierr = CeedVectorRestoreArrayRead(vec,
388                                           (const CeedScalar **)&edata[i]);
389         CeedChkBackend(ierr);
390       } else {
391         ierr = CeedVectorRestoreArrayRead(impl->evecs[i],
392                                           (const CeedScalar **) &edata[i]);
393         CeedChkBackend(ierr);
394       }
395     }
396   }
397   return CEED_ERROR_SUCCESS;
398 }
399 
400 //------------------------------------------------------------------------------
401 // Apply and add to output
402 //------------------------------------------------------------------------------
403 static int CeedOperatorApplyAdd_Hip(CeedOperator op, CeedVector invec,
404                                     CeedVector outvec, CeedRequest *request) {
405   int ierr;
406   CeedOperator_Hip *impl;
407   ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr);
408   CeedQFunction qf;
409   ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr);
410   CeedInt Q, numelements, elemsize, numinputfields, numoutputfields, size;
411   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr);
412   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr);
413   CeedOperatorField *opinputfields, *opoutputfields;
414   ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields,
415                                &numoutputfields, &opoutputfields);
416   CeedChkBackend(ierr);
417   CeedQFunctionField *qfinputfields, *qfoutputfields;
418   ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields);
419   CeedChkBackend(ierr);
420   CeedEvalMode emode;
421   CeedVector vec;
422   CeedBasis basis;
423   CeedElemRestriction Erestrict;
424   CeedScalar *edata[2*CEED_FIELD_MAX];
425 
426   // Setup
427   ierr = CeedOperatorSetup_Hip(op); CeedChkBackend(ierr);
428 
429   // Input Evecs and Restriction
430   ierr = CeedOperatorSetupInputs_Hip(numinputfields, qfinputfields,
431                                      opinputfields, invec, false, edata,
432                                      impl, request); CeedChkBackend(ierr);
433 
434   // Input basis apply if needed
435   ierr = CeedOperatorInputBasis_Hip(numelements, qfinputfields, opinputfields,
436                                     numinputfields, false, edata, impl);
437   CeedChkBackend(ierr);
438 
439   // Output pointers, as necessary
440   for (CeedInt i = 0; i < numoutputfields; i++) {
441     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
442     CeedChkBackend(ierr);
443     if (emode == CEED_EVAL_NONE) {
444       // Set the output Q-Vector to use the E-Vector data directly.
445       ierr = CeedVectorGetArrayWrite(impl->evecs[i + impl->numein], CEED_MEM_DEVICE,
446                                      &edata[i + numinputfields]); CeedChkBackend(ierr);
447       ierr = CeedVectorSetArray(impl->qvecsout[i], CEED_MEM_DEVICE,
448                                 CEED_USE_POINTER, edata[i + numinputfields]);
449       CeedChkBackend(ierr);
450     }
451   }
452 
453   // Q function
454   ierr = CeedQFunctionApply(qf, numelements * Q, impl->qvecsin, impl->qvecsout);
455   CeedChkBackend(ierr);
456 
457   // Output basis apply if needed
458   for (CeedInt i = 0; i < numoutputfields; i++) {
459     // Get elemsize, emode, size
460     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
461     CeedChkBackend(ierr);
462     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
463     CeedChkBackend(ierr);
464     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
465     CeedChkBackend(ierr);
466     ierr = CeedQFunctionFieldGetSize(qfoutputfields[i], &size);
467     CeedChkBackend(ierr);
468     // Basis action
469     switch (emode) {
470     case CEED_EVAL_NONE:
471       break;
472     case CEED_EVAL_INTERP:
473       ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis);
474       CeedChkBackend(ierr);
475       ierr = CeedBasisApply(basis, numelements, CEED_TRANSPOSE,
476                             CEED_EVAL_INTERP, impl->qvecsout[i],
477                             impl->evecs[i + impl->numein]); CeedChkBackend(ierr);
478       break;
479     case CEED_EVAL_GRAD:
480       ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis);
481       CeedChkBackend(ierr);
482       ierr = CeedBasisApply(basis, numelements, CEED_TRANSPOSE,
483                             CEED_EVAL_GRAD, impl->qvecsout[i],
484                             impl->evecs[i + impl->numein]); CeedChkBackend(ierr);
485       break;
486     // LCOV_EXCL_START
487     case CEED_EVAL_WEIGHT: {
488       Ceed ceed;
489       ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
490       return CeedError(ceed, CEED_ERROR_BACKEND,
491                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
492       break; // Should not occur
493     }
494     case CEED_EVAL_DIV:
495       break; // TODO: Not implemented
496     case CEED_EVAL_CURL:
497       break; // TODO: Not implemented
498       // LCOV_EXCL_STOP
499     }
500   }
501 
502   // Output restriction
503   for (CeedInt i = 0; i < numoutputfields; i++) {
504     // Restore evec
505     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
506     CeedChkBackend(ierr);
507     if (emode == CEED_EVAL_NONE) {
508       ierr = CeedVectorRestoreArray(impl->evecs[i+impl->numein],
509                                     &edata[i + numinputfields]);
510       CeedChkBackend(ierr);
511     }
512     // Get output vector
513     ierr = CeedOperatorFieldGetVector(opoutputfields[i], &vec);
514     CeedChkBackend(ierr);
515     // Restrict
516     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
517     CeedChkBackend(ierr);
518     // Active
519     if (vec == CEED_VECTOR_ACTIVE)
520       vec = outvec;
521 
522     ierr = CeedElemRestrictionApply(Erestrict, CEED_TRANSPOSE,
523                                     impl->evecs[i + impl->numein], vec,
524                                     request); CeedChkBackend(ierr);
525   }
526 
527   // Restore input arrays
528   ierr = CeedOperatorRestoreInputs_Hip(numinputfields, qfinputfields,
529                                        opinputfields, false, edata, impl);
530   CeedChkBackend(ierr);
531   return CEED_ERROR_SUCCESS;
532 }
533 
534 //------------------------------------------------------------------------------
535 // Core code for assembling linear QFunction
536 //------------------------------------------------------------------------------
537 static inline int CeedOperatorLinearAssembleQFunctionCore_Hip(CeedOperator op,
538     bool build_objects, CeedVector *assembled, CeedElemRestriction *rstr,
539     CeedRequest *request) {
540   int ierr;
541   CeedOperator_Hip *impl;
542   ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr);
543   CeedQFunction qf;
544   ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr);
545   CeedInt Q, numelements, numinputfields, numoutputfields, size;
546   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr);
547   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr);
548   CeedOperatorField *opinputfields, *opoutputfields;
549   ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields,
550                                &numoutputfields, &opoutputfields);
551   CeedChkBackend(ierr);
552   CeedQFunctionField *qfinputfields, *qfoutputfields;
553   ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields);
554   CeedChkBackend(ierr);
555   CeedVector vec;
556   CeedInt numactivein = impl->qfnumactivein, numactiveout = impl->qfnumactiveout;
557   CeedVector *activein = impl->qfactivein;
558   CeedScalar *a, *tmp;
559   Ceed ceed, ceedparent;
560   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
561   ierr = CeedGetOperatorFallbackParentCeed(ceed, &ceedparent);
562   CeedChkBackend(ierr);
563   ceedparent = ceedparent ? ceedparent : ceed;
564   CeedScalar *edata[2*CEED_FIELD_MAX];
565 
566   // Setup
567   ierr = CeedOperatorSetup_Hip(op); CeedChkBackend(ierr);
568 
569   // Check for identity
570   bool identityqf;
571   ierr = CeedQFunctionIsIdentity(qf, &identityqf); CeedChkBackend(ierr);
572   if (identityqf)
573     // LCOV_EXCL_START
574     return CeedError(ceed, CEED_ERROR_BACKEND,
575                      "Assembling identity QFunctions not supported");
576   // LCOV_EXCL_STOP
577 
578   // Input Evecs and Restriction
579   ierr = CeedOperatorSetupInputs_Hip(numinputfields, qfinputfields,
580                                      opinputfields, NULL, true, edata,
581                                      impl, request); CeedChkBackend(ierr);
582 
583   // Count number of active input fields
584   if (!numactivein) {
585     for (CeedInt i=0; i<numinputfields; i++) {
586       // Get input vector
587       ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr);
588       // Check if active input
589       if (vec == CEED_VECTOR_ACTIVE) {
590         ierr = CeedQFunctionFieldGetSize(qfinputfields[i], &size); CeedChkBackend(ierr);
591         ierr = CeedVectorSetValue(impl->qvecsin[i], 0.0); CeedChkBackend(ierr);
592         ierr = CeedVectorGetArray(impl->qvecsin[i], CEED_MEM_DEVICE, &tmp);
593         CeedChkBackend(ierr);
594         ierr = CeedRealloc(numactivein + size, &activein); CeedChkBackend(ierr);
595         for (CeedInt field = 0; field < size; field++) {
596           ierr = CeedVectorCreate(ceed, Q*numelements,
597                                   &activein[numactivein+field]); CeedChkBackend(ierr);
598           ierr = CeedVectorSetArray(activein[numactivein+field], CEED_MEM_DEVICE,
599                                     CEED_USE_POINTER, &tmp[field*Q*numelements]);
600           CeedChkBackend(ierr);
601         }
602         numactivein += size;
603         ierr = CeedVectorRestoreArray(impl->qvecsin[i], &tmp); CeedChkBackend(ierr);
604       }
605     }
606     impl->qfnumactivein = numactivein;
607     impl->qfactivein = activein;
608   }
609 
610   // Count number of active output fields
611   if (!numactiveout) {
612     for (CeedInt i=0; i<numoutputfields; i++) {
613       // Get output vector
614       ierr = CeedOperatorFieldGetVector(opoutputfields[i], &vec);
615       CeedChkBackend(ierr);
616       // Check if active output
617       if (vec == CEED_VECTOR_ACTIVE) {
618         ierr = CeedQFunctionFieldGetSize(qfoutputfields[i], &size);
619         CeedChkBackend(ierr);
620         numactiveout += size;
621       }
622     }
623     impl->qfnumactiveout = numactiveout;
624   }
625 
626   // Check sizes
627   if (!numactivein || !numactiveout)
628     // LCOV_EXCL_START
629     return CeedError(ceed, CEED_ERROR_BACKEND,
630                      "Cannot assemble QFunction without active inputs "
631                      "and outputs");
632   // LCOV_EXCL_STOP
633 
634   // Build objects if needed
635   if (build_objects) {
636     // Create output restriction
637     CeedInt strides[3] = {1, numelements*Q, Q}; /* *NOPAD* */
638     ierr = CeedElemRestrictionCreateStrided(ceedparent, numelements, Q,
639                                             numactivein*numactiveout,
640                                             numactivein*numactiveout*numelements*Q,
641                                             strides, rstr); CeedChkBackend(ierr);
642     // Create assembled vector
643     ierr = CeedVectorCreate(ceedparent, numelements*Q*numactivein*numactiveout,
644                             assembled); CeedChkBackend(ierr);
645   }
646   ierr = CeedVectorSetValue(*assembled, 0.0); CeedChkBackend(ierr);
647   ierr = CeedVectorGetArray(*assembled, CEED_MEM_DEVICE, &a);
648   CeedChkBackend(ierr);
649 
650   // Input basis apply
651   ierr = CeedOperatorInputBasis_Hip(numelements, qfinputfields, opinputfields,
652                                     numinputfields, true, edata, impl);
653   CeedChkBackend(ierr);
654 
655   // Assemble QFunction
656   for (CeedInt in=0; in<numactivein; in++) {
657     // Set Inputs
658     ierr = CeedVectorSetValue(activein[in], 1.0); CeedChkBackend(ierr);
659     if (numactivein > 1) {
660       ierr = CeedVectorSetValue(activein[(in+numactivein-1)%numactivein],
661                                 0.0); CeedChkBackend(ierr);
662     }
663     // Set Outputs
664     for (CeedInt out=0; out<numoutputfields; out++) {
665       // Get output vector
666       ierr = CeedOperatorFieldGetVector(opoutputfields[out], &vec);
667       CeedChkBackend(ierr);
668       // Check if active output
669       if (vec == CEED_VECTOR_ACTIVE) {
670         CeedVectorSetArray(impl->qvecsout[out], CEED_MEM_DEVICE,
671                            CEED_USE_POINTER, a); CeedChkBackend(ierr);
672         ierr = CeedQFunctionFieldGetSize(qfoutputfields[out], &size);
673         CeedChkBackend(ierr);
674         a += size*Q*numelements; // Advance the pointer by the size of the output
675       }
676     }
677     // Apply QFunction
678     ierr = CeedQFunctionApply(qf, Q*numelements, impl->qvecsin, impl->qvecsout);
679     CeedChkBackend(ierr);
680   }
681 
682   // Un-set output Qvecs to prevent accidental overwrite of Assembled
683   for (CeedInt out=0; out<numoutputfields; out++) {
684     // Get output vector
685     ierr = CeedOperatorFieldGetVector(opoutputfields[out], &vec);
686     CeedChkBackend(ierr);
687     // Check if active output
688     if (vec == CEED_VECTOR_ACTIVE) {
689       ierr = CeedVectorTakeArray(impl->qvecsout[out], CEED_MEM_DEVICE, NULL);
690       CeedChkBackend(ierr);
691     }
692   }
693 
694   // Restore input arrays
695   ierr = CeedOperatorRestoreInputs_Hip(numinputfields, qfinputfields,
696                                        opinputfields, true, edata, impl);
697   CeedChkBackend(ierr);
698 
699   // Restore output
700   ierr = CeedVectorRestoreArray(*assembled, &a); CeedChkBackend(ierr);
701 
702   return CEED_ERROR_SUCCESS;
703 }
704 
705 //------------------------------------------------------------------------------
706 // Assemble Linear QFunction
707 //------------------------------------------------------------------------------
708 static int CeedOperatorLinearAssembleQFunction_Hip(CeedOperator op,
709     CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) {
710   return CeedOperatorLinearAssembleQFunctionCore_Hip(op, true, assembled, rstr,
711          request);
712 }
713 
714 //------------------------------------------------------------------------------
715 // Assemble Linear QFunction
716 //------------------------------------------------------------------------------
717 static int CeedOperatorLinearAssembleQFunctionUpdate_Hip(CeedOperator op,
718     CeedVector assembled, CeedElemRestriction rstr, CeedRequest *request) {
719   return CeedOperatorLinearAssembleQFunctionCore_Hip(op, false, &assembled, &rstr,
720          request);
721 }
722 
723 //------------------------------------------------------------------------------
724 // Diagonal assembly kernels
725 //------------------------------------------------------------------------------
726 // *INDENT-OFF*
727 static const char *diagonalkernels = QUOTE(
728 
729 typedef enum {
730   /// Perform no evaluation (either because there is no data or it is already at
731   /// quadrature points)
732   CEED_EVAL_NONE   = 0,
733   /// Interpolate from nodes to quadrature points
734   CEED_EVAL_INTERP = 1,
735   /// Evaluate gradients at quadrature points from input in a nodal basis
736   CEED_EVAL_GRAD   = 2,
737   /// Evaluate divergence at quadrature points from input in a nodal basis
738   CEED_EVAL_DIV    = 4,
739   /// Evaluate curl at quadrature points from input in a nodal basis
740   CEED_EVAL_CURL   = 8,
741   /// Using no input, evaluate quadrature weights on the reference element
742   CEED_EVAL_WEIGHT = 16,
743 } CeedEvalMode;
744 
745 //------------------------------------------------------------------------------
746 // Get Basis Emode Pointer
747 //------------------------------------------------------------------------------
748 extern "C" __device__ void CeedOperatorGetBasisPointer_Hip(const CeedScalar **basisptr,
749     CeedEvalMode emode, const CeedScalar *identity, const CeedScalar *interp,
750     const CeedScalar *grad) {
751   switch (emode) {
752   case CEED_EVAL_NONE:
753     *basisptr = identity;
754     break;
755   case CEED_EVAL_INTERP:
756     *basisptr = interp;
757     break;
758   case CEED_EVAL_GRAD:
759     *basisptr = grad;
760     break;
761   case CEED_EVAL_WEIGHT:
762   case CEED_EVAL_DIV:
763   case CEED_EVAL_CURL:
764     break; // Caught by QF Assembly
765   }
766 }
767 
768 //------------------------------------------------------------------------------
769 // Core code for diagonal assembly
770 //------------------------------------------------------------------------------
771 __device__ void diagonalCore(const CeedInt nelem,
772     const CeedScalar maxnorm, const bool pointBlock,
773     const CeedScalar *identity,
774     const CeedScalar *interpin, const CeedScalar *gradin,
775     const CeedScalar *interpout, const CeedScalar *gradout,
776     const CeedEvalMode *emodein, const CeedEvalMode *emodeout,
777     const CeedScalar *__restrict__ assembledqfarray,
778     CeedScalar *__restrict__ elemdiagarray) {
779   const int tid = threadIdx.x; // running with P threads, tid is evec node
780   const CeedScalar qfvaluebound = maxnorm*1e-12;
781 
782   // Compute the diagonal of B^T D B
783   // Each element
784   for (CeedInt e = blockIdx.x*blockDim.z + threadIdx.z; e < nelem;
785        e += gridDim.x*blockDim.z) {
786     CeedInt dout = -1;
787     // Each basis eval mode pair
788     for (CeedInt eout = 0; eout < NUMEMODEOUT; eout++) {
789       const CeedScalar *bt = NULL;
790       if (emodeout[eout] == CEED_EVAL_GRAD)
791         dout += 1;
792       CeedOperatorGetBasisPointer_Hip(&bt, emodeout[eout], identity, interpout,
793                                       &gradout[dout*NQPTS*NNODES]);
794       CeedInt din = -1;
795       for (CeedInt ein = 0; ein < NUMEMODEIN; ein++) {
796         const CeedScalar *b = NULL;
797         if (emodein[ein] == CEED_EVAL_GRAD)
798           din += 1;
799         CeedOperatorGetBasisPointer_Hip(&b, emodein[ein], identity, interpin,
800                                         &gradin[din*NQPTS*NNODES]);
801         // Each component
802         for (CeedInt compOut = 0; compOut < NCOMP; compOut++) {
803           // Each qpoint/node pair
804           if (pointBlock) {
805             // Point Block Diagonal
806             for (CeedInt compIn = 0; compIn < NCOMP; compIn++) {
807               CeedScalar evalue = 0.;
808               for (CeedInt q = 0; q < NQPTS; q++) {
809                 const CeedScalar qfvalue =
810                   assembledqfarray[((((ein*NCOMP+compIn)*NUMEMODEOUT+eout)*
811                                      NCOMP+compOut)*nelem+e)*NQPTS+q];
812                 if (abs(qfvalue) > qfvaluebound)
813                   evalue += bt[q*NNODES+tid] * qfvalue * b[q*NNODES+tid];
814               }
815               elemdiagarray[((compOut*NCOMP+compIn)*nelem+e)*NNODES+tid] += evalue;
816             }
817           } else {
818             // Diagonal Only
819             CeedScalar evalue = 0.;
820             for (CeedInt q = 0; q < NQPTS; q++) {
821               const CeedScalar qfvalue =
822                 assembledqfarray[((((ein*NCOMP+compOut)*NUMEMODEOUT+eout)*
823                                    NCOMP+compOut)*nelem+e)*NQPTS+q];
824               if (abs(qfvalue) > qfvaluebound)
825                 evalue += bt[q*NNODES+tid] * qfvalue * b[q*NNODES+tid];
826             }
827             elemdiagarray[(compOut*nelem+e)*NNODES+tid] += evalue;
828           }
829         }
830       }
831     }
832   }
833 }
834 
835 //------------------------------------------------------------------------------
836 // Linear diagonal
837 //------------------------------------------------------------------------------
838 extern "C" __global__ void linearDiagonal(const CeedInt nelem,
839     const CeedScalar maxnorm, const CeedScalar *identity,
840     const CeedScalar *interpin, const CeedScalar *gradin,
841     const CeedScalar *interpout, const CeedScalar *gradout,
842     const CeedEvalMode *emodein, const CeedEvalMode *emodeout,
843     const CeedScalar *__restrict__ assembledqfarray,
844     CeedScalar *__restrict__ elemdiagarray) {
845   diagonalCore(nelem, maxnorm, false, identity, interpin, gradin, interpout,
846                gradout, emodein, emodeout, assembledqfarray, elemdiagarray);
847 }
848 
849 //------------------------------------------------------------------------------
850 // Linear point block diagonal
851 //------------------------------------------------------------------------------
852 extern "C" __global__ void linearPointBlockDiagonal(const CeedInt nelem,
853     const CeedScalar maxnorm, const CeedScalar *identity,
854     const CeedScalar *interpin, const CeedScalar *gradin,
855     const CeedScalar *interpout, const CeedScalar *gradout,
856     const CeedEvalMode *emodein, const CeedEvalMode *emodeout,
857     const CeedScalar *__restrict__ assembledqfarray,
858     CeedScalar *__restrict__ elemdiagarray) {
859   diagonalCore(nelem, maxnorm, true, identity, interpin, gradin, interpout,
860                gradout, emodein, emodeout, assembledqfarray, elemdiagarray);
861 }
862 
863 );
864 // *INDENT-ON*
865 
866 //------------------------------------------------------------------------------
867 // Create point block restriction
868 //------------------------------------------------------------------------------
869 static int CreatePBRestriction(CeedElemRestriction rstr,
870                                CeedElemRestriction *pbRstr) {
871   int ierr;
872   Ceed ceed;
873   ierr = CeedElemRestrictionGetCeed(rstr, &ceed); CeedChkBackend(ierr);
874   const CeedInt *offsets;
875   ierr = CeedElemRestrictionGetOffsets(rstr, CEED_MEM_HOST, &offsets);
876   CeedChkBackend(ierr);
877 
878   // Expand offsets
879   CeedInt nelem, ncomp, elemsize, compstride, max = 1, *pbOffsets;
880   ierr = CeedElemRestrictionGetNumElements(rstr, &nelem); CeedChkBackend(ierr);
881   ierr = CeedElemRestrictionGetNumComponents(rstr, &ncomp); CeedChkBackend(ierr);
882   ierr = CeedElemRestrictionGetElementSize(rstr, &elemsize); CeedChkBackend(ierr);
883   ierr = CeedElemRestrictionGetCompStride(rstr, &compstride);
884   CeedChkBackend(ierr);
885   CeedInt shift = ncomp;
886   if (compstride != 1)
887     shift *= ncomp;
888   ierr = CeedCalloc(nelem*elemsize, &pbOffsets); CeedChkBackend(ierr);
889   for (CeedInt i = 0; i < nelem*elemsize; i++) {
890     pbOffsets[i] = offsets[i]*shift;
891     if (pbOffsets[i] > max)
892       max = pbOffsets[i];
893   }
894 
895   // Create new restriction
896   ierr = CeedElemRestrictionCreate(ceed, nelem, elemsize, ncomp*ncomp, 1,
897                                    max + ncomp*ncomp, CEED_MEM_HOST,
898                                    CEED_OWN_POINTER, pbOffsets, pbRstr);
899   CeedChkBackend(ierr);
900 
901   // Cleanup
902   ierr = CeedElemRestrictionRestoreOffsets(rstr, &offsets); CeedChkBackend(ierr);
903 
904   return CEED_ERROR_SUCCESS;
905 }
906 
907 //------------------------------------------------------------------------------
908 // Assemble diagonal setup
909 //------------------------------------------------------------------------------
910 static inline int CeedOperatorAssembleDiagonalSetup_Hip(CeedOperator op,
911     const bool pointBlock) {
912   int ierr;
913   Ceed ceed;
914   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
915   CeedQFunction qf;
916   ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr);
917   CeedInt numinputfields, numoutputfields;
918   ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields);
919   CeedChkBackend(ierr);
920 
921   // Determine active input basis
922   CeedOperatorField *opfields;
923   CeedQFunctionField *qffields;
924   ierr = CeedOperatorGetFields(op, NULL, &opfields, NULL, NULL);
925   CeedChkBackend(ierr);
926   ierr = CeedQFunctionGetFields(qf, NULL, &qffields, NULL, NULL);
927   CeedChkBackend(ierr);
928   CeedInt numemodein = 0, ncomp = 0, dim = 1;
929   CeedEvalMode *emodein = NULL;
930   CeedBasis basisin = NULL;
931   CeedElemRestriction rstrin = NULL;
932   for (CeedInt i = 0; i < numinputfields; i++) {
933     CeedVector vec;
934     ierr = CeedOperatorFieldGetVector(opfields[i], &vec); CeedChkBackend(ierr);
935     if (vec == CEED_VECTOR_ACTIVE) {
936       CeedElemRestriction rstr;
937       ierr = CeedOperatorFieldGetBasis(opfields[i], &basisin); CeedChkBackend(ierr);
938       ierr = CeedBasisGetNumComponents(basisin, &ncomp); CeedChkBackend(ierr);
939       ierr = CeedBasisGetDimension(basisin, &dim); CeedChkBackend(ierr);
940       ierr = CeedOperatorFieldGetElemRestriction(opfields[i], &rstr);
941       CeedChkBackend(ierr);
942       if (rstrin && rstrin != rstr)
943         // LCOV_EXCL_START
944         return CeedError(ceed, CEED_ERROR_BACKEND,
945                          "Multi-field non-composite operator diagonal assembly not supported");
946       // LCOV_EXCL_STOP
947       rstrin = rstr;
948       CeedEvalMode emode;
949       ierr = CeedQFunctionFieldGetEvalMode(qffields[i], &emode);
950       CeedChkBackend(ierr);
951       switch (emode) {
952       case CEED_EVAL_NONE:
953       case CEED_EVAL_INTERP:
954         ierr = CeedRealloc(numemodein + 1, &emodein); CeedChkBackend(ierr);
955         emodein[numemodein] = emode;
956         numemodein += 1;
957         break;
958       case CEED_EVAL_GRAD:
959         ierr = CeedRealloc(numemodein + dim, &emodein); CeedChkBackend(ierr);
960         for (CeedInt d = 0; d < dim; d++)
961           emodein[numemodein+d] = emode;
962         numemodein += dim;
963         break;
964       case CEED_EVAL_WEIGHT:
965       case CEED_EVAL_DIV:
966       case CEED_EVAL_CURL:
967         break; // Caught by QF Assembly
968       }
969     }
970   }
971 
972   // Determine active output basis
973   ierr = CeedOperatorGetFields(op, NULL, NULL, NULL, &opfields);
974   CeedChkBackend(ierr);
975   ierr = CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qffields);
976   CeedChkBackend(ierr);
977   CeedInt numemodeout = 0;
978   CeedEvalMode *emodeout = NULL;
979   CeedBasis basisout = NULL;
980   CeedElemRestriction rstrout = NULL;
981   for (CeedInt i = 0; i < numoutputfields; i++) {
982     CeedVector vec;
983     ierr = CeedOperatorFieldGetVector(opfields[i], &vec); CeedChkBackend(ierr);
984     if (vec == CEED_VECTOR_ACTIVE) {
985       CeedElemRestriction rstr;
986       ierr = CeedOperatorFieldGetBasis(opfields[i], &basisout); CeedChkBackend(ierr);
987       ierr = CeedOperatorFieldGetElemRestriction(opfields[i], &rstr);
988       CeedChkBackend(ierr);
989       if (rstrout && rstrout != rstr)
990         // LCOV_EXCL_START
991         return CeedError(ceed, CEED_ERROR_BACKEND,
992                          "Multi-field non-composite operator diagonal assembly not supported");
993       // LCOV_EXCL_STOP
994       rstrout = rstr;
995       CeedEvalMode emode;
996       ierr = CeedQFunctionFieldGetEvalMode(qffields[i], &emode); CeedChkBackend(ierr);
997       switch (emode) {
998       case CEED_EVAL_NONE:
999       case CEED_EVAL_INTERP:
1000         ierr = CeedRealloc(numemodeout + 1, &emodeout); CeedChkBackend(ierr);
1001         emodeout[numemodeout] = emode;
1002         numemodeout += 1;
1003         break;
1004       case CEED_EVAL_GRAD:
1005         ierr = CeedRealloc(numemodeout + dim, &emodeout); CeedChkBackend(ierr);
1006         for (CeedInt d = 0; d < dim; d++)
1007           emodeout[numemodeout+d] = emode;
1008         numemodeout += dim;
1009         break;
1010       case CEED_EVAL_WEIGHT:
1011       case CEED_EVAL_DIV:
1012       case CEED_EVAL_CURL:
1013         break; // Caught by QF Assembly
1014       }
1015     }
1016   }
1017 
1018   // Operator data struct
1019   CeedOperator_Hip *impl;
1020   ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr);
1021   ierr = CeedCalloc(1, &impl->diag); CeedChkBackend(ierr);
1022   CeedOperatorDiag_Hip *diag = impl->diag;
1023   diag->basisin = basisin;
1024   diag->basisout = basisout;
1025   diag->h_emodein = emodein;
1026   diag->h_emodeout = emodeout;
1027   diag->numemodein = numemodein;
1028   diag->numemodeout = numemodeout;
1029 
1030   // Assemble kernel
1031   CeedInt nnodes, nqpts;
1032   ierr = CeedBasisGetNumNodes(basisin, &nnodes); CeedChkBackend(ierr);
1033   ierr = CeedBasisGetNumQuadraturePoints(basisin, &nqpts); CeedChkBackend(ierr);
1034   diag->nnodes = nnodes;
1035   ierr = CeedCompileHip(ceed, diagonalkernels, &diag->module, 5,
1036                         "NUMEMODEIN", numemodein,
1037                         "NUMEMODEOUT", numemodeout,
1038                         "NNODES", nnodes,
1039                         "NQPTS", nqpts,
1040                         "NCOMP", ncomp
1041                        ); CeedChk_Hip(ceed, ierr);
1042   ierr = CeedGetKernelHip(ceed, diag->module, "linearDiagonal",
1043                           &diag->linearDiagonal); CeedChk_Hip(ceed, ierr);
1044   ierr = CeedGetKernelHip(ceed, diag->module, "linearPointBlockDiagonal",
1045                           &diag->linearPointBlock);
1046   CeedChk_Hip(ceed, ierr);
1047 
1048   // Basis matrices
1049   const CeedInt qBytes = nqpts * sizeof(CeedScalar);
1050   const CeedInt iBytes = qBytes * nnodes;
1051   const CeedInt gBytes = qBytes * nnodes * dim;
1052   const CeedInt eBytes = sizeof(CeedEvalMode);
1053   const CeedScalar *interpin, *interpout, *gradin, *gradout;
1054 
1055   // CEED_EVAL_NONE
1056   CeedScalar *identity = NULL;
1057   bool evalNone = false;
1058   for (CeedInt i=0; i<numemodein; i++)
1059     evalNone = evalNone || (emodein[i] == CEED_EVAL_NONE);
1060   for (CeedInt i=0; i<numemodeout; i++)
1061     evalNone = evalNone || (emodeout[i] == CEED_EVAL_NONE);
1062   if (evalNone) {
1063     ierr = CeedCalloc(nqpts*nnodes, &identity); CeedChkBackend(ierr);
1064     for (CeedInt i=0; i<(nnodes<nqpts?nnodes:nqpts); i++)
1065       identity[i*nnodes+i] = 1.0;
1066     ierr = hipMalloc((void **)&diag->d_identity, iBytes); CeedChk_Hip(ceed, ierr);
1067     ierr = hipMemcpy(diag->d_identity, identity, iBytes,
1068                      hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr);
1069   }
1070 
1071   // CEED_EVAL_INTERP
1072   ierr = CeedBasisGetInterp(basisin, &interpin); CeedChkBackend(ierr);
1073   ierr = hipMalloc((void **)&diag->d_interpin, iBytes); CeedChk_Hip(ceed, ierr);
1074   ierr = hipMemcpy(diag->d_interpin, interpin, iBytes,
1075                    hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr);
1076   ierr = CeedBasisGetInterp(basisout, &interpout); CeedChkBackend(ierr);
1077   ierr = hipMalloc((void **)&diag->d_interpout, iBytes); CeedChk_Hip(ceed, ierr);
1078   ierr = hipMemcpy(diag->d_interpout, interpout, iBytes,
1079                    hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr);
1080 
1081   // CEED_EVAL_GRAD
1082   ierr = CeedBasisGetGrad(basisin, &gradin); CeedChkBackend(ierr);
1083   ierr = hipMalloc((void **)&diag->d_gradin, gBytes); CeedChk_Hip(ceed, ierr);
1084   ierr = hipMemcpy(diag->d_gradin, gradin, gBytes,
1085                    hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr);
1086   ierr = CeedBasisGetGrad(basisout, &gradout); CeedChkBackend(ierr);
1087   ierr = hipMalloc((void **)&diag->d_gradout, gBytes); CeedChk_Hip(ceed, ierr);
1088   ierr = hipMemcpy(diag->d_gradout, gradout, gBytes,
1089                    hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr);
1090 
1091   // Arrays of emodes
1092   ierr = hipMalloc((void **)&diag->d_emodein, numemodein * eBytes);
1093   CeedChk_Hip(ceed, ierr);
1094   ierr = hipMemcpy(diag->d_emodein, emodein, numemodein * eBytes,
1095                    hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr);
1096   ierr = hipMalloc((void **)&diag->d_emodeout, numemodeout * eBytes);
1097   CeedChk_Hip(ceed, ierr);
1098   ierr = hipMemcpy(diag->d_emodeout, emodeout, numemodeout * eBytes,
1099                    hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr);
1100 
1101   // Restriction
1102   diag->diagrstr = rstrout;
1103 
1104   return CEED_ERROR_SUCCESS;
1105 }
1106 
1107 //------------------------------------------------------------------------------
1108 // Assemble diagonal common code
1109 //------------------------------------------------------------------------------
1110 static inline int CeedOperatorAssembleDiagonalCore_Hip(CeedOperator op,
1111     CeedVector assembled, CeedRequest *request, const bool pointBlock) {
1112   int ierr;
1113   Ceed ceed;
1114   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
1115   CeedOperator_Hip *impl;
1116   ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr);
1117 
1118   // Assemble QFunction
1119   CeedVector assembledqf;
1120   CeedElemRestriction rstr;
1121   ierr = CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembledqf,
1122          &rstr, request); CeedChkBackend(ierr);
1123   ierr = CeedElemRestrictionDestroy(&rstr); CeedChkBackend(ierr);
1124   CeedScalar maxnorm = 0;
1125   ierr = CeedVectorNorm(assembledqf, CEED_NORM_MAX, &maxnorm);
1126   CeedChkBackend(ierr);
1127 
1128   // Setup
1129   if (!impl->diag) {
1130     ierr = CeedOperatorAssembleDiagonalSetup_Hip(op, pointBlock);
1131     CeedChkBackend(ierr);
1132   }
1133   CeedOperatorDiag_Hip *diag = impl->diag;
1134   assert(diag != NULL);
1135 
1136   // Restriction
1137   if (pointBlock && !diag->pbdiagrstr) {
1138     CeedElemRestriction pbdiagrstr;
1139     ierr = CreatePBRestriction(diag->diagrstr, &pbdiagrstr); CeedChkBackend(ierr);
1140     diag->pbdiagrstr = pbdiagrstr;
1141   }
1142   CeedElemRestriction diagrstr = pointBlock ? diag->pbdiagrstr : diag->diagrstr;
1143 
1144   // Create diagonal vector
1145   CeedVector elemdiag = pointBlock ? diag->pbelemdiag : diag->elemdiag;
1146   if (!elemdiag) {
1147     // Element diagonal vector
1148     ierr = CeedElemRestrictionCreateVector(diagrstr, NULL, &elemdiag);
1149     CeedChkBackend(ierr);
1150     if (pointBlock)
1151       diag->pbelemdiag = elemdiag;
1152     else
1153       diag->elemdiag = elemdiag;
1154   }
1155   ierr = CeedVectorSetValue(elemdiag, 0.0); CeedChkBackend(ierr);
1156 
1157   // Assemble element operator diagonals
1158   CeedScalar *elemdiagarray;
1159   const CeedScalar *assembledqfarray;
1160   ierr = CeedVectorGetArray(elemdiag, CEED_MEM_DEVICE, &elemdiagarray);
1161   CeedChkBackend(ierr);
1162   ierr = CeedVectorGetArrayRead(assembledqf, CEED_MEM_DEVICE, &assembledqfarray);
1163   CeedChkBackend(ierr);
1164   CeedInt nelem;
1165   ierr = CeedElemRestrictionGetNumElements(diagrstr, &nelem);
1166   CeedChkBackend(ierr);
1167 
1168   // Compute the diagonal of B^T D B
1169   int elemsPerBlock = 1;
1170   int grid = nelem/elemsPerBlock+((nelem/elemsPerBlock*elemsPerBlock<nelem)?1:0);
1171   void *args[] = {(void *) &nelem, (void *) &maxnorm, &diag->d_identity,
1172                   &diag->d_interpin, &diag->d_gradin, &diag->d_interpout,
1173                   &diag->d_gradout, &diag->d_emodein, &diag->d_emodeout,
1174                   &assembledqfarray, &elemdiagarray
1175                  };
1176   if (pointBlock) {
1177     ierr = CeedRunKernelDimHip(ceed, diag->linearPointBlock, grid,
1178                                diag->nnodes, 1, elemsPerBlock, args);
1179     CeedChkBackend(ierr);
1180   } else {
1181     ierr = CeedRunKernelDimHip(ceed, diag->linearDiagonal, grid,
1182                                diag->nnodes, 1, elemsPerBlock, args);
1183     CeedChkBackend(ierr);
1184   }
1185 
1186   // Restore arrays
1187   ierr = CeedVectorRestoreArray(elemdiag, &elemdiagarray); CeedChkBackend(ierr);
1188   ierr = CeedVectorRestoreArrayRead(assembledqf, &assembledqfarray);
1189   CeedChkBackend(ierr);
1190 
1191   // Assemble local operator diagonal
1192   ierr = CeedElemRestrictionApply(diagrstr, CEED_TRANSPOSE, elemdiag,
1193                                   assembled, request); CeedChkBackend(ierr);
1194 
1195   // Cleanup
1196   ierr = CeedVectorDestroy(&assembledqf); CeedChkBackend(ierr);
1197 
1198   return CEED_ERROR_SUCCESS;
1199 }
1200 
1201 //------------------------------------------------------------------------------
1202 // Assemble composite diagonal common code
1203 //------------------------------------------------------------------------------
1204 static inline int CeedOperatorLinearAssembleAddDiagonalCompositeCore_Hip(
1205   CeedOperator op, CeedVector assembled, CeedRequest *request,
1206   const bool pointBlock) {
1207   int ierr;
1208   CeedInt numSub;
1209   CeedOperator *subOperators;
1210   ierr = CeedOperatorGetNumSub(op, &numSub); CeedChkBackend(ierr);
1211   ierr = CeedOperatorGetSubList(op, &subOperators); CeedChkBackend(ierr);
1212   for (CeedInt i = 0; i < numSub; i++) {
1213     ierr = CeedOperatorAssembleDiagonalCore_Hip(subOperators[i], assembled,
1214            request, pointBlock); CeedChkBackend(ierr);
1215   }
1216   return CEED_ERROR_SUCCESS;
1217 }
1218 
1219 //------------------------------------------------------------------------------
1220 // Assemble Linear Diagonal
1221 //------------------------------------------------------------------------------
1222 static int CeedOperatorLinearAssembleAddDiagonal_Hip(CeedOperator op,
1223     CeedVector assembled, CeedRequest *request) {
1224   int ierr;
1225   bool isComposite;
1226   ierr = CeedOperatorIsComposite(op, &isComposite); CeedChkBackend(ierr);
1227   if (isComposite) {
1228     return CeedOperatorLinearAssembleAddDiagonalCompositeCore_Hip(op, assembled,
1229            request, false);
1230   } else {
1231     return CeedOperatorAssembleDiagonalCore_Hip(op, assembled, request, false);
1232   }
1233 }
1234 
1235 //------------------------------------------------------------------------------
1236 // Assemble Linear Point Block Diagonal
1237 //------------------------------------------------------------------------------
1238 static int CeedOperatorLinearAssembleAddPointBlockDiagonal_Hip(CeedOperator op,
1239     CeedVector assembled, CeedRequest *request) {
1240   int ierr;
1241   bool isComposite;
1242   ierr = CeedOperatorIsComposite(op, &isComposite); CeedChkBackend(ierr);
1243   if (isComposite) {
1244     return CeedOperatorLinearAssembleAddDiagonalCompositeCore_Hip(op, assembled,
1245            request, true);
1246   } else {
1247     return CeedOperatorAssembleDiagonalCore_Hip(op, assembled, request, true);
1248   }
1249 }
1250 
1251 //------------------------------------------------------------------------------
1252 // Matrix assembly kernel
1253 //------------------------------------------------------------------------------
1254 // *INDENT-OFF*
1255 static const char *assemblykernels = QUOTE(
1256 extern "C" __launch_bounds__(BLOCK_SIZE)
1257            __global__ void linearAssemble(const CeedScalar *B_in, const CeedScalar *B_out,
1258                    const CeedScalar *__restrict__ qf_array,
1259                    CeedScalar *__restrict__ values_array) {
1260 
1261   // This kernel assumes B_in and B_out have the same number of quadrature points and
1262   // basis points.
1263   // TODO: expand to more general cases
1264   const int i = threadIdx.x; // The output row index of each B^TDB operation
1265   const int l = threadIdx.y; // The output column index of each B^TDB operation
1266 			     // such that we have (Bout^T)_ij D_jk Bin_kl = C_il
1267 
1268   // Strides for final output ordering, determined by the reference (interface) implementation of
1269   // the symbolic assembly, slowest --> fastest: element, comp_in, comp_out, node_row, node_col
1270   const CeedInt comp_out_stride = NNODES * NNODES;
1271   const CeedInt comp_in_stride = comp_out_stride * NCOMP;
1272   const CeedInt e_stride = comp_in_stride * NCOMP;
1273   // Strides for QF array, slowest --> fastest:  emode_in, comp_in, emode_out, comp_out, elem, qpt
1274   const CeedInt qe_stride = NQPTS;
1275   const CeedInt qcomp_out_stride = NELEM * qe_stride;
1276   const CeedInt qemode_out_stride = qcomp_out_stride * NCOMP;
1277   const CeedInt qcomp_in_stride = qemode_out_stride * NUMEMODEOUT;
1278   const CeedInt qemode_in_stride = qcomp_in_stride * NCOMP;
1279 
1280   // Loop over each element (if necessary)
1281   for (CeedInt e = blockIdx.x*blockDim.z + threadIdx.z; e < NELEM;
1282          e += gridDim.x*blockDim.z) {
1283     for (CeedInt comp_in = 0; comp_in < NCOMP; comp_in++) {
1284       for (CeedInt comp_out = 0; comp_out < NCOMP; comp_out++) {
1285         CeedScalar result = 0.0;
1286         CeedInt qf_index_comp = qcomp_in_stride * comp_in + qcomp_out_stride * comp_out + qe_stride * e;
1287         for (CeedInt emode_in = 0; emode_in < NUMEMODEIN; emode_in++) {
1288           CeedInt b_in_index = emode_in * NQPTS * NNODES;
1289       	  for (CeedInt emode_out = 0; emode_out < NUMEMODEOUT; emode_out++) {
1290              CeedInt b_out_index = emode_out * NQPTS * NNODES;
1291              CeedInt qf_index = qf_index_comp + qemode_out_stride * emode_out + qemode_in_stride * emode_in;
1292  	     // Perform the B^T D B operation for this 'chunk' of D (the qf_array)
1293             for (CeedInt j = 0; j < NQPTS; j++) {
1294      	      result += B_out[b_out_index + j * NNODES  + i] * qf_array[qf_index + j] * B_in[b_in_index + j * NNODES + l];
1295 	    }
1296 
1297           }// end of emode_out
1298         } // end of emode_in
1299         CeedInt val_index = comp_in_stride * comp_in + comp_out_stride * comp_out + e_stride * e + NNODES * i + l;
1300    	values_array[val_index] = result;
1301       } // end of out component
1302     } // end of in component
1303   } // end of element loop
1304 }
1305 );
1306 // *INDENT-ON*
1307 
1308 //------------------------------------------------------------------------------
1309 // Single operator assembly setup
1310 //------------------------------------------------------------------------------
1311 static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op) {
1312   int ierr;
1313   Ceed ceed;
1314   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
1315   CeedOperator_Hip *impl;
1316   ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr);
1317 
1318   // Get intput and output fields
1319   CeedInt num_input_fields, num_output_fields;
1320   CeedOperatorField *input_fields;
1321   CeedOperatorField *output_fields;
1322   ierr = CeedOperatorGetFields(op, &num_input_fields, &input_fields,
1323                                &num_output_fields, &output_fields); CeedChk(ierr);
1324 
1325   // Determine active input basis eval mode
1326   CeedQFunction qf;
1327   ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr);
1328   CeedQFunctionField *qf_fields;
1329   ierr = CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL); CeedChk(ierr);
1330   // Note that the kernel will treat each dimension of a gradient action separately;
1331   // i.e., when an active input has a CEED_EVAL_GRAD mode, num_emode_in will increment
1332   // by dim.  However, for the purposes of loading the B matrices, it will be treated
1333   // as one mode, and we will load/copy the entire gradient matrix at once, so
1334   // num_B_in_mats_to_load will be incremented by 1.
1335   CeedInt num_emode_in = 0, dim = 1, num_B_in_mats_to_load = 0, size_B_in = 0;
1336   CeedEvalMode *eval_mode_in = NULL; //will be of size num_B_in_mats_load
1337   CeedBasis basis_in = NULL;
1338   CeedInt nqpts, esize;
1339   CeedElemRestriction rstr_in = NULL;
1340   for (CeedInt i=0; i<num_input_fields; i++) {
1341     CeedVector vec;
1342     ierr = CeedOperatorFieldGetVector(input_fields[i], &vec); CeedChk(ierr);
1343     if (vec == CEED_VECTOR_ACTIVE) {
1344       ierr = CeedOperatorFieldGetBasis(input_fields[i], &basis_in);
1345       CeedChk(ierr);
1346       ierr = CeedBasisGetDimension(basis_in, &dim); CeedChk(ierr);
1347       ierr = CeedBasisGetNumQuadraturePoints(basis_in, &nqpts); CeedChk(ierr);
1348       ierr = CeedOperatorFieldGetElemRestriction(input_fields[i], &rstr_in);
1349       ierr = CeedElemRestrictionGetElementSize(rstr_in, &esize); CeedChk(ierr);
1350       CeedChk(ierr);
1351       CeedEvalMode eval_mode;
1352       ierr = CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode);
1353       CeedChk(ierr);
1354       if (eval_mode != CEED_EVAL_NONE) {
1355         ierr = CeedRealloc(num_B_in_mats_to_load + 1, &eval_mode_in); CeedChk(ierr);
1356         eval_mode_in[num_B_in_mats_to_load] = eval_mode;
1357         num_B_in_mats_to_load += 1;
1358         if (eval_mode == CEED_EVAL_GRAD) {
1359           num_emode_in += dim;
1360           size_B_in += dim * esize * nqpts;
1361         } else {
1362           num_emode_in +=1;
1363           size_B_in += esize * nqpts;
1364         }
1365       }
1366     }
1367   }
1368 
1369   // Determine active output basis; basis_out and rstr_out only used if same as input, TODO
1370   ierr = CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields); CeedChk(ierr);
1371   CeedInt num_emode_out = 0, num_B_out_mats_to_load = 0, size_B_out = 0;
1372   CeedEvalMode *eval_mode_out = NULL;
1373   CeedBasis basis_out = NULL;
1374   CeedElemRestriction rstr_out = NULL;
1375   for (CeedInt i=0; i<num_output_fields; i++) {
1376     CeedVector vec;
1377     ierr = CeedOperatorFieldGetVector(output_fields[i], &vec); CeedChk(ierr);
1378     if (vec == CEED_VECTOR_ACTIVE) {
1379       ierr = CeedOperatorFieldGetBasis(output_fields[i], &basis_out);
1380       CeedChk(ierr);
1381       ierr = CeedOperatorFieldGetElemRestriction(output_fields[i], &rstr_out);
1382       CeedChk(ierr);
1383       if (rstr_out && rstr_out != rstr_in)
1384         // LCOV_EXCL_START
1385         return CeedError(ceed, CEED_ERROR_BACKEND,
1386                          "Multi-field non-composite operator assembly not supported");
1387       // LCOV_EXCL_STOP
1388       CeedEvalMode eval_mode;
1389       ierr = CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode);
1390       CeedChk(ierr);
1391       if (eval_mode != CEED_EVAL_NONE) {
1392         ierr = CeedRealloc(num_B_out_mats_to_load + 1, &eval_mode_out); CeedChk(ierr);
1393         eval_mode_out[num_B_out_mats_to_load] = eval_mode;
1394         num_B_out_mats_to_load += 1;
1395         if (eval_mode == CEED_EVAL_GRAD) {
1396           num_emode_out += dim;
1397           size_B_out += dim * esize * nqpts;
1398         } else {
1399           num_emode_out +=1;
1400           size_B_out += esize * nqpts;
1401         }
1402       }
1403     }
1404   }
1405 
1406   if (num_emode_in == 0 || num_emode_out == 0)
1407     // LCOV_EXCL_START
1408     return CeedError(ceed, CEED_ERROR_UNSUPPORTED,
1409                      "Cannot assemble operator without inputs/outputs");
1410   // LCOV_EXCL_STOP
1411 
1412   CeedInt nelem, ncomp;
1413   ierr = CeedElemRestrictionGetNumElements(rstr_in, &nelem); CeedChk(ierr);
1414   ierr = CeedElemRestrictionGetNumComponents(rstr_in, &ncomp); CeedChk(ierr);
1415 
1416   ierr = CeedCalloc(1, &impl->asmb); CeedChkBackend(ierr);
1417   CeedOperatorAssemble_Hip *asmb = impl->asmb;
1418   asmb->nelem = nelem;
1419   asmb->nnodes = esize;
1420 
1421   // Compile kernels
1422   int elemsPerBlock = 1;
1423   asmb->elemsPerBlock = elemsPerBlock;
1424   // TODO: 1D threadblock version for larger elements
1425   if (esize * esize * elemsPerBlock > 1024)
1426     // LCOV_EXCL_START
1427     return CeedError(ceed, CEED_ERROR_UNSUPPORTED,
1428                      "GPU assembly not currently available for elements of this size");
1429   // LCOV_EXCL_STOP
1430 
1431   ierr = CeedCompileHip(ceed, assemblykernels, &asmb->module, 7,
1432                         "NELEM", nelem,
1433                         "NUMEMODEIN", num_emode_in,
1434                         "NUMEMODEOUT", num_emode_out,
1435                         "NQPTS", nqpts,
1436                         "NNODES", esize,
1437                         "BLOCK_SIZE", esize * esize * elemsPerBlock,
1438                         "NCOMP", ncomp
1439                        ); CeedChk_Hip(ceed, ierr);
1440   ierr = CeedGetKernelHip(ceed, asmb->module, "linearAssemble",
1441                           &asmb->linearAssemble); CeedChk_Hip(ceed, ierr);
1442 
1443   // Build 'full' B matrices (not 1D arrays used for tensor-product matrices)
1444   const CeedScalar *interp_in, *grad_in;
1445   ierr = CeedBasisGetInterp(basis_in, &interp_in); CeedChkBackend(ierr);
1446   ierr = CeedBasisGetGrad(basis_in, &grad_in); CeedChkBackend(ierr);
1447 
1448   // Load into B_in, in order that they will be used in eval_mode
1449   const CeedInt inBytes = size_B_in * sizeof(CeedScalar);
1450   CeedInt mat_start = 0;
1451   ierr = hipMalloc((void **) &asmb->d_B_in, inBytes); CeedChk_Hip(ceed, ierr);
1452   for (int i = 0; i < num_B_in_mats_to_load; i++) {
1453     CeedEvalMode eval_mode = eval_mode_in[i];
1454     if (eval_mode == CEED_EVAL_INTERP) {
1455       ierr = hipMemcpy(&asmb->d_B_in[mat_start], interp_in,
1456                        esize * nqpts * sizeof(CeedScalar),
1457                        hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr);
1458       mat_start += esize * nqpts;
1459     } else if (eval_mode == CEED_EVAL_GRAD) {
1460       ierr = hipMemcpy(asmb->d_B_in, grad_in,
1461                        dim * esize * nqpts * sizeof(CeedScalar),
1462                        hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr);
1463       mat_start += dim * esize * nqpts;
1464     }
1465   }
1466 
1467   const CeedScalar *interp_out, *grad_out;
1468   // Note that this function currently assumes 1 basis, so this should always be true
1469   // for now
1470   if (basis_out == basis_in) {
1471     interp_out = interp_in;
1472     grad_out = grad_in;
1473   } else {
1474     ierr = CeedBasisGetInterp(basis_out, &interp_out); CeedChkBackend(ierr);
1475     ierr = CeedBasisGetGrad(basis_out, &grad_out); CeedChkBackend(ierr);
1476   }
1477 
1478   // Load into B_out, in order that they will be used in eval_mode
1479   const CeedInt outBytes = size_B_out * sizeof(CeedScalar);
1480   mat_start = 0;
1481   ierr = hipMalloc((void **) &asmb->d_B_out, outBytes); CeedChk_Hip(ceed, ierr);
1482   for (int i = 0; i < num_B_out_mats_to_load; i++) {
1483     CeedEvalMode eval_mode = eval_mode_out[i];
1484     if (eval_mode == CEED_EVAL_INTERP) {
1485       ierr = hipMemcpy(&asmb->d_B_out[mat_start], interp_out,
1486                        esize * nqpts * sizeof(CeedScalar),
1487                        hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr);
1488       mat_start += esize * nqpts;
1489     } else if (eval_mode == CEED_EVAL_GRAD) {
1490       ierr = hipMemcpy(&asmb->d_B_out[mat_start], grad_out,
1491                        dim * esize * nqpts * sizeof(CeedScalar),
1492                        hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr);
1493       mat_start += dim * esize * nqpts;
1494     }
1495   }
1496   return CEED_ERROR_SUCCESS;
1497 }
1498 
1499 //------------------------------------------------------------------------------
1500 // Single operator assembly
1501 //------------------------------------------------------------------------------
1502 static int CeedSingleOperatorAssemble_Hip(CeedOperator op, CeedInt offset,
1503     CeedVector values) {
1504 
1505   int ierr;
1506   Ceed ceed;
1507   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
1508   CeedOperator_Hip *impl;
1509   ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr);
1510 
1511   // Setup
1512   if (!impl->asmb) {
1513     ierr = CeedSingleOperatorAssembleSetup_Hip(op);
1514     CeedChkBackend(ierr);
1515   }
1516 
1517   // Assemble QFunction
1518   CeedVector assembled_qf;
1519   CeedElemRestriction rstr_q;
1520   ierr = CeedOperatorLinearAssembleQFunctionBuildOrUpdate(
1521            op, &assembled_qf, &rstr_q, CEED_REQUEST_IMMEDIATE); CeedChk(ierr);
1522   ierr = CeedElemRestrictionDestroy(&rstr_q); CeedChkBackend(ierr);
1523   CeedScalar *values_array;
1524   ierr = CeedVectorGetArrayWrite(values, CEED_MEM_DEVICE, &values_array);
1525   CeedChkBackend(ierr);
1526   values_array += offset;
1527   const CeedScalar *qf_array;
1528   ierr = CeedVectorGetArrayRead(assembled_qf, CEED_MEM_DEVICE, &qf_array);
1529   CeedChkBackend(ierr);
1530 
1531   // Compute B^T D B
1532   const CeedInt nelem = impl->asmb->nelem;
1533   const CeedInt elemsPerBlock = impl->asmb->elemsPerBlock;
1534   const CeedInt nnodes = impl->asmb->nnodes;
1535   const CeedInt grid = nelem/elemsPerBlock+((
1536                          nelem/elemsPerBlock*elemsPerBlock<nelem)?1:0);
1537   void *args[] = {&impl->asmb->d_B_in, &impl->asmb->d_B_out,
1538                   &qf_array, &values_array
1539                  };
1540   ierr = CeedRunKernelDimHip(ceed, impl->asmb->linearAssemble, grid,
1541                              nnodes, nnodes, elemsPerBlock, args);
1542   CeedChkBackend(ierr);
1543 
1544 
1545   // Restore arrays
1546   ierr = CeedVectorRestoreArray(values, &values_array); CeedChkBackend(ierr);
1547   ierr = CeedVectorRestoreArrayRead(assembled_qf, &qf_array);
1548   CeedChkBackend(ierr);
1549 
1550   // Cleanup
1551   ierr = CeedVectorDestroy(&assembled_qf); CeedChkBackend(ierr);
1552 
1553   return CEED_ERROR_SUCCESS;
1554 }
1555 
1556 //------------------------------------------------------------------------------
1557 // Assemble matrix data for COO matrix of assembled operator.
1558 // The sparsity pattern is set by CeedOperatorLinearAssembleSymbolic.
1559 //
1560 // Note that this (and other assembly routines) currently assume only one
1561 // active input restriction/basis per operator (could have multiple basis eval
1562 // modes).
1563 // TODO: allow multiple active input restrictions/basis objects
1564 //------------------------------------------------------------------------------
1565 int CeedOperatorLinearAssemble_Hip(CeedOperator op, CeedVector values) {
1566 
1567   // As done in the default implementation, loop through suboperators
1568   // for composite operators, or call single operator assembly otherwise
1569   bool is_composite;
1570   CeedInt ierr;
1571   ierr = CeedOperatorIsComposite(op, &is_composite); CeedChk(ierr);
1572 
1573   CeedElemRestriction rstr;
1574   CeedInt num_elem, elem_size, num_comp;
1575 
1576   CeedInt offset = 0;
1577   if (is_composite) {
1578     CeedInt num_suboperators;
1579     ierr = CeedOperatorGetNumSub(op, &num_suboperators); CeedChk(ierr);
1580     CeedOperator *sub_operators;
1581     ierr = CeedOperatorGetSubList(op, &sub_operators); CeedChk(ierr);
1582     for (int k = 0; k < num_suboperators; ++k) {
1583       ierr = CeedSingleOperatorAssemble_Hip(sub_operators[k], offset, values);
1584       CeedChk(ierr);
1585       ierr = CeedOperatorGetActiveElemRestriction(sub_operators[k], &rstr);
1586       CeedChk(ierr);
1587       ierr = CeedElemRestrictionGetNumElements(rstr, &num_elem); CeedChk(ierr);
1588       ierr = CeedElemRestrictionGetElementSize(rstr, &elem_size); CeedChk(ierr);
1589       ierr = CeedElemRestrictionGetNumComponents(rstr, &num_comp); CeedChk(ierr);
1590       offset += elem_size*num_comp * elem_size*num_comp * num_elem;
1591     }
1592   } else {
1593     ierr = CeedSingleOperatorAssemble_Hip(op, offset, values); CeedChk(ierr);
1594   }
1595 
1596   return CEED_ERROR_SUCCESS;
1597 }
1598 
1599 //------------------------------------------------------------------------------
1600 // Create operator
1601 //------------------------------------------------------------------------------
1602 int CeedOperatorCreate_Hip(CeedOperator op) {
1603   int ierr;
1604   Ceed ceed;
1605   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
1606   CeedOperator_Hip *impl;
1607 
1608   ierr = CeedCalloc(1, &impl); CeedChkBackend(ierr);
1609   ierr = CeedOperatorSetData(op, impl); CeedChkBackend(ierr);
1610 
1611   ierr = CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction",
1612                                 CeedOperatorLinearAssembleQFunction_Hip);
1613   CeedChkBackend(ierr);
1614   ierr = CeedSetBackendFunction(ceed, "Operator", op,
1615                                 "LinearAssembleQFunctionUpdate",
1616                                 CeedOperatorLinearAssembleQFunctionUpdate_Hip);
1617   CeedChkBackend(ierr);
1618   ierr = CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal",
1619                                 CeedOperatorLinearAssembleAddDiagonal_Hip);
1620   CeedChkBackend(ierr);
1621   ierr = CeedSetBackendFunction(ceed, "Operator", op,
1622                                 "LinearAssembleAddPointBlockDiagonal",
1623                                 CeedOperatorLinearAssembleAddPointBlockDiagonal_Hip);
1624   CeedChkBackend(ierr);
1625   ierr = CeedSetBackendFunction(ceed, "Operator", op,
1626                                 "LinearAssemble", CeedOperatorLinearAssemble_Hip);
1627   CeedChkBackend(ierr);
1628   ierr = CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd",
1629                                 CeedOperatorApplyAdd_Hip); CeedChkBackend(ierr);
1630   ierr = CeedSetBackendFunction(ceed, "Operator", op, "Destroy",
1631                                 CeedOperatorDestroy_Hip); CeedChkBackend(ierr);
1632   return CEED_ERROR_SUCCESS;
1633 }
1634 
1635 //------------------------------------------------------------------------------
1636 // Composite Operator Create
1637 //------------------------------------------------------------------------------
1638 int CeedCompositeOperatorCreate_Hip(CeedOperator op) {
1639   int ierr;
1640   Ceed ceed;
1641   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
1642 
1643   ierr = CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal",
1644                                 CeedOperatorLinearAssembleAddDiagonal_Hip);
1645   CeedChkBackend(ierr);
1646   ierr = CeedSetBackendFunction(ceed, "Operator", op,
1647                                 "LinearAssembleAddPointBlockDiagonal",
1648                                 CeedOperatorLinearAssembleAddPointBlockDiagonal_Hip);
1649   CeedChkBackend(ierr);
1650   ierr = CeedSetBackendFunction(ceed, "Operator", op,
1651                                 "LinearAssemble", CeedOperatorLinearAssemble_Hip);
1652   CeedChkBackend(ierr);
1653   return CEED_ERROR_SUCCESS;
1654 }
1655 //------------------------------------------------------------------------------
1656