xref: /libCEED/backends/cuda-ref/ceed-cuda-ref-operator.c (revision a0154adecfab8547cdc0febbbf40ac009dbe9d1d)
1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #include <ceed/ceed.h>
9 #include <ceed/backend.h>
10 #include <assert.h>
11 #include <cuda.h>
12 #include <cuda_runtime.h>
13 #include <stdbool.h>
14 #include <string.h>
15 #include "ceed-cuda-ref.h"
16 #include "../cuda/ceed-cuda-compile.h"
17 
18 //------------------------------------------------------------------------------
19 // Destroy operator
20 //------------------------------------------------------------------------------
// Destroy the CUDA backend data attached to an operator: all E-vectors and
// Q-vectors created during setup, QFunction-assembly work vectors, and any
// lazily-built diagonal/full assembly resources (CUDA modules and device
// buffers). Returns CEED_ERROR_SUCCESS or propagates the first failure via
// the CeedChkBackend/CeedChk_Cu macros.
static int CeedOperatorDestroy_Cuda(CeedOperator op) {
  int ierr;
  CeedOperator_Cuda *impl;
  ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr);

  // Apply data: E-vectors for inputs and outputs live in one array
  // (inputs in [0, numein), outputs in [numein, numein + numeout))
  for (CeedInt i = 0; i < impl->numein + impl->numeout; i++) {
    ierr = CeedVectorDestroy(&impl->evecs[i]); CeedChkBackend(ierr);
  }
  ierr = CeedFree(&impl->evecs); CeedChkBackend(ierr);

  // Input Q-vectors
  for (CeedInt i = 0; i < impl->numein; i++) {
    ierr = CeedVectorDestroy(&impl->qvecsin[i]); CeedChkBackend(ierr);
  }
  ierr = CeedFree(&impl->qvecsin); CeedChkBackend(ierr);

  // Output Q-vectors
  for (CeedInt i = 0; i < impl->numeout; i++) {
    ierr = CeedVectorDestroy(&impl->qvecsout[i]); CeedChkBackend(ierr);
  }
  ierr = CeedFree(&impl->qvecsout); CeedChkBackend(ierr);

  // QFunction assembly data: per-component views of active input Q-vectors
  for (CeedInt i=0; i<impl->qfnumactivein; i++) {
    ierr = CeedVectorDestroy(&impl->qfactivein[i]); CeedChkBackend(ierr);
  }
  ierr = CeedFree(&impl->qfactivein); CeedChkBackend(ierr);

  // Diag data: compiled diagonal-assembly module plus host/device copies of
  // the eval-mode lists and basis matrices (only present if diagonal
  // assembly was requested at least once)
  if (impl->diag) {
    Ceed ceed;
    ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
    CeedChk_Cu(ceed, cuModuleUnload(impl->diag->module));
    ierr = CeedFree(&impl->diag->h_emodein); CeedChkBackend(ierr);
    ierr = CeedFree(&impl->diag->h_emodeout); CeedChkBackend(ierr);
    ierr = cudaFree(impl->diag->d_emodein); CeedChk_Cu(ceed, ierr);
    ierr = cudaFree(impl->diag->d_emodeout); CeedChk_Cu(ceed, ierr);
    ierr = cudaFree(impl->diag->d_identity); CeedChk_Cu(ceed, ierr);
    ierr = cudaFree(impl->diag->d_interpin); CeedChk_Cu(ceed, ierr);
    ierr = cudaFree(impl->diag->d_interpout); CeedChk_Cu(ceed, ierr);
    ierr = cudaFree(impl->diag->d_gradin); CeedChk_Cu(ceed, ierr);
    ierr = cudaFree(impl->diag->d_gradout); CeedChk_Cu(ceed, ierr);
    ierr = CeedElemRestrictionDestroy(&impl->diag->pbdiagrstr);
    CeedChkBackend(ierr);
    ierr = CeedVectorDestroy(&impl->diag->elemdiag); CeedChkBackend(ierr);
    ierr = CeedVectorDestroy(&impl->diag->pbelemdiag); CeedChkBackend(ierr);
  }
  ierr = CeedFree(&impl->diag); CeedChkBackend(ierr);

  // Full assembly data (only present if full matrix assembly was requested)
  if (impl->asmb) {
    Ceed ceed;
    ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
    CeedChk_Cu(ceed, cuModuleUnload(impl->asmb->module));
    ierr = cudaFree(impl->asmb->d_B_in); CeedChk_Cu(ceed, ierr);
    ierr = cudaFree(impl->asmb->d_B_out); CeedChk_Cu(ceed, ierr);
  }
  ierr = CeedFree(&impl->asmb); CeedChkBackend(ierr);

  ierr = CeedFree(&impl); CeedChkBackend(ierr);
  return CEED_ERROR_SUCCESS;
}
81 
82 //------------------------------------------------------------------------------
83 // Setup infields or outfields
84 //------------------------------------------------------------------------------
85 static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op,
86                                         bool isinput, CeedVector *evecs,
87                                         CeedVector *qvecs, CeedInt starte,
88                                         CeedInt numfields, CeedInt Q,
89                                         CeedInt numelements) {
90   CeedInt dim, ierr, size;
91   CeedSize q_size;
92   Ceed ceed;
93   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
94   CeedBasis basis;
95   CeedElemRestriction Erestrict;
96   CeedOperatorField *opfields;
97   CeedQFunctionField *qffields;
98   CeedVector fieldvec;
99   bool strided;
100   bool skiprestrict;
101 
102   if (isinput) {
103     ierr = CeedOperatorGetFields(op, NULL, &opfields, NULL, NULL);
104     CeedChkBackend(ierr);
105     ierr = CeedQFunctionGetFields(qf, NULL, &qffields, NULL, NULL);
106     CeedChkBackend(ierr);
107   } else {
108     ierr = CeedOperatorGetFields(op, NULL, NULL, NULL, &opfields);
109     CeedChkBackend(ierr);
110     ierr = CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qffields);
111     CeedChkBackend(ierr);
112   }
113 
114   // Loop over fields
115   for (CeedInt i = 0; i < numfields; i++) {
116     CeedEvalMode emode;
117     ierr = CeedQFunctionFieldGetEvalMode(qffields[i], &emode); CeedChkBackend(ierr);
118 
119     strided = false;
120     skiprestrict = false;
121     if (emode != CEED_EVAL_WEIGHT) {
122       ierr = CeedOperatorFieldGetElemRestriction(opfields[i], &Erestrict);
123       CeedChkBackend(ierr);
124 
125       // Check whether this field can skip the element restriction:
126       // must be passive input, with emode NONE, and have a strided restriction with
127       // CEED_STRIDES_BACKEND.
128 
129       // First, check whether the field is input or output:
130       if (isinput) {
131         // Check for passive input:
132         ierr = CeedOperatorFieldGetVector(opfields[i], &fieldvec); CeedChkBackend(ierr);
133         if (fieldvec != CEED_VECTOR_ACTIVE) {
134           // Check emode
135           if (emode == CEED_EVAL_NONE) {
136             // Check for strided restriction
137             ierr = CeedElemRestrictionIsStrided(Erestrict, &strided);
138             CeedChkBackend(ierr);
139             if (strided) {
140               // Check if vector is already in preferred backend ordering
141               ierr = CeedElemRestrictionHasBackendStrides(Erestrict,
142                      &skiprestrict); CeedChkBackend(ierr);
143             }
144           }
145         }
146       }
147       if (skiprestrict) {
148         // We do not need an E-Vector, but will use the input field vector's data
149         // directly in the operator application.
150         evecs[i + starte] = NULL;
151       } else {
152         ierr = CeedElemRestrictionCreateVector(Erestrict, NULL,
153                                                &evecs[i + starte]);
154         CeedChkBackend(ierr);
155       }
156     }
157 
158     switch (emode) {
159     case CEED_EVAL_NONE:
160       ierr = CeedQFunctionFieldGetSize(qffields[i], &size); CeedChkBackend(ierr);
161       q_size = (CeedSize)numelements * Q * size;
162       ierr = CeedVectorCreate(ceed, q_size, &qvecs[i]); CeedChkBackend(ierr);
163       break;
164     case CEED_EVAL_INTERP:
165       ierr = CeedQFunctionFieldGetSize(qffields[i], &size); CeedChkBackend(ierr);
166       q_size = (CeedSize)numelements * Q * size;
167       ierr = CeedVectorCreate(ceed, q_size, &qvecs[i]); CeedChkBackend(ierr);
168       break;
169     case CEED_EVAL_GRAD:
170       ierr = CeedOperatorFieldGetBasis(opfields[i], &basis); CeedChkBackend(ierr);
171       ierr = CeedQFunctionFieldGetSize(qffields[i], &size); CeedChkBackend(ierr);
172       ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
173       q_size = (CeedSize)numelements * Q * size;
174       ierr = CeedVectorCreate(ceed, q_size, &qvecs[i]); CeedChkBackend(ierr);
175       break;
176     case CEED_EVAL_WEIGHT: // Only on input fields
177       ierr = CeedOperatorFieldGetBasis(opfields[i], &basis); CeedChkBackend(ierr);
178       q_size = (CeedSize)numelements * Q;
179       ierr = CeedVectorCreate(ceed, q_size, &qvecs[i]); CeedChkBackend(ierr);
180       ierr = CeedBasisApply(basis, numelements, CEED_NOTRANSPOSE,
181                             CEED_EVAL_WEIGHT, NULL, qvecs[i]); CeedChkBackend(ierr);
182       break;
183     case CEED_EVAL_DIV:
184       break; // TODO: Not implemented
185     case CEED_EVAL_CURL:
186       break; // TODO: Not implemented
187     }
188   }
189   return CEED_ERROR_SUCCESS;
190 }
191 
192 //------------------------------------------------------------------------------
193 // CeedOperator needs to connect all the named fields (be they active or passive)
194 //   to the named inputs and outputs of its CeedQFunction.
195 //------------------------------------------------------------------------------
196 static int CeedOperatorSetup_Cuda(CeedOperator op) {
197   int ierr;
198   bool setupdone;
199   ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChkBackend(ierr);
200   if (setupdone)
201     return CEED_ERROR_SUCCESS;
202   Ceed ceed;
203   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
204   CeedOperator_Cuda *impl;
205   ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr);
206   CeedQFunction qf;
207   ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr);
208   CeedInt Q, numelements, numinputfields, numoutputfields;
209   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr);
210   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr);
211   CeedChkBackend(ierr);
212   CeedOperatorField *opinputfields, *opoutputfields;
213   ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields,
214                                &numoutputfields, &opoutputfields);
215   CeedChkBackend(ierr);
216   CeedQFunctionField *qfinputfields, *qfoutputfields;
217   ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields);
218   CeedChkBackend(ierr);
219 
220   // Allocate
221   ierr = CeedCalloc(numinputfields + numoutputfields, &impl->evecs);
222   CeedChkBackend(ierr);
223 
224   ierr = CeedCalloc(CEED_FIELD_MAX, &impl->qvecsin); CeedChkBackend(ierr);
225   ierr = CeedCalloc(CEED_FIELD_MAX, &impl->qvecsout); CeedChkBackend(ierr);
226 
227   impl->numein = numinputfields; impl->numeout = numoutputfields;
228 
229   // Set up infield and outfield evecs and qvecs
230   // Infields
231   ierr = CeedOperatorSetupFields_Cuda(qf, op, true,
232                                       impl->evecs, impl->qvecsin, 0,
233                                       numinputfields, Q, numelements);
234   CeedChkBackend(ierr);
235 
236   // Outfields
237   ierr = CeedOperatorSetupFields_Cuda(qf, op, false,
238                                       impl->evecs, impl->qvecsout,
239                                       numinputfields, numoutputfields, Q,
240                                       numelements); CeedChkBackend(ierr);
241 
242   ierr = CeedOperatorSetSetupDone(op); CeedChkBackend(ierr);
243   return CEED_ERROR_SUCCESS;
244 }
245 
246 //------------------------------------------------------------------------------
247 // Setup Operator Inputs
248 //------------------------------------------------------------------------------
249 static inline int CeedOperatorSetupInputs_Cuda(CeedInt numinputfields,
250     CeedQFunctionField *qfinputfields, CeedOperatorField *opinputfields,
251     CeedVector invec, const bool skipactive, CeedScalar *edata[2*CEED_FIELD_MAX],
252     CeedOperator_Cuda *impl, CeedRequest *request) {
253   CeedInt ierr;
254   CeedEvalMode emode;
255   CeedVector vec;
256   CeedElemRestriction Erestrict;
257 
258   for (CeedInt i = 0; i < numinputfields; i++) {
259     // Get input vector
260     ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr);
261     if (vec == CEED_VECTOR_ACTIVE) {
262       if (skipactive)
263         continue;
264       else
265         vec = invec;
266     }
267 
268     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
269     CeedChkBackend(ierr);
270     if (emode == CEED_EVAL_WEIGHT) { // Skip
271     } else {
272       // Get input vector
273       ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr);
274       // Get input element restriction
275       ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
276       CeedChkBackend(ierr);
277       if (vec == CEED_VECTOR_ACTIVE)
278         vec = invec;
279       // Restrict, if necessary
280       if (!impl->evecs[i]) {
281         // No restriction for this field; read data directly from vec.
282         ierr = CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE,
283                                       (const CeedScalar **) &edata[i]);
284         CeedChkBackend(ierr);
285       } else {
286         ierr = CeedElemRestrictionApply(Erestrict, CEED_NOTRANSPOSE, vec,
287                                         impl->evecs[i], request); CeedChkBackend(ierr);
288         // Get evec
289         ierr = CeedVectorGetArrayRead(impl->evecs[i], CEED_MEM_DEVICE,
290                                       (const CeedScalar **) &edata[i]);
291         CeedChkBackend(ierr);
292       }
293     }
294   }
295   return CEED_ERROR_SUCCESS;
296 }
297 
298 //------------------------------------------------------------------------------
299 // Input Basis Action
300 //------------------------------------------------------------------------------
301 static inline int CeedOperatorInputBasis_Cuda(CeedInt numelements,
302     CeedQFunctionField *qfinputfields, CeedOperatorField *opinputfields,
303     CeedInt numinputfields, const bool skipactive,
304     CeedScalar *edata[2*CEED_FIELD_MAX],CeedOperator_Cuda *impl) {
305   CeedInt ierr;
306   CeedInt elemsize, size;
307   CeedElemRestriction Erestrict;
308   CeedEvalMode emode;
309   CeedBasis basis;
310 
311   for (CeedInt i=0; i<numinputfields; i++) {
312     // Skip active input
313     if (skipactive) {
314       CeedVector vec;
315       ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr);
316       if (vec == CEED_VECTOR_ACTIVE)
317         continue;
318     }
319     // Get elemsize, emode, size
320     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
321     CeedChkBackend(ierr);
322     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
323     CeedChkBackend(ierr);
324     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
325     CeedChkBackend(ierr);
326     ierr = CeedQFunctionFieldGetSize(qfinputfields[i], &size); CeedChkBackend(ierr);
327     // Basis action
328     switch (emode) {
329     case CEED_EVAL_NONE:
330       ierr = CeedVectorSetArray(impl->qvecsin[i], CEED_MEM_DEVICE,
331                                 CEED_USE_POINTER, edata[i]); CeedChkBackend(ierr);
332       break;
333     case CEED_EVAL_INTERP:
334       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis);
335       CeedChkBackend(ierr);
336       ierr = CeedBasisApply(basis, numelements, CEED_NOTRANSPOSE,
337                             CEED_EVAL_INTERP, impl->evecs[i],
338                             impl->qvecsin[i]); CeedChkBackend(ierr);
339       break;
340     case CEED_EVAL_GRAD:
341       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis);
342       CeedChkBackend(ierr);
343       ierr = CeedBasisApply(basis, numelements, CEED_NOTRANSPOSE,
344                             CEED_EVAL_GRAD, impl->evecs[i],
345                             impl->qvecsin[i]); CeedChkBackend(ierr);
346       break;
347     case CEED_EVAL_WEIGHT:
348       break; // No action
349     case CEED_EVAL_DIV:
350       break; // TODO: Not implemented
351     case CEED_EVAL_CURL:
352       break; // TODO: Not implemented
353     }
354   }
355   return CEED_ERROR_SUCCESS;
356 }
357 
358 //------------------------------------------------------------------------------
359 // Restore Input Vectors
360 //------------------------------------------------------------------------------
361 static inline int CeedOperatorRestoreInputs_Cuda(CeedInt numinputfields,
362     CeedQFunctionField *qfinputfields, CeedOperatorField *opinputfields,
363     const bool skipactive, CeedScalar *edata[2*CEED_FIELD_MAX],
364     CeedOperator_Cuda *impl) {
365   CeedInt ierr;
366   CeedEvalMode emode;
367   CeedVector vec;
368 
369   for (CeedInt i = 0; i < numinputfields; i++) {
370     // Skip active input
371     if (skipactive) {
372       ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr);
373       if (vec == CEED_VECTOR_ACTIVE)
374         continue;
375     }
376     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
377     CeedChkBackend(ierr);
378     if (emode == CEED_EVAL_WEIGHT) { // Skip
379     } else {
380       if (!impl->evecs[i]) {  // This was a skiprestrict case
381         ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr);
382         ierr = CeedVectorRestoreArrayRead(vec,
383                                           (const CeedScalar **)&edata[i]);
384         CeedChkBackend(ierr);
385       } else {
386         ierr = CeedVectorRestoreArrayRead(impl->evecs[i],
387                                           (const CeedScalar **) &edata[i]);
388         CeedChkBackend(ierr);
389       }
390     }
391   }
392   return CEED_ERROR_SUCCESS;
393 }
394 
395 //------------------------------------------------------------------------------
396 // Apply and add to output
397 //------------------------------------------------------------------------------
// Apply the operator and ADD the result into outvec:
//   1. gather inputs into E-vectors and apply input bases into Q-vectors,
//   2. run the QFunction on all quadrature points,
//   3. apply transposed output bases and scatter (transpose restriction,
//      which accumulates) into the output vectors.
// invec replaces the active input field; outvec receives the active output.
static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector invec,
                                     CeedVector outvec, CeedRequest *request) {
  int ierr;
  CeedOperator_Cuda *impl;
  ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr);
  CeedQFunction qf;
  ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr);
  CeedInt Q, numelements, elemsize, numinputfields, numoutputfields, size;
  ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr);
  ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr);
  CeedChkBackend(ierr);
  CeedOperatorField *opinputfields, *opoutputfields;
  ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields,
                               &numoutputfields, &opoutputfields);
  CeedChkBackend(ierr);
  CeedQFunctionField *qfinputfields, *qfoutputfields;
  ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields);
  CeedChkBackend(ierr);
  CeedEvalMode emode;
  CeedVector vec;
  CeedBasis basis;
  CeedElemRestriction Erestrict;
  // Device pointers for gathered input data [0, numinputfields) and output
  // E-vector data [numinputfields, numinputfields + numoutputfields)
  CeedScalar *edata[2*CEED_FIELD_MAX] = {0};

  // Setup (no-op after the first application)
  ierr = CeedOperatorSetup_Cuda(op); CeedChkBackend(ierr);

  // Input Evecs and Restriction
  ierr = CeedOperatorSetupInputs_Cuda(numinputfields, qfinputfields,
                                      opinputfields, invec, false, edata,
                                      impl, request); CeedChkBackend(ierr);

  // Input basis apply if needed
  ierr = CeedOperatorInputBasis_Cuda(numelements, qfinputfields, opinputfields,
                                     numinputfields, false, edata, impl);
  CeedChkBackend(ierr);

  // Output pointers, as necessary
  for (CeedInt i = 0; i < numoutputfields; i++) {
    ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
    CeedChkBackend(ierr);
    if (emode == CEED_EVAL_NONE) {
      // Set the output Q-Vector to use the E-Vector data directly,
      // so the QFunction writes straight into the output E-vector
      ierr = CeedVectorGetArrayWrite(impl->evecs[i + impl->numein], CEED_MEM_DEVICE,
                                     &edata[i + numinputfields]); CeedChkBackend(ierr);
      ierr = CeedVectorSetArray(impl->qvecsout[i], CEED_MEM_DEVICE,
                                CEED_USE_POINTER, edata[i + numinputfields]);
      CeedChkBackend(ierr);
    }
  }

  // Q function: one pointwise evaluation over all elements' quadrature points
  ierr = CeedQFunctionApply(qf, numelements * Q, impl->qvecsin, impl->qvecsout);
  CeedChkBackend(ierr);

  // Output basis apply if needed
  for (CeedInt i = 0; i < numoutputfields; i++) {
    // Get elemsize, emode, size
    // NOTE(review): elemsize and size are fetched but not used below
    ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
    CeedChkBackend(ierr);
    ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
    CeedChkBackend(ierr);
    ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
    CeedChkBackend(ierr);
    ierr = CeedQFunctionFieldGetSize(qfoutputfields[i], &size);
    CeedChkBackend(ierr);
    // Basis action (transposed: quadrature points back to nodes)
    switch (emode) {
    case CEED_EVAL_NONE:
      break; // QFunction already wrote into the E-vector via the alias above
    case CEED_EVAL_INTERP:
      ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis);
      CeedChkBackend(ierr);
      ierr = CeedBasisApply(basis, numelements, CEED_TRANSPOSE,
                            CEED_EVAL_INTERP, impl->qvecsout[i],
                            impl->evecs[i + impl->numein]); CeedChkBackend(ierr);
      break;
    case CEED_EVAL_GRAD:
      ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis);
      CeedChkBackend(ierr);
      ierr = CeedBasisApply(basis, numelements, CEED_TRANSPOSE,
                            CEED_EVAL_GRAD, impl->qvecsout[i],
                            impl->evecs[i + impl->numein]); CeedChkBackend(ierr);
      break;
    // LCOV_EXCL_START
    case CEED_EVAL_WEIGHT: {
      Ceed ceed;
      ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
      return CeedError(ceed, CEED_ERROR_BACKEND,
                       "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
      break; // Should not occur
    }
    case CEED_EVAL_DIV:
      break; // TODO: Not implemented
    case CEED_EVAL_CURL:
      break; // TODO: Not implemented
      // LCOV_EXCL_STOP
    }
  }

  // Output restriction: scatter-add E-vectors into the output L-vectors
  for (CeedInt i = 0; i < numoutputfields; i++) {
    // Restore evec (only NONE outputs took a write pointer above)
    ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
    CeedChkBackend(ierr);
    if (emode == CEED_EVAL_NONE) {
      ierr = CeedVectorRestoreArray(impl->evecs[i+impl->numein],
                                    &edata[i + numinputfields]);
      CeedChkBackend(ierr);
    }
    // Get output vector
    ierr = CeedOperatorFieldGetVector(opoutputfields[i], &vec);
    CeedChkBackend(ierr);
    // Restrict
    ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
    CeedChkBackend(ierr);
    // Active
    if (vec == CEED_VECTOR_ACTIVE)
      vec = outvec;

    ierr = CeedElemRestrictionApply(Erestrict, CEED_TRANSPOSE,
                                    impl->evecs[i + impl->numein], vec,
                                    request); CeedChkBackend(ierr);
  }

  // Restore input arrays
  ierr = CeedOperatorRestoreInputs_Cuda(numinputfields, qfinputfields,
                                        opinputfields, false, edata, impl);
  CeedChkBackend(ierr);
  return CEED_ERROR_SUCCESS;
}
529 
530 //------------------------------------------------------------------------------
531 // Core code for assembling linear QFunction
532 //------------------------------------------------------------------------------
533 static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op,
534     bool build_objects, CeedVector *assembled, CeedElemRestriction *rstr,
535     CeedRequest *request) {
536   int ierr;
537   CeedOperator_Cuda *impl;
538   ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr);
539   CeedQFunction qf;
540   ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr);
541   CeedInt Q, numelements, numinputfields, numoutputfields, size;
542   CeedSize q_size;
543   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr);
544   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr);
545   CeedChkBackend(ierr);
546   CeedOperatorField *opinputfields, *opoutputfields;
547   ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields,
548                                &numoutputfields, &opoutputfields);
549   CeedChkBackend(ierr);
550   CeedQFunctionField *qfinputfields, *qfoutputfields;
551   ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields);
552   CeedChkBackend(ierr);
553   CeedVector vec;
554   CeedInt numactivein = impl->qfnumactivein, numactiveout = impl->qfnumactiveout;
555   CeedVector *activein = impl->qfactivein;
556   CeedScalar *a, *tmp;
557   Ceed ceed, ceedparent;
558   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
559   ierr = CeedGetOperatorFallbackParentCeed(ceed, &ceedparent);
560   CeedChkBackend(ierr);
561   ceedparent = ceedparent ? ceedparent : ceed;
562   CeedScalar *edata[2*CEED_FIELD_MAX];
563 
564   // Setup
565   ierr = CeedOperatorSetup_Cuda(op); CeedChkBackend(ierr);
566 
567   // Check for identity
568   bool identityqf;
569   ierr = CeedQFunctionIsIdentity(qf, &identityqf); CeedChkBackend(ierr);
570   if (identityqf)
571     // LCOV_EXCL_START
572     return CeedError(ceed, CEED_ERROR_BACKEND,
573                      "Assembling identity QFunctions not supported");
574   // LCOV_EXCL_STOP
575 
576   // Input Evecs and Restriction
577   ierr = CeedOperatorSetupInputs_Cuda(numinputfields, qfinputfields,
578                                       opinputfields, NULL, true, edata,
579                                       impl, request); CeedChkBackend(ierr);
580 
581   // Count number of active input fields
582   if (!numactivein) {
583     for (CeedInt i=0; i<numinputfields; i++) {
584       // Get input vector
585       ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr);
586       // Check if active input
587       if (vec == CEED_VECTOR_ACTIVE) {
588         ierr = CeedQFunctionFieldGetSize(qfinputfields[i], &size); CeedChkBackend(ierr);
589         ierr = CeedVectorSetValue(impl->qvecsin[i], 0.0); CeedChkBackend(ierr);
590         ierr = CeedVectorGetArray(impl->qvecsin[i], CEED_MEM_DEVICE, &tmp);
591         CeedChkBackend(ierr);
592         ierr = CeedRealloc(numactivein + size, &activein); CeedChkBackend(ierr);
593         for (CeedInt field = 0; field < size; field++) {
594           q_size = (CeedSize)Q*numelements;
595           ierr = CeedVectorCreate(ceed, q_size, &activein[numactivein+field]);
596           CeedChkBackend(ierr);
597           ierr = CeedVectorSetArray(activein[numactivein+field], CEED_MEM_DEVICE,
598                                     CEED_USE_POINTER, &tmp[field*Q*numelements]);
599           CeedChkBackend(ierr);
600         }
601         numactivein += size;
602         ierr = CeedVectorRestoreArray(impl->qvecsin[i], &tmp); CeedChkBackend(ierr);
603       }
604     }
605     impl->qfnumactivein = numactivein;
606     impl->qfactivein = activein;
607   }
608 
609   // Count number of active output fields
610   if (!numactiveout) {
611     for (CeedInt i=0; i<numoutputfields; i++) {
612       // Get output vector
613       ierr = CeedOperatorFieldGetVector(opoutputfields[i], &vec);
614       CeedChkBackend(ierr);
615       // Check if active output
616       if (vec == CEED_VECTOR_ACTIVE) {
617         ierr = CeedQFunctionFieldGetSize(qfoutputfields[i], &size);
618         CeedChkBackend(ierr);
619         numactiveout += size;
620       }
621     }
622     impl->qfnumactiveout = numactiveout;
623   }
624 
625   // Check sizes
626   if (!numactivein || !numactiveout)
627     // LCOV_EXCL_START
628     return CeedError(ceed, CEED_ERROR_BACKEND,
629                      "Cannot assemble QFunction without active inputs "
630                      "and outputs");
631   // LCOV_EXCL_STOP
632 
633   // Build objects if needed
634   if (build_objects) {
635     // Create output restriction
636     CeedInt strides[3] = {1, numelements*Q, Q}; /* *NOPAD* */
637     ierr = CeedElemRestrictionCreateStrided(ceedparent, numelements, Q,
638                                             numactivein*numactiveout,
639                                             numactivein*numactiveout*numelements*Q,
640                                             strides, rstr); CeedChkBackend(ierr);
641     // Create assembled vector
642     CeedSize l_size = (CeedSize)numelements*Q*numactivein*numactiveout;
643     ierr = CeedVectorCreate(ceedparent, l_size, assembled); CeedChkBackend(ierr);
644   }
645   ierr = CeedVectorSetValue(*assembled, 0.0); CeedChkBackend(ierr);
646   ierr = CeedVectorGetArray(*assembled, CEED_MEM_DEVICE, &a);
647   CeedChkBackend(ierr);
648 
649   // Input basis apply
650   ierr = CeedOperatorInputBasis_Cuda(numelements, qfinputfields, opinputfields,
651                                      numinputfields, true, edata, impl);
652   CeedChkBackend(ierr);
653 
654   // Assemble QFunction
655   for (CeedInt in=0; in<numactivein; in++) {
656     // Set Inputs
657     ierr = CeedVectorSetValue(activein[in], 1.0); CeedChkBackend(ierr);
658     if (numactivein > 1) {
659       ierr = CeedVectorSetValue(activein[(in+numactivein-1)%numactivein],
660                                 0.0); CeedChkBackend(ierr);
661     }
662     // Set Outputs
663     for (CeedInt out=0; out<numoutputfields; out++) {
664       // Get output vector
665       ierr = CeedOperatorFieldGetVector(opoutputfields[out], &vec);
666       CeedChkBackend(ierr);
667       // Check if active output
668       if (vec == CEED_VECTOR_ACTIVE) {
669         CeedVectorSetArray(impl->qvecsout[out], CEED_MEM_DEVICE,
670                            CEED_USE_POINTER, a); CeedChkBackend(ierr);
671         ierr = CeedQFunctionFieldGetSize(qfoutputfields[out], &size);
672         CeedChkBackend(ierr);
673         a += size*Q*numelements; // Advance the pointer by the size of the output
674       }
675     }
676     // Apply QFunction
677     ierr = CeedQFunctionApply(qf, Q*numelements, impl->qvecsin, impl->qvecsout);
678     CeedChkBackend(ierr);
679   }
680 
681   // Un-set output Qvecs to prevent accidental overwrite of Assembled
682   for (CeedInt out=0; out<numoutputfields; out++) {
683     // Get output vector
684     ierr = CeedOperatorFieldGetVector(opoutputfields[out], &vec);
685     CeedChkBackend(ierr);
686     // Check if active output
687     if (vec == CEED_VECTOR_ACTIVE) {
688       ierr = CeedVectorTakeArray(impl->qvecsout[out], CEED_MEM_DEVICE, NULL);
689       CeedChkBackend(ierr);
690     }
691   }
692 
693   // Restore input arrays
694   ierr = CeedOperatorRestoreInputs_Cuda(numinputfields, qfinputfields,
695                                         opinputfields, true, edata, impl);
696   CeedChkBackend(ierr);
697 
698   // Restore output
699   ierr = CeedVectorRestoreArray(*assembled, &a); CeedChkBackend(ierr);
700 
701   return CEED_ERROR_SUCCESS;
702 }
703 
704 //------------------------------------------------------------------------------
705 // Assemble Linear QFunction
706 //------------------------------------------------------------------------------
// Assemble the linear QFunction for the first time: the core routine builds
// the assembled vector and its element restriction (build_objects = true).
static int CeedOperatorLinearAssembleQFunction_Cuda(CeedOperator op,
    CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) {
  return CeedOperatorLinearAssembleQFunctionCore_Cuda(op, true, assembled, rstr,
         request);
}
712 
713 //------------------------------------------------------------------------------
714 // Update Assembled Linear QFunction
715 //------------------------------------------------------------------------------
// Refresh a previously assembled linear QFunction in place: the core routine
// reuses the caller's existing vector and restriction (build_objects = false).
static int CeedOperatorLinearAssembleQFunctionUpdate_Cuda(CeedOperator op,
    CeedVector assembled, CeedElemRestriction rstr, CeedRequest *request) {
  return CeedOperatorLinearAssembleQFunctionCore_Cuda(op, false, &assembled,
         &rstr, request);
}
721 
722 //------------------------------------------------------------------------------
723 // Diagonal assembly kernels
724 //------------------------------------------------------------------------------
725 // *INDENT-OFF*
static const char *diagonalkernels = QUOTE(
// NOTE: this source is stringified by the QUOTE macro and JIT compiled at
// runtime. NUMEMODEIN, NUMEMODEOUT, NNODES, NQPTS, and NCOMP are injected as
// compile-time definitions by CeedOperatorAssembleDiagonalSetup_Cuda.
// Comments here are stripped by the host preprocessor before stringification.

// Local copy of the CeedEvalMode values so the JIT source is self-contained
typedef enum {
  /// Perform no evaluation (either because there is no data or it is already at
  /// quadrature points)
  CEED_EVAL_NONE   = 0,
  /// Interpolate from nodes to quadrature points
  CEED_EVAL_INTERP = 1,
  /// Evaluate gradients at quadrature points from input in a nodal basis
  CEED_EVAL_GRAD   = 2,
  /// Evaluate divergence at quadrature points from input in a nodal basis
  CEED_EVAL_DIV    = 4,
  /// Evaluate curl at quadrature points from input in a nodal basis
  CEED_EVAL_CURL   = 8,
  /// Using no input, evaluate quadrature weights on the reference element
  CEED_EVAL_WEIGHT = 16,
} CeedEvalMode;

//------------------------------------------------------------------------------
// Get Basis Emode Pointer
//------------------------------------------------------------------------------
// Select the basis matrix that corresponds to an eval mode: identity for
// CEED_EVAL_NONE, interp for CEED_EVAL_INTERP, grad for CEED_EVAL_GRAD.
// The remaining modes are rejected earlier, during QFunction assembly.
extern "C" __device__ void CeedOperatorGetBasisPointer_Cuda(const CeedScalar **basisptr,
    CeedEvalMode emode, const CeedScalar *identity, const CeedScalar *interp,
    const CeedScalar *grad) {
  switch (emode) {
  case CEED_EVAL_NONE:
    *basisptr = identity;
    break;
  case CEED_EVAL_INTERP:
    *basisptr = interp;
    break;
  case CEED_EVAL_GRAD:
    *basisptr = grad;
    break;
  case CEED_EVAL_WEIGHT:
  case CEED_EVAL_DIV:
  case CEED_EVAL_CURL:
    break; // Caught by QF Assembly
  }
}

//------------------------------------------------------------------------------
// Core code for diagonal assembly
//------------------------------------------------------------------------------
// Accumulate the (point block) diagonal of B^T D B into elemdiagarray.
// assembledqfarray holds D laid out as
// [emode_in][comp_in][emode_out][comp_out][elem][qpt] (slowest to fastest);
// entries whose magnitude is at most maxnorm*1e-12 are skipped as negligible.
__device__ void diagonalCore(const CeedInt nelem,
    const CeedScalar maxnorm, const bool pointBlock,
    const CeedScalar *identity,
    const CeedScalar *interpin, const CeedScalar *gradin,
    const CeedScalar *interpout, const CeedScalar *gradout,
    const CeedEvalMode *emodein, const CeedEvalMode *emodeout,
    const CeedScalar *__restrict__ assembledqfarray,
    CeedScalar *__restrict__ elemdiagarray) {
  const int tid = threadIdx.x; // running with P threads, tid is evec node
  // Drop tolerance relative to the max norm of the assembled QFunction data
  const CeedScalar qfvaluebound = maxnorm*1e-12;

  // Compute the diagonal of B^T D B
  // Each element (grid-stride loop over blockIdx.x / threadIdx.z)
  for (CeedInt e = blockIdx.x*blockDim.z + threadIdx.z; e < nelem;
       e += gridDim.x*blockDim.z) {
    CeedInt dout = -1;
    // Each basis eval mode pair
    for (CeedInt eout = 0; eout < NUMEMODEOUT; eout++) {
      const CeedScalar *bt = NULL;
      // Each CEED_EVAL_GRAD entry consumes the next NQPTS*NNODES slice of grad
      if (emodeout[eout] == CEED_EVAL_GRAD)
        dout += 1;
      CeedOperatorGetBasisPointer_Cuda(&bt, emodeout[eout], identity, interpout,
                                      &gradout[dout*NQPTS*NNODES]);
      CeedInt din = -1;
      for (CeedInt ein = 0; ein < NUMEMODEIN; ein++) {
        const CeedScalar *b = NULL;
        if (emodein[ein] == CEED_EVAL_GRAD)
          din += 1;
        CeedOperatorGetBasisPointer_Cuda(&b, emodein[ein], identity, interpin,
                                        &gradin[din*NQPTS*NNODES]);
        // Each component
        for (CeedInt compOut = 0; compOut < NCOMP; compOut++) {
          // Each qpoint/node pair
          if (pointBlock) {
            // Point Block Diagonal: keep every (compIn, compOut) pair
            for (CeedInt compIn = 0; compIn < NCOMP; compIn++) {
              CeedScalar evalue = 0.;
              for (CeedInt q = 0; q < NQPTS; q++) {
                const CeedScalar qfvalue =
                  assembledqfarray[((((ein*NCOMP+compIn)*NUMEMODEOUT+eout)*
                                     NCOMP+compOut)*nelem+e)*NQPTS+q];
                if (abs(qfvalue) > qfvaluebound)
                  evalue += bt[q*NNODES+tid] * qfvalue * b[q*NNODES+tid];
              }
              elemdiagarray[((compOut*NCOMP+compIn)*nelem+e)*NNODES+tid] += evalue;
            }
          } else {
            // Diagonal Only: use only the compIn == compOut entries of D
            CeedScalar evalue = 0.;
            for (CeedInt q = 0; q < NQPTS; q++) {
              const CeedScalar qfvalue =
                assembledqfarray[((((ein*NCOMP+compOut)*NUMEMODEOUT+eout)*
                                   NCOMP+compOut)*nelem+e)*NQPTS+q];
              if (abs(qfvalue) > qfvaluebound)
                evalue += bt[q*NNODES+tid] * qfvalue * b[q*NNODES+tid];
            }
            elemdiagarray[(compOut*nelem+e)*NNODES+tid] += evalue;
          }
        }
      }
    }
  }
}

//------------------------------------------------------------------------------
// Linear diagonal
//------------------------------------------------------------------------------
// Kernel entry point: scalar diagonal (one value per node and component)
extern "C" __global__ void linearDiagonal(const CeedInt nelem,
    const CeedScalar maxnorm, const CeedScalar *identity,
    const CeedScalar *interpin, const CeedScalar *gradin,
    const CeedScalar *interpout, const CeedScalar *gradout,
    const CeedEvalMode *emodein, const CeedEvalMode *emodeout,
    const CeedScalar *__restrict__ assembledqfarray,
    CeedScalar *__restrict__ elemdiagarray) {
  diagonalCore(nelem, maxnorm, false, identity, interpin, gradin, interpout,
               gradout, emodein, emodeout, assembledqfarray, elemdiagarray);
}

//------------------------------------------------------------------------------
// Linear point block diagonal
//------------------------------------------------------------------------------
// Kernel entry point: full NCOMP x NCOMP block per node
extern "C" __global__ void linearPointBlockDiagonal(const CeedInt nelem,
    const CeedScalar maxnorm, const CeedScalar *identity,
    const CeedScalar *interpin, const CeedScalar *gradin,
    const CeedScalar *interpout, const CeedScalar *gradout,
    const CeedEvalMode *emodein, const CeedEvalMode *emodeout,
    const CeedScalar *__restrict__ assembledqfarray,
    CeedScalar *__restrict__ elemdiagarray) {
  diagonalCore(nelem, maxnorm, true, identity, interpin, gradin, interpout,
               gradout, emodein, emodeout, assembledqfarray, elemdiagarray);
}

);
863 // *INDENT-ON*
864 
865 //------------------------------------------------------------------------------
866 // Create point block restriction
867 //------------------------------------------------------------------------------
868 static int CreatePBRestriction(CeedElemRestriction rstr,
869                                CeedElemRestriction *pbRstr) {
870   int ierr;
871   Ceed ceed;
872   ierr = CeedElemRestrictionGetCeed(rstr, &ceed); CeedChkBackend(ierr);
873   const CeedInt *offsets;
874   ierr = CeedElemRestrictionGetOffsets(rstr, CEED_MEM_HOST, &offsets);
875   CeedChkBackend(ierr);
876 
877   // Expand offsets
878   CeedInt nelem, ncomp, elemsize, compstride, max = 1, *pbOffsets;
879   ierr = CeedElemRestrictionGetNumElements(rstr, &nelem); CeedChkBackend(ierr);
880   ierr = CeedElemRestrictionGetNumComponents(rstr, &ncomp); CeedChkBackend(ierr);
881   ierr = CeedElemRestrictionGetElementSize(rstr, &elemsize); CeedChkBackend(ierr);
882   ierr = CeedElemRestrictionGetCompStride(rstr, &compstride);
883   CeedChkBackend(ierr);
884   CeedInt shift = ncomp;
885   if (compstride != 1)
886     shift *= ncomp;
887   ierr = CeedCalloc(nelem*elemsize, &pbOffsets); CeedChkBackend(ierr);
888   for (CeedInt i = 0; i < nelem*elemsize; i++) {
889     pbOffsets[i] = offsets[i]*shift;
890     if (pbOffsets[i] > max)
891       max = pbOffsets[i];
892   }
893 
894   // Create new restriction
895   ierr = CeedElemRestrictionCreate(ceed, nelem, elemsize, ncomp*ncomp, 1,
896                                    max + ncomp*ncomp, CEED_MEM_HOST,
897                                    CEED_OWN_POINTER, pbOffsets, pbRstr);
898   CeedChkBackend(ierr);
899 
900   // Cleanup
901   ierr = CeedElemRestrictionRestoreOffsets(rstr, &offsets); CeedChkBackend(ierr);
902 
903   return CEED_ERROR_SUCCESS;
904 }
905 
906 //------------------------------------------------------------------------------
907 // Assemble diagonal setup
908 //------------------------------------------------------------------------------
909 static inline int CeedOperatorAssembleDiagonalSetup_Cuda(CeedOperator op,
910     const bool pointBlock) {
911   int ierr;
912   Ceed ceed;
913   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
914   CeedQFunction qf;
915   ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr);
916   CeedInt numinputfields, numoutputfields;
917   ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields);
918   CeedChkBackend(ierr);
919 
920   // Determine active input basis
921   CeedOperatorField *opfields;
922   CeedQFunctionField *qffields;
923   ierr = CeedOperatorGetFields(op, NULL, &opfields, NULL, NULL);
924   CeedChkBackend(ierr);
925   ierr = CeedQFunctionGetFields(qf, NULL, &qffields, NULL, NULL);
926   CeedChkBackend(ierr);
927   CeedInt numemodein = 0, ncomp = 0, dim = 1;
928   CeedEvalMode *emodein = NULL;
929   CeedBasis basisin = NULL;
930   CeedElemRestriction rstrin = NULL;
931   for (CeedInt i = 0; i < numinputfields; i++) {
932     CeedVector vec;
933     ierr = CeedOperatorFieldGetVector(opfields[i], &vec); CeedChkBackend(ierr);
934     if (vec == CEED_VECTOR_ACTIVE) {
935       CeedElemRestriction rstr;
936       ierr = CeedOperatorFieldGetBasis(opfields[i], &basisin); CeedChkBackend(ierr);
937       ierr = CeedBasisGetNumComponents(basisin, &ncomp); CeedChkBackend(ierr);
938       ierr = CeedBasisGetDimension(basisin, &dim); CeedChkBackend(ierr);
939       ierr = CeedOperatorFieldGetElemRestriction(opfields[i], &rstr);
940       CeedChkBackend(ierr);
941       if (rstrin && rstrin != rstr)
942         // LCOV_EXCL_START
943         return CeedError(ceed, CEED_ERROR_BACKEND,
944                          "Multi-field non-composite operator diagonal assembly not supported");
945       // LCOV_EXCL_STOP
946       rstrin = rstr;
947       CeedEvalMode emode;
948       ierr = CeedQFunctionFieldGetEvalMode(qffields[i], &emode);
949       CeedChkBackend(ierr);
950       switch (emode) {
951       case CEED_EVAL_NONE:
952       case CEED_EVAL_INTERP:
953         ierr = CeedRealloc(numemodein + 1, &emodein); CeedChkBackend(ierr);
954         emodein[numemodein] = emode;
955         numemodein += 1;
956         break;
957       case CEED_EVAL_GRAD:
958         ierr = CeedRealloc(numemodein + dim, &emodein); CeedChkBackend(ierr);
959         for (CeedInt d = 0; d < dim; d++)
960           emodein[numemodein+d] = emode;
961         numemodein += dim;
962         break;
963       case CEED_EVAL_WEIGHT:
964       case CEED_EVAL_DIV:
965       case CEED_EVAL_CURL:
966         break; // Caught by QF Assembly
967       }
968     }
969   }
970 
971   // Determine active output basis
972   ierr = CeedOperatorGetFields(op, NULL, NULL, NULL, &opfields);
973   CeedChkBackend(ierr);
974   ierr = CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qffields);
975   CeedChkBackend(ierr);
976   CeedInt numemodeout = 0;
977   CeedEvalMode *emodeout = NULL;
978   CeedBasis basisout = NULL;
979   CeedElemRestriction rstrout = NULL;
980   for (CeedInt i = 0; i < numoutputfields; i++) {
981     CeedVector vec;
982     ierr = CeedOperatorFieldGetVector(opfields[i], &vec); CeedChkBackend(ierr);
983     if (vec == CEED_VECTOR_ACTIVE) {
984       CeedElemRestriction rstr;
985       ierr = CeedOperatorFieldGetBasis(opfields[i], &basisout); CeedChkBackend(ierr);
986       ierr = CeedOperatorFieldGetElemRestriction(opfields[i], &rstr);
987       CeedChkBackend(ierr);
988       if (rstrout && rstrout != rstr)
989         // LCOV_EXCL_START
990         return CeedError(ceed, CEED_ERROR_BACKEND,
991                          "Multi-field non-composite operator diagonal assembly not supported");
992       // LCOV_EXCL_STOP
993       rstrout = rstr;
994       CeedEvalMode emode;
995       ierr = CeedQFunctionFieldGetEvalMode(qffields[i], &emode); CeedChkBackend(ierr);
996       switch (emode) {
997       case CEED_EVAL_NONE:
998       case CEED_EVAL_INTERP:
999         ierr = CeedRealloc(numemodeout + 1, &emodeout); CeedChkBackend(ierr);
1000         emodeout[numemodeout] = emode;
1001         numemodeout += 1;
1002         break;
1003       case CEED_EVAL_GRAD:
1004         ierr = CeedRealloc(numemodeout + dim, &emodeout); CeedChkBackend(ierr);
1005         for (CeedInt d = 0; d < dim; d++)
1006           emodeout[numemodeout+d] = emode;
1007         numemodeout += dim;
1008         break;
1009       case CEED_EVAL_WEIGHT:
1010       case CEED_EVAL_DIV:
1011       case CEED_EVAL_CURL:
1012         break; // Caught by QF Assembly
1013       }
1014     }
1015   }
1016 
1017   // Operator data struct
1018   CeedOperator_Cuda *impl;
1019   ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr);
1020   ierr = CeedCalloc(1, &impl->diag); CeedChkBackend(ierr);
1021   CeedOperatorDiag_Cuda *diag = impl->diag;
1022   diag->basisin = basisin;
1023   diag->basisout = basisout;
1024   diag->h_emodein = emodein;
1025   diag->h_emodeout = emodeout;
1026   diag->numemodein = numemodein;
1027   diag->numemodeout = numemodeout;
1028 
1029   // Assemble kernel
1030   CeedInt nnodes, nqpts;
1031   ierr = CeedBasisGetNumNodes(basisin, &nnodes); CeedChkBackend(ierr);
1032   ierr = CeedBasisGetNumQuadraturePoints(basisin, &nqpts); CeedChkBackend(ierr);
1033   diag->nnodes = nnodes;
1034   ierr = CeedCompileCuda(ceed, diagonalkernels, &diag->module, 5,
1035                          "NUMEMODEIN", numemodein,
1036                          "NUMEMODEOUT", numemodeout,
1037                          "NNODES", nnodes,
1038                          "NQPTS", nqpts,
1039                          "NCOMP", ncomp
1040                         ); CeedChk_Cu(ceed, ierr);
1041   ierr = CeedGetKernelCuda(ceed, diag->module, "linearDiagonal",
1042                            &diag->linearDiagonal); CeedChk_Cu(ceed, ierr);
1043   ierr = CeedGetKernelCuda(ceed, diag->module, "linearPointBlockDiagonal",
1044                            &diag->linearPointBlock);
1045   CeedChk_Cu(ceed, ierr);
1046 
1047   // Basis matrices
1048   const CeedInt qBytes = nqpts * sizeof(CeedScalar);
1049   const CeedInt iBytes = qBytes * nnodes;
1050   const CeedInt gBytes = qBytes * nnodes * dim;
1051   const CeedInt eBytes = sizeof(CeedEvalMode);
1052   const CeedScalar *interpin, *interpout, *gradin, *gradout;
1053 
1054   // CEED_EVAL_NONE
1055   CeedScalar *identity = NULL;
1056   bool evalNone = false;
1057   for (CeedInt i=0; i<numemodein; i++)
1058     evalNone = evalNone || (emodein[i] == CEED_EVAL_NONE);
1059   for (CeedInt i=0; i<numemodeout; i++)
1060     evalNone = evalNone || (emodeout[i] == CEED_EVAL_NONE);
1061   if (evalNone) {
1062     ierr = CeedCalloc(nqpts*nnodes, &identity); CeedChkBackend(ierr);
1063     for (CeedInt i=0; i<(nnodes<nqpts?nnodes:nqpts); i++)
1064       identity[i*nnodes+i] = 1.0;
1065     ierr = cudaMalloc((void **)&diag->d_identity, iBytes); CeedChk_Cu(ceed, ierr);
1066     ierr = cudaMemcpy(diag->d_identity, identity, iBytes,
1067                       cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
1068   }
1069 
1070   // CEED_EVAL_INTERP
1071   ierr = CeedBasisGetInterp(basisin, &interpin); CeedChkBackend(ierr);
1072   ierr = cudaMalloc((void **)&diag->d_interpin, iBytes); CeedChk_Cu(ceed, ierr);
1073   ierr = cudaMemcpy(diag->d_interpin, interpin, iBytes,
1074                     cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
1075   ierr = CeedBasisGetInterp(basisout, &interpout); CeedChkBackend(ierr);
1076   ierr = cudaMalloc((void **)&diag->d_interpout, iBytes); CeedChk_Cu(ceed, ierr);
1077   ierr = cudaMemcpy(diag->d_interpout, interpout, iBytes,
1078                     cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
1079 
1080   // CEED_EVAL_GRAD
1081   ierr = CeedBasisGetGrad(basisin, &gradin); CeedChkBackend(ierr);
1082   ierr = cudaMalloc((void **)&diag->d_gradin, gBytes); CeedChk_Cu(ceed, ierr);
1083   ierr = cudaMemcpy(diag->d_gradin, gradin, gBytes,
1084                     cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
1085   ierr = CeedBasisGetGrad(basisout, &gradout); CeedChkBackend(ierr);
1086   ierr = cudaMalloc((void **)&diag->d_gradout, gBytes); CeedChk_Cu(ceed, ierr);
1087   ierr = cudaMemcpy(diag->d_gradout, gradout, gBytes,
1088                     cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
1089 
1090   // Arrays of emodes
1091   ierr = cudaMalloc((void **)&diag->d_emodein, numemodein * eBytes);
1092   CeedChk_Cu(ceed, ierr);
1093   ierr = cudaMemcpy(diag->d_emodein, emodein, numemodein * eBytes,
1094                     cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
1095   ierr = cudaMalloc((void **)&diag->d_emodeout, numemodeout * eBytes);
1096   CeedChk_Cu(ceed, ierr);
1097   ierr = cudaMemcpy(diag->d_emodeout, emodeout, numemodeout * eBytes,
1098                     cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
1099 
1100   // Restriction
1101   diag->diagrstr = rstrout;
1102 
1103   return CEED_ERROR_SUCCESS;
1104 }
1105 
1106 //------------------------------------------------------------------------------
1107 // Assemble diagonal common code
1108 //------------------------------------------------------------------------------
static inline int CeedOperatorAssembleDiagonalCore_Cuda(CeedOperator op,
    CeedVector assembled, CeedRequest *request, const bool pointBlock) {
  int ierr;
  Ceed ceed;
  ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
  CeedOperator_Cuda *impl;
  ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr);

  // Assemble QFunction (build or refresh the pointwise data D of B^T D B);
  // the returned restriction is not needed here, so release it immediately
  CeedVector assembledqf;
  CeedElemRestriction rstr;
  ierr = CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembledqf,
         &rstr, request); CeedChkBackend(ierr);
  ierr = CeedElemRestrictionDestroy(&rstr); CeedChkBackend(ierr);
  // Max norm of the assembled data; kernels use maxnorm*1e-12 as a drop
  // tolerance for negligible entries
  CeedScalar maxnorm = 0;
  ierr = CeedVectorNorm(assembledqf, CEED_NORM_MAX, &maxnorm);
  CeedChkBackend(ierr);

  // Setup on first use: JIT compile kernels, upload basis/eval mode data
  if (!impl->diag) {
    ierr = CeedOperatorAssembleDiagonalSetup_Cuda(op, pointBlock);
    CeedChkBackend(ierr);
  }
  CeedOperatorDiag_Cuda *diag = impl->diag;
  assert(diag != NULL);

  // Restriction: point block mode needs an expanded restriction with
  // ncomp*ncomp components, built lazily and cached on diag
  if (pointBlock && !diag->pbdiagrstr) {
    CeedElemRestriction pbdiagrstr;
    ierr = CreatePBRestriction(diag->diagrstr, &pbdiagrstr); CeedChkBackend(ierr);
    diag->pbdiagrstr = pbdiagrstr;
  }
  CeedElemRestriction diagrstr = pointBlock ? diag->pbdiagrstr : diag->diagrstr;

  // Create diagonal vector (cached per-element diagonal E-vector)
  CeedVector elemdiag = pointBlock ? diag->pbelemdiag : diag->elemdiag;
  if (!elemdiag) {
    ierr = CeedElemRestrictionCreateVector(diagrstr, NULL, &elemdiag);
    CeedChkBackend(ierr);
    if (pointBlock)
      diag->pbelemdiag = elemdiag;
    else
      diag->elemdiag = elemdiag;
  }
  // Kernels accumulate with +=, so clear the cached vector each call
  ierr = CeedVectorSetValue(elemdiag, 0.0); CeedChkBackend(ierr);

  // Assemble element operator diagonals
  CeedScalar *elemdiagarray;
  const CeedScalar *assembledqfarray;
  ierr = CeedVectorGetArray(elemdiag, CEED_MEM_DEVICE, &elemdiagarray);
  CeedChkBackend(ierr);
  ierr = CeedVectorGetArrayRead(assembledqf, CEED_MEM_DEVICE, &assembledqfarray);
  CeedChkBackend(ierr);
  CeedInt nelem;
  ierr = CeedElemRestrictionGetNumElements(diagrstr, &nelem);
  CeedChkBackend(ierr);

  // Compute the diagonal of B^T D B
  // Launch shape: x-dim = nnodes (one thread per element node), z-dim =
  // elements per block, grid = ceil(nelem/elemsPerBlock)
  int elemsPerBlock = 1;
  int grid = nelem/elemsPerBlock+((nelem/elemsPerBlock*elemsPerBlock<nelem)?1:0);
  void *args[] = {(void *) &nelem, (void *) &maxnorm, &diag->d_identity,
                  &diag->d_interpin, &diag->d_gradin, &diag->d_interpout,
                  &diag->d_gradout, &diag->d_emodein, &diag->d_emodeout,
                  &assembledqfarray, &elemdiagarray
                 };
  if (pointBlock) {
    ierr = CeedRunKernelDimCuda(ceed, diag->linearPointBlock, grid,
                                diag->nnodes, 1, elemsPerBlock, args);
    CeedChkBackend(ierr);
  } else {
    ierr = CeedRunKernelDimCuda(ceed, diag->linearDiagonal, grid,
                                diag->nnodes, 1, elemsPerBlock, args);
    CeedChkBackend(ierr);
  }

  // Restore arrays before applying the restriction
  ierr = CeedVectorRestoreArray(elemdiag, &elemdiagarray); CeedChkBackend(ierr);
  ierr = CeedVectorRestoreArrayRead(assembledqf, &assembledqfarray);
  CeedChkBackend(ierr);

  // Assemble local operator diagonal: gather the element diagonals into the
  // caller's assembled vector via the transpose restriction
  ierr = CeedElemRestrictionApply(diagrstr, CEED_TRANSPOSE, elemdiag,
                                  assembled, request); CeedChkBackend(ierr);

  // Cleanup
  ierr = CeedVectorDestroy(&assembledqf); CeedChkBackend(ierr);

  return CEED_ERROR_SUCCESS;
}
1198 
1199 //------------------------------------------------------------------------------
1200 // Assemble composite diagonal common code
1201 //------------------------------------------------------------------------------
1202 static inline int CeedOperatorLinearAssembleAddDiagonalCompositeCore_Cuda(
1203   CeedOperator op, CeedVector assembled, CeedRequest *request,
1204   const bool pointBlock) {
1205   int ierr;
1206   CeedInt numSub;
1207   CeedOperator *subOperators;
1208   ierr = CeedOperatorGetNumSub(op, &numSub); CeedChkBackend(ierr);
1209   ierr = CeedOperatorGetSubList(op, &subOperators); CeedChkBackend(ierr);
1210   for (CeedInt i = 0; i < numSub; i++) {
1211     ierr = CeedOperatorAssembleDiagonalCore_Cuda(subOperators[i], assembled,
1212            request, pointBlock); CeedChkBackend(ierr);
1213   }
1214   return CEED_ERROR_SUCCESS;
1215 }
1216 
1217 //------------------------------------------------------------------------------
1218 // Assemble Linear Diagonal
1219 //------------------------------------------------------------------------------
1220 static int CeedOperatorLinearAssembleAddDiagonal_Cuda(CeedOperator op,
1221     CeedVector assembled, CeedRequest *request) {
1222   int ierr;
1223   bool isComposite;
1224   ierr = CeedOperatorIsComposite(op, &isComposite); CeedChkBackend(ierr);
1225   if (isComposite) {
1226     return CeedOperatorLinearAssembleAddDiagonalCompositeCore_Cuda(op, assembled,
1227            request, false);
1228   } else {
1229     return CeedOperatorAssembleDiagonalCore_Cuda(op, assembled, request, false);
1230   }
1231 }
1232 
1233 //------------------------------------------------------------------------------
1234 // Assemble Linear Point Block Diagonal
1235 //------------------------------------------------------------------------------
1236 static int CeedOperatorLinearAssembleAddPointBlockDiagonal_Cuda(CeedOperator op,
1237     CeedVector assembled, CeedRequest *request) {
1238   int ierr;
1239   bool isComposite;
1240   ierr = CeedOperatorIsComposite(op, &isComposite); CeedChkBackend(ierr);
1241   if (isComposite) {
1242     return CeedOperatorLinearAssembleAddDiagonalCompositeCore_Cuda(op, assembled,
1243            request, true);
1244   } else {
1245     return CeedOperatorAssembleDiagonalCore_Cuda(op, assembled, request, true);
1246   }
1247 }
1248 
1249 //------------------------------------------------------------------------------
1250 // Matrix assembly kernel for low-order elements (2D thread block)
1251 //------------------------------------------------------------------------------
1252 // *INDENT-OFF*
static const char *assemblykernel = QUOTE(
// JIT-compiled full assembly kernel, 2D thread block: thread (i, l) computes
// entry C_il of each element's NNODES x NNODES output block for every
// component pair. BLOCK_SIZE, NNODES, NQPTS, NCOMP, NELEM, NUMEMODEIN, and
// NUMEMODEOUT are presumably injected as compile-time definitions by the
// setup routine — the compile call is outside this excerpt, TODO confirm.
// Comments here are stripped by the host preprocessor before stringification.
extern "C" __launch_bounds__(BLOCK_SIZE)
           __global__ void linearAssemble(const CeedScalar *B_in, const CeedScalar *B_out,
                   const CeedScalar *__restrict__ qf_array,
                   CeedScalar *__restrict__ values_array) {

  // This kernel assumes B_in and B_out have the same number of quadrature points and
  // basis points.
  // TODO: expand to more general cases
  const int i = threadIdx.x; // The output row index of each B^TDB operation
  const int l = threadIdx.y; // The output column index of each B^TDB operation
			     // such that we have (Bout^T)_ij D_jk Bin_kl = C_il

  // Strides for final output ordering, determined by the reference (interface) implementation of
  // the symbolic assembly, slowest --> fastest: element, comp_in, comp_out, node_row, node_col
  const CeedInt comp_out_stride = NNODES * NNODES;
  const CeedInt comp_in_stride = comp_out_stride * NCOMP;
  const CeedInt e_stride = comp_in_stride * NCOMP;
  // Strides for QF array, slowest --> fastest:  emode_in, comp_in, emode_out, comp_out, elem, qpt
  const CeedInt qe_stride = NQPTS;
  const CeedInt qcomp_out_stride = NELEM * qe_stride;
  const CeedInt qemode_out_stride = qcomp_out_stride * NCOMP;
  const CeedInt qcomp_in_stride = qemode_out_stride * NUMEMODEOUT;
  const CeedInt qemode_in_stride = qcomp_in_stride * NCOMP;

  // Loop over each element (if necessary)
  // (grid-stride loop over blockIdx.x / threadIdx.z)
  for (CeedInt e = blockIdx.x*blockDim.z + threadIdx.z; e < NELEM;
         e += gridDim.x*blockDim.z) {
    for (CeedInt comp_in = 0; comp_in < NCOMP; comp_in++) {
      for (CeedInt comp_out = 0; comp_out < NCOMP; comp_out++) {
        // Accumulate C_il over all eval mode pairs and quadrature points
        CeedScalar result = 0.0;
        CeedInt qf_index_comp = qcomp_in_stride * comp_in + qcomp_out_stride * comp_out + qe_stride * e;
        for (CeedInt emode_in = 0; emode_in < NUMEMODEIN; emode_in++) {
          CeedInt b_in_index = emode_in * NQPTS * NNODES;
      	  for (CeedInt emode_out = 0; emode_out < NUMEMODEOUT; emode_out++) {
             CeedInt b_out_index = emode_out * NQPTS * NNODES;
             CeedInt qf_index = qf_index_comp + qemode_out_stride * emode_out + qemode_in_stride * emode_in;
 	     // Perform the B^T D B operation for this 'chunk' of D (the qf_array)
            for (CeedInt j = 0; j < NQPTS; j++) {
     	      result += B_out[b_out_index + j * NNODES  + i] * qf_array[qf_index + j] * B_in[b_in_index + j * NNODES + l];
	    }

          }// end of emode_out
        } // end of emode_in
        // Write the finished entry; assignment (not +=), each thread owns it
        CeedInt val_index = comp_in_stride * comp_in + comp_out_stride * comp_out + e_stride * e + NNODES * i + l;
   	values_array[val_index] = result;
      } // end of out component
    } // end of in component
  } // end of element loop
}
);
1304 
1305 //------------------------------------------------------------------------------
1306 // Fallback kernel for larger orders (1D thread block)
1307 //------------------------------------------------------------------------------
static const char *assemblykernelbigelem = QUOTE(
// Fallback assembly kernel, 1D thread block: thread l computes column l of
// each element's NNODES x NNODES output block, iterating rows i serially.
// Used per the section header when the 2D (NNODES x NNODES) thread block of
// the main kernel would be too large. Same JIT macros as linearAssemble;
// comments here are stripped by the host preprocessor before stringification.
extern "C" __launch_bounds__(BLOCK_SIZE)
           __global__ void linearAssemble(const CeedScalar *B_in, const CeedScalar *B_out,
                   const CeedScalar *__restrict__ qf_array,
                   CeedScalar *__restrict__ values_array) {

  // This kernel assumes B_in and B_out have the same number of quadrature points and
  // basis points.
  // TODO: expand to more general cases
  const int l = threadIdx.x; // The output column index of each B^TDB operation
			     // such that we have (Bout^T)_ij D_jk Bin_kl = C_il

  // Strides for final output ordering, determined by the reference (interface) implementation of
  // the symbolic assembly, slowest --> fastest: element, comp_in, comp_out, node_row, node_col
  const CeedInt comp_out_stride = NNODES * NNODES;
  const CeedInt comp_in_stride = comp_out_stride * NCOMP;
  const CeedInt e_stride = comp_in_stride * NCOMP;
  // Strides for QF array, slowest --> fastest:  emode_in, comp_in, emode_out, comp_out, elem, qpt
  const CeedInt qe_stride = NQPTS;
  const CeedInt qcomp_out_stride = NELEM * qe_stride;
  const CeedInt qemode_out_stride = qcomp_out_stride * NCOMP;
  const CeedInt qcomp_in_stride = qemode_out_stride * NUMEMODEOUT;
  const CeedInt qemode_in_stride = qcomp_in_stride * NCOMP;

    // Loop over each element (if necessary)
    // (grid-stride loop over blockIdx.x / threadIdx.z)
  for (CeedInt e = blockIdx.x*blockDim.z + threadIdx.z; e < NELEM;
         e += gridDim.x*blockDim.z) {
    for (CeedInt comp_in = 0; comp_in < NCOMP; comp_in++) {
      for (CeedInt comp_out = 0; comp_out < NCOMP; comp_out++) {
        // Serial loop over output rows; only columns are parallelized here
        for (CeedInt i = 0; i < NNODES; i++) {
          CeedScalar result = 0.0;
          CeedInt qf_index_comp = qcomp_in_stride * comp_in + qcomp_out_stride * comp_out + qe_stride * e;
          for (CeedInt emode_in = 0; emode_in < NUMEMODEIN; emode_in++) {
            CeedInt b_in_index = emode_in * NQPTS * NNODES;
        	  for (CeedInt emode_out = 0; emode_out < NUMEMODEOUT; emode_out++) {
               CeedInt b_out_index = emode_out * NQPTS * NNODES;
               CeedInt qf_index = qf_index_comp + qemode_out_stride * emode_out + qemode_in_stride * emode_in;
   	     // Perform the B^T D B operation for this 'chunk' of D (the qf_array)
              for (CeedInt j = 0; j < NQPTS; j++) {
       	      result += B_out[b_out_index + j * NNODES  + i] * qf_array[qf_index + j] * B_in[b_in_index + j * NNODES + l];
  	    }

            }// end of emode_out
          } // end of emode_in
          // Write the finished entry; assignment (not +=), each thread owns it
          CeedInt val_index = comp_in_stride * comp_in + comp_out_stride * comp_out + e_stride * e + NNODES * i + l;
     	  values_array[val_index] = result;
        } // end of loop over element node index, i
      } // end of out component
    } // end of in component
  } // end of element loop
}
);
1360 // *INDENT-ON*
1361 
1362 //------------------------------------------------------------------------------
1363 // Single operator assembly setup
1364 //------------------------------------------------------------------------------
1365 static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op) {
1366   int ierr;
1367   Ceed ceed;
1368   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
1369   CeedOperator_Cuda *impl;
1370   ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr);
1371 
1372   // Get intput and output fields
1373   CeedInt num_input_fields, num_output_fields;
1374   CeedOperatorField *input_fields;
1375   CeedOperatorField *output_fields;
1376   ierr = CeedOperatorGetFields(op, &num_input_fields, &input_fields,
1377                                &num_output_fields, &output_fields); CeedChkBackend(ierr);
1378 
1379   // Determine active input basis eval mode
1380   CeedQFunction qf;
1381   ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr);
1382   CeedQFunctionField *qf_fields;
1383   ierr = CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL);
1384   CeedChkBackend(ierr);
1385   // Note that the kernel will treat each dimension of a gradient action separately;
1386   // i.e., when an active input has a CEED_EVAL_GRAD mode, num_emode_in will increment
1387   // by dim.  However, for the purposes of loading the B matrices, it will be treated
1388   // as one mode, and we will load/copy the entire gradient matrix at once, so
1389   // num_B_in_mats_to_load will be incremented by 1.
1390   CeedInt num_emode_in = 0, dim = 1, num_B_in_mats_to_load = 0, size_B_in = 0;
1391   CeedEvalMode *eval_mode_in = NULL; //will be of size num_B_in_mats_load
1392   CeedBasis basis_in = NULL;
1393   CeedInt nqpts = 0, esize = 0;
1394   CeedElemRestriction rstr_in = NULL;
1395   for (CeedInt i=0; i<num_input_fields; i++) {
1396     CeedVector vec;
1397     ierr = CeedOperatorFieldGetVector(input_fields[i], &vec); CeedChkBackend(ierr);
1398     if (vec == CEED_VECTOR_ACTIVE) {
1399       ierr = CeedOperatorFieldGetBasis(input_fields[i], &basis_in);
1400       CeedChkBackend(ierr);
1401       ierr = CeedBasisGetDimension(basis_in, &dim); CeedChkBackend(ierr);
1402       ierr = CeedBasisGetNumQuadraturePoints(basis_in, &nqpts); CeedChkBackend(ierr);
1403       ierr = CeedOperatorFieldGetElemRestriction(input_fields[i], &rstr_in);
1404       CeedChkBackend(ierr);
1405       ierr = CeedElemRestrictionGetElementSize(rstr_in, &esize); CeedChkBackend(ierr);
1406       CeedEvalMode eval_mode;
1407       ierr = CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode);
1408       CeedChkBackend(ierr);
1409       if (eval_mode != CEED_EVAL_NONE) {
1410         ierr = CeedRealloc(num_B_in_mats_to_load + 1, &eval_mode_in);
1411         CeedChkBackend(ierr);
1412         eval_mode_in[num_B_in_mats_to_load] = eval_mode;
1413         num_B_in_mats_to_load += 1;
1414         if (eval_mode == CEED_EVAL_GRAD) {
1415           num_emode_in += dim;
1416           size_B_in += dim * esize * nqpts;
1417         } else {
1418           num_emode_in +=1;
1419           size_B_in += esize * nqpts;
1420         }
1421       }
1422     }
1423   }
1424 
1425   // Determine active output basis; basis_out and rstr_out only used if same as input, TODO
1426   ierr = CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields);
1427   CeedChkBackend(ierr);
1428   CeedInt num_emode_out = 0, num_B_out_mats_to_load = 0, size_B_out = 0;
1429   CeedEvalMode *eval_mode_out = NULL;
1430   CeedBasis basis_out = NULL;
1431   CeedElemRestriction rstr_out = NULL;
1432   for (CeedInt i=0; i<num_output_fields; i++) {
1433     CeedVector vec;
1434     ierr = CeedOperatorFieldGetVector(output_fields[i], &vec); CeedChkBackend(ierr);
1435     if (vec == CEED_VECTOR_ACTIVE) {
1436       ierr = CeedOperatorFieldGetBasis(output_fields[i], &basis_out);
1437       CeedChkBackend(ierr);
1438       ierr = CeedOperatorFieldGetElemRestriction(output_fields[i], &rstr_out);
1439       CeedChkBackend(ierr);
1440       if (rstr_out && rstr_out != rstr_in)
1441         // LCOV_EXCL_START
1442         return CeedError(ceed, CEED_ERROR_BACKEND,
1443                          "Multi-field non-composite operator assembly not supported");
1444       // LCOV_EXCL_STOP
1445       CeedEvalMode eval_mode;
1446       ierr = CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode);
1447       CeedChkBackend(ierr);
1448       if (eval_mode != CEED_EVAL_NONE) {
1449         ierr = CeedRealloc(num_B_out_mats_to_load + 1, &eval_mode_out);
1450         CeedChkBackend(ierr);
1451         eval_mode_out[num_B_out_mats_to_load] = eval_mode;
1452         num_B_out_mats_to_load += 1;
1453         if (eval_mode == CEED_EVAL_GRAD) {
1454           num_emode_out += dim;
1455           size_B_out += dim * esize * nqpts;
1456         } else {
1457           num_emode_out +=1;
1458           size_B_out += esize * nqpts;
1459         }
1460       }
1461     }
1462   }
1463 
1464   if (num_emode_in == 0 || num_emode_out == 0)
1465     // LCOV_EXCL_START
1466     return CeedError(ceed, CEED_ERROR_UNSUPPORTED,
1467                      "Cannot assemble operator without inputs/outputs");
1468   // LCOV_EXCL_STOP
1469 
1470   CeedInt nelem, ncomp;
1471   ierr = CeedElemRestrictionGetNumElements(rstr_in, &nelem); CeedChkBackend(ierr);
1472   ierr = CeedElemRestrictionGetNumComponents(rstr_in, &ncomp);
1473   CeedChkBackend(ierr);
1474 
1475   ierr = CeedCalloc(1, &impl->asmb); CeedChkBackend(ierr);
1476   CeedOperatorAssemble_Cuda *asmb = impl->asmb;
1477   asmb->nelem = nelem;
1478 
1479   // Compile kernels
1480   int elemsPerBlock = 1;
1481   asmb->elemsPerBlock = elemsPerBlock;
1482   CeedInt block_size = esize * esize * elemsPerBlock;
1483   Ceed_Cuda *cuda_data;
1484   ierr = CeedGetData(ceed, &cuda_data); CeedChkBackend(ierr);
1485   if (block_size > cuda_data->device_prop.maxThreadsPerBlock) {
1486     // Use fallback kernel with 1D threadblock
1487     block_size = esize * elemsPerBlock;
1488     asmb->block_size_x = esize;
1489     asmb->block_size_y = 1;
1490     ierr = CeedCompileCuda(ceed, assemblykernelbigelem, &asmb->module, 7,
1491                            "NELEM", nelem,
1492                            "NUMEMODEIN", num_emode_in,
1493                            "NUMEMODEOUT", num_emode_out,
1494                            "NQPTS", nqpts,
1495                            "NNODES", esize,
1496                            "BLOCK_SIZE", block_size,
1497                            "NCOMP", ncomp
1498                           ); CeedChk_Cu(ceed, ierr);
1499   } else {  // Use kernel with 2D threadblock
1500     asmb->block_size_x = esize;
1501     asmb->block_size_y = esize;
1502     ierr = CeedCompileCuda(ceed, assemblykernel, &asmb->module, 7,
1503                            "NELEM", nelem,
1504                            "NUMEMODEIN", num_emode_in,
1505                            "NUMEMODEOUT", num_emode_out,
1506                            "NQPTS", nqpts,
1507                            "NNODES", esize,
1508                            "BLOCK_SIZE", block_size,
1509                            "NCOMP", ncomp
1510                           ); CeedChk_Cu(ceed, ierr);
1511   }
1512   ierr = CeedGetKernelCuda(ceed, asmb->module, "linearAssemble",
1513                            &asmb->linearAssemble); CeedChk_Cu(ceed, ierr);
1514 
1515   // Build 'full' B matrices (not 1D arrays used for tensor-product matrices)
1516   const CeedScalar *interp_in, *grad_in;
1517   ierr = CeedBasisGetInterp(basis_in, &interp_in); CeedChkBackend(ierr);
1518   ierr = CeedBasisGetGrad(basis_in, &grad_in); CeedChkBackend(ierr);
1519 
1520   // Load into B_in, in order that they will be used in eval_mode
1521   const CeedInt inBytes = size_B_in * sizeof(CeedScalar);
1522   CeedInt mat_start = 0;
1523   ierr = cudaMalloc((void **) &asmb->d_B_in, inBytes); CeedChk_Cu(ceed, ierr);
1524   for (int i = 0; i < num_B_in_mats_to_load; i++) {
1525     CeedEvalMode eval_mode = eval_mode_in[i];
1526     if (eval_mode == CEED_EVAL_INTERP) {
1527       ierr = cudaMemcpy(&asmb->d_B_in[mat_start], interp_in,
1528                         esize * nqpts * sizeof(CeedScalar),
1529                         cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
1530       mat_start += esize * nqpts;
1531     } else if (eval_mode == CEED_EVAL_GRAD) {
1532       ierr = cudaMemcpy(asmb->d_B_in, grad_in,
1533                         dim * esize * nqpts * sizeof(CeedScalar),
1534                         cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
1535       mat_start += dim * esize * nqpts;
1536     }
1537   }
1538 
1539   const CeedScalar *interp_out, *grad_out;
1540   // Note that this function currently assumes 1 basis, so this should always be true
1541   // for now
1542   if (basis_out == basis_in) {
1543     interp_out = interp_in;
1544     grad_out = grad_in;
1545   } else {
1546     ierr = CeedBasisGetInterp(basis_out, &interp_out); CeedChkBackend(ierr);
1547     ierr = CeedBasisGetGrad(basis_out, &grad_out); CeedChkBackend(ierr);
1548   }
1549 
1550   // Load into B_out, in order that they will be used in eval_mode
1551   const CeedInt outBytes = size_B_out * sizeof(CeedScalar);
1552   mat_start = 0;
1553   ierr = cudaMalloc((void **) &asmb->d_B_out, outBytes); CeedChk_Cu(ceed, ierr);
1554   for (int i = 0; i < num_B_out_mats_to_load; i++) {
1555     CeedEvalMode eval_mode = eval_mode_out[i];
1556     if (eval_mode == CEED_EVAL_INTERP) {
1557       ierr = cudaMemcpy(&asmb->d_B_out[mat_start], interp_out,
1558                         esize * nqpts * sizeof(CeedScalar),
1559                         cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
1560       mat_start += esize * nqpts;
1561     } else if (eval_mode == CEED_EVAL_GRAD) {
1562       ierr = cudaMemcpy(&asmb->d_B_out[mat_start], grad_out,
1563                         dim * esize * nqpts * sizeof(CeedScalar),
1564                         cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
1565       mat_start += dim * esize * nqpts;
1566     }
1567   }
1568   return CEED_ERROR_SUCCESS;
1569 }
1570 
1571 //------------------------------------------------------------------------------
1572 // Single operator assembly
1573 //------------------------------------------------------------------------------
1574 static int CeedSingleOperatorAssemble_Cuda(CeedOperator op, CeedInt offset,
1575     CeedVector values) {
1576 
1577   int ierr;
1578   Ceed ceed;
1579   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
1580   CeedOperator_Cuda *impl;
1581   ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr);
1582 
1583   // Setup
1584   if (!impl->asmb) {
1585     ierr = CeedSingleOperatorAssembleSetup_Cuda(op);
1586     CeedChkBackend(ierr);
1587     assert(impl->asmb != NULL);
1588   }
1589 
1590   // Assemble QFunction
1591   CeedVector assembled_qf;
1592   CeedElemRestriction rstr_q;
1593   ierr = CeedOperatorLinearAssembleQFunctionBuildOrUpdate(
1594            op, &assembled_qf, &rstr_q, CEED_REQUEST_IMMEDIATE); CeedChkBackend(ierr);
1595   ierr = CeedElemRestrictionDestroy(&rstr_q); CeedChkBackend(ierr);
1596   CeedScalar *values_array;
1597   ierr = CeedVectorGetArrayWrite(values, CEED_MEM_DEVICE, &values_array);
1598   CeedChkBackend(ierr);
1599   values_array += offset;
1600   const CeedScalar *qf_array;
1601   ierr = CeedVectorGetArrayRead(assembled_qf, CEED_MEM_DEVICE, &qf_array);
1602   CeedChkBackend(ierr);
1603 
1604   // Compute B^T D B
1605   const CeedInt nelem = impl->asmb->nelem;
1606   const CeedInt elemsPerBlock = impl->asmb->elemsPerBlock;
1607   const CeedInt grid = nelem/elemsPerBlock+((
1608                          nelem/elemsPerBlock*elemsPerBlock<nelem)?1:0);
1609   void *args[] = {&impl->asmb->d_B_in, &impl->asmb->d_B_out,
1610                   &qf_array, &values_array
1611                  };
1612   ierr = CeedRunKernelDimCuda(ceed, impl->asmb->linearAssemble, grid,
1613                               impl->asmb->block_size_x, impl->asmb->block_size_y,
1614                               elemsPerBlock, args);
1615   CeedChkBackend(ierr);
1616 
1617 
1618   // Restore arrays
1619   ierr = CeedVectorRestoreArray(values, &values_array); CeedChkBackend(ierr);
1620   ierr = CeedVectorRestoreArrayRead(assembled_qf, &qf_array);
1621   CeedChkBackend(ierr);
1622 
1623   // Cleanup
1624   ierr = CeedVectorDestroy(&assembled_qf); CeedChkBackend(ierr);
1625 
1626   return CEED_ERROR_SUCCESS;
1627 }
1628 
1629 //------------------------------------------------------------------------------
1630 // Assemble matrix data for COO matrix of assembled operator.
1631 // The sparsity pattern is set by CeedOperatorLinearAssembleSymbolic.
1632 //
1633 // Note that this (and other assembly routines) currently assume only one
1634 // active input restriction/basis per operator (could have multiple basis eval
1635 // modes).
1636 // TODO: allow multiple active input restrictions/basis objects
1637 //------------------------------------------------------------------------------
1638 int CeedOperatorLinearAssemble_Cuda(CeedOperator op, CeedVector values) {
1639 
1640   // As done in the default implementation, loop through suboperators
1641   // for composite operators, or call single operator assembly otherwise
1642   bool is_composite;
1643   CeedInt ierr;
1644   ierr = CeedOperatorIsComposite(op, &is_composite); CeedChkBackend(ierr);
1645 
1646   CeedElemRestriction rstr;
1647   CeedInt num_elem, elem_size, num_comp;
1648 
1649   CeedInt offset = 0;
1650   if (is_composite) {
1651     CeedInt num_suboperators;
1652     ierr = CeedOperatorGetNumSub(op, &num_suboperators); CeedChkBackend(ierr);
1653     CeedOperator *sub_operators;
1654     ierr = CeedOperatorGetSubList(op, &sub_operators); CeedChkBackend(ierr);
1655     for (int k = 0; k < num_suboperators; ++k) {
1656       ierr = CeedSingleOperatorAssemble_Cuda(sub_operators[k], offset, values);
1657       CeedChkBackend(ierr);
1658       ierr = CeedOperatorGetActiveElemRestriction(sub_operators[k], &rstr);
1659       CeedChkBackend(ierr);
1660       ierr = CeedElemRestrictionGetNumElements(rstr, &num_elem); CeedChkBackend(ierr);
1661       ierr = CeedElemRestrictionGetElementSize(rstr, &elem_size);
1662       CeedChkBackend(ierr);
1663       ierr = CeedElemRestrictionGetNumComponents(rstr, &num_comp);
1664       CeedChkBackend(ierr);
1665       offset += elem_size*num_comp * elem_size*num_comp * num_elem;
1666     }
1667   } else {
1668     ierr = CeedSingleOperatorAssemble_Cuda(op, offset, values);
1669     CeedChkBackend(ierr);
1670   }
1671 
1672   return CEED_ERROR_SUCCESS;
1673 }
1674 //------------------------------------------------------------------------------
1675 // Create operator
1676 //------------------------------------------------------------------------------
1677 int CeedOperatorCreate_Cuda(CeedOperator op) {
1678   int ierr;
1679   Ceed ceed;
1680   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
1681   CeedOperator_Cuda *impl;
1682 
1683   ierr = CeedCalloc(1, &impl); CeedChkBackend(ierr);
1684   ierr = CeedOperatorSetData(op, impl); CeedChkBackend(ierr);
1685 
1686   ierr = CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction",
1687                                 CeedOperatorLinearAssembleQFunction_Cuda);
1688   CeedChkBackend(ierr);
1689   ierr = CeedSetBackendFunction(ceed, "Operator", op,
1690                                 "LinearAssembleQFunctionUpdate",
1691                                 CeedOperatorLinearAssembleQFunctionUpdate_Cuda);
1692   CeedChkBackend(ierr);
1693   ierr = CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal",
1694                                 CeedOperatorLinearAssembleAddDiagonal_Cuda);
1695   CeedChkBackend(ierr);
1696   ierr = CeedSetBackendFunction(ceed, "Operator", op,
1697                                 "LinearAssembleAddPointBlockDiagonal",
1698                                 CeedOperatorLinearAssembleAddPointBlockDiagonal_Cuda);
1699   CeedChkBackend(ierr);
1700   ierr = CeedSetBackendFunction(ceed, "Operator", op,
1701                                 "LinearAssemble", CeedOperatorLinearAssemble_Cuda);
1702   CeedChkBackend(ierr);
1703   ierr = CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd",
1704                                 CeedOperatorApplyAdd_Cuda); CeedChkBackend(ierr);
1705   ierr = CeedSetBackendFunction(ceed, "Operator", op, "Destroy",
1706                                 CeedOperatorDestroy_Cuda); CeedChkBackend(ierr);
1707   return CEED_ERROR_SUCCESS;
1708 }
1709 
1710 //------------------------------------------------------------------------------
1711 // Composite Operator Create
1712 //------------------------------------------------------------------------------
1713 int CeedCompositeOperatorCreate_Cuda(CeedOperator op) {
1714   int ierr;
1715   Ceed ceed;
1716   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
1717 
1718   ierr = CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal",
1719                                 CeedOperatorLinearAssembleAddDiagonal_Cuda);
1720   CeedChkBackend(ierr);
1721   ierr = CeedSetBackendFunction(ceed, "Operator", op,
1722                                 "LinearAssembleAddPointBlockDiagonal",
1723                                 CeedOperatorLinearAssembleAddPointBlockDiagonal_Cuda);
1724   CeedChkBackend(ierr);
1725   ierr = CeedSetBackendFunction(ceed, "Operator", op,
1726                                 "LinearAssemble", CeedOperatorLinearAssemble_Cuda);
1727   CeedChkBackend(ierr);
1728   return CEED_ERROR_SUCCESS;
1729 }
1730 //------------------------------------------------------------------------------
1731