xref: /libCEED/backends/cuda-ref/ceed-cuda-ref-operator.c (revision 60bfe8bb0f69ce42012a1db84ec2f2f6908cc98b)
1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #include <ceed/ceed.h>
9 #include <ceed/backend.h>
10 #include <assert.h>
11 #include <cuda.h>
12 #include <cuda_runtime.h>
13 #include <stdbool.h>
14 #include <string.h>
15 #include "ceed-cuda-ref.h"
16 #include "../cuda/ceed-cuda-compile.h"
17 
//------------------------------------------------------------------------------
// Destroy operator
//------------------------------------------------------------------------------
// Releases all backend data attached to the operator: the E-vectors and
// Q-vectors created during setup, the QFunction-assembly active-input vectors,
// and (when present) the diagonal-assembly and full-assembly scratch data,
// including their JIT-compiled CUDA modules and device buffers.
static int CeedOperatorDestroy_Cuda(CeedOperator op) {
  int ierr;
  CeedOperator_Cuda *impl;
  ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr);

  // Apply data
  // Input and output E-vectors share one array of numein + numeout entries;
  // entries may be NULL (skipped restriction) — CeedVectorDestroy handles that
  for (CeedInt i = 0; i < impl->numein + impl->numeout; i++) {
    ierr = CeedVectorDestroy(&impl->evecs[i]); CeedChkBackend(ierr);
  }
  ierr = CeedFree(&impl->evecs); CeedChkBackend(ierr);

  // Input Q-vectors
  for (CeedInt i = 0; i < impl->numein; i++) {
    ierr = CeedVectorDestroy(&impl->qvecsin[i]); CeedChkBackend(ierr);
  }
  ierr = CeedFree(&impl->qvecsin); CeedChkBackend(ierr);

  // Output Q-vectors
  for (CeedInt i = 0; i < impl->numeout; i++) {
    ierr = CeedVectorDestroy(&impl->qvecsout[i]); CeedChkBackend(ierr);
  }
  ierr = CeedFree(&impl->qvecsout); CeedChkBackend(ierr);

  // QFunction assembly data
  for (CeedInt i=0; i<impl->qfnumactivein; i++) {
    ierr = CeedVectorDestroy(&impl->qfactivein[i]); CeedChkBackend(ierr);
  }
  ierr = CeedFree(&impl->qfactivein); CeedChkBackend(ierr);

  // Diag data (allocated lazily; only present after diagonal assembly ran)
  if (impl->diag) {
    Ceed ceed;
    ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
    // Unload the JIT-compiled diagonal-assembly module before freeing buffers
    CeedChk_Cu(ceed, cuModuleUnload(impl->diag->module));
    ierr = CeedFree(&impl->diag->h_emodein); CeedChkBackend(ierr);
    ierr = CeedFree(&impl->diag->h_emodeout); CeedChkBackend(ierr);
    // NOTE(review): on the following lines ierr holds a cudaError_t, not a
    // CeedInt; CeedChk_Cu is the matching checker so this appears intentional
    ierr = cudaFree(impl->diag->d_emodein); CeedChk_Cu(ceed, ierr);
    ierr = cudaFree(impl->diag->d_emodeout); CeedChk_Cu(ceed, ierr);
    ierr = cudaFree(impl->diag->d_identity); CeedChk_Cu(ceed, ierr);
    ierr = cudaFree(impl->diag->d_interpin); CeedChk_Cu(ceed, ierr);
    ierr = cudaFree(impl->diag->d_interpout); CeedChk_Cu(ceed, ierr);
    ierr = cudaFree(impl->diag->d_gradin); CeedChk_Cu(ceed, ierr);
    ierr = cudaFree(impl->diag->d_gradout); CeedChk_Cu(ceed, ierr);
    ierr = CeedElemRestrictionDestroy(&impl->diag->pbdiagrstr);
    CeedChkBackend(ierr);
    ierr = CeedVectorDestroy(&impl->diag->elemdiag); CeedChkBackend(ierr);
    ierr = CeedVectorDestroy(&impl->diag->pbelemdiag); CeedChkBackend(ierr);
  }
  // CeedFree on a never-allocated (NULL) pointer is a no-op
  ierr = CeedFree(&impl->diag); CeedChkBackend(ierr);

  // Full assembly data
  if (impl->asmb) {
    Ceed ceed;
    ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
    CeedChk_Cu(ceed, cuModuleUnload(impl->asmb->module));
    ierr = cudaFree(impl->asmb->d_B_in); CeedChk_Cu(ceed, ierr);
    ierr = cudaFree(impl->asmb->d_B_out); CeedChk_Cu(ceed, ierr);
  }
  ierr = CeedFree(&impl->asmb); CeedChkBackend(ierr);

  ierr = CeedFree(&impl); CeedChkBackend(ierr);
  return CEED_ERROR_SUCCESS;
}
81 
82 //------------------------------------------------------------------------------
83 // Setup infields or outfields
84 //------------------------------------------------------------------------------
85 static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op,
86                                         bool isinput, CeedVector *evecs,
87                                         CeedVector *qvecs, CeedInt starte,
88                                         CeedInt numfields, CeedInt Q,
89                                         CeedInt numelements) {
90   CeedInt dim, ierr, size;
91   CeedSize q_size;
92   Ceed ceed;
93   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
94   CeedBasis basis;
95   CeedElemRestriction Erestrict;
96   CeedOperatorField *opfields;
97   CeedQFunctionField *qffields;
98   CeedVector fieldvec;
99   bool strided;
100   bool skiprestrict;
101 
102   if (isinput) {
103     ierr = CeedOperatorGetFields(op, NULL, &opfields, NULL, NULL);
104     CeedChkBackend(ierr);
105     ierr = CeedQFunctionGetFields(qf, NULL, &qffields, NULL, NULL);
106     CeedChkBackend(ierr);
107   } else {
108     ierr = CeedOperatorGetFields(op, NULL, NULL, NULL, &opfields);
109     CeedChkBackend(ierr);
110     ierr = CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qffields);
111     CeedChkBackend(ierr);
112   }
113 
114   // Loop over fields
115   for (CeedInt i = 0; i < numfields; i++) {
116     CeedEvalMode emode;
117     ierr = CeedQFunctionFieldGetEvalMode(qffields[i], &emode); CeedChkBackend(ierr);
118 
119     strided = false;
120     skiprestrict = false;
121     if (emode != CEED_EVAL_WEIGHT) {
122       ierr = CeedOperatorFieldGetElemRestriction(opfields[i], &Erestrict);
123       CeedChkBackend(ierr);
124 
125       // Check whether this field can skip the element restriction:
126       // must be passive input, with emode NONE, and have a strided restriction with
127       // CEED_STRIDES_BACKEND.
128 
129       // First, check whether the field is input or output:
130       if (isinput) {
131         // Check for passive input:
132         ierr = CeedOperatorFieldGetVector(opfields[i], &fieldvec); CeedChkBackend(ierr);
133         if (fieldvec != CEED_VECTOR_ACTIVE) {
134           // Check emode
135           if (emode == CEED_EVAL_NONE) {
136             // Check for strided restriction
137             ierr = CeedElemRestrictionIsStrided(Erestrict, &strided);
138             CeedChkBackend(ierr);
139             if (strided) {
140               // Check if vector is already in preferred backend ordering
141               ierr = CeedElemRestrictionHasBackendStrides(Erestrict,
142                      &skiprestrict); CeedChkBackend(ierr);
143             }
144           }
145         }
146       }
147       if (skiprestrict) {
148         // We do not need an E-Vector, but will use the input field vector's data
149         // directly in the operator application.
150         evecs[i + starte] = NULL;
151       } else {
152         ierr = CeedElemRestrictionCreateVector(Erestrict, NULL,
153                                                &evecs[i + starte]);
154         CeedChkBackend(ierr);
155       }
156     }
157 
158     switch (emode) {
159     case CEED_EVAL_NONE:
160       ierr = CeedQFunctionFieldGetSize(qffields[i], &size); CeedChkBackend(ierr);
161       q_size = (CeedSize)numelements * Q * size;
162       ierr = CeedVectorCreate(ceed, q_size, &qvecs[i]); CeedChkBackend(ierr);
163       break;
164     case CEED_EVAL_INTERP:
165       ierr = CeedQFunctionFieldGetSize(qffields[i], &size); CeedChkBackend(ierr);
166       q_size = (CeedSize)numelements * Q * size;
167       ierr = CeedVectorCreate(ceed, q_size, &qvecs[i]); CeedChkBackend(ierr);
168       break;
169     case CEED_EVAL_GRAD:
170       ierr = CeedOperatorFieldGetBasis(opfields[i], &basis); CeedChkBackend(ierr);
171       ierr = CeedQFunctionFieldGetSize(qffields[i], &size); CeedChkBackend(ierr);
172       ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
173       q_size = (CeedSize)numelements * Q * size;
174       ierr = CeedVectorCreate(ceed, q_size, &qvecs[i]); CeedChkBackend(ierr);
175       break;
176     case CEED_EVAL_WEIGHT: // Only on input fields
177       ierr = CeedOperatorFieldGetBasis(opfields[i], &basis); CeedChkBackend(ierr);
178       q_size = (CeedSize)numelements * Q;
179       ierr = CeedVectorCreate(ceed, q_size, &qvecs[i]); CeedChkBackend(ierr);
180       ierr = CeedBasisApply(basis, numelements, CEED_NOTRANSPOSE,
181                             CEED_EVAL_WEIGHT, NULL, qvecs[i]); CeedChkBackend(ierr);
182       break;
183     case CEED_EVAL_DIV:
184       break; // TODO: Not implemented
185     case CEED_EVAL_CURL:
186       break; // TODO: Not implemented
187     }
188   }
189   return CEED_ERROR_SUCCESS;
190 }
191 
192 //------------------------------------------------------------------------------
193 // CeedOperator needs to connect all the named fields (be they active or passive)
194 //   to the named inputs and outputs of its CeedQFunction.
195 //------------------------------------------------------------------------------
196 static int CeedOperatorSetup_Cuda(CeedOperator op) {
197   int ierr;
198   bool setupdone;
199   ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChkBackend(ierr);
200   if (setupdone)
201     return CEED_ERROR_SUCCESS;
202   Ceed ceed;
203   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
204   CeedOperator_Cuda *impl;
205   ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr);
206   CeedQFunction qf;
207   ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr);
208   CeedInt Q, numelements, numinputfields, numoutputfields;
209   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr);
210   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr);
211   CeedChkBackend(ierr);
212   CeedOperatorField *opinputfields, *opoutputfields;
213   ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields,
214                                &numoutputfields, &opoutputfields);
215   CeedChkBackend(ierr);
216   CeedQFunctionField *qfinputfields, *qfoutputfields;
217   ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields);
218   CeedChkBackend(ierr);
219 
220   // Allocate
221   ierr = CeedCalloc(numinputfields + numoutputfields, &impl->evecs);
222   CeedChkBackend(ierr);
223 
224   ierr = CeedCalloc(CEED_FIELD_MAX, &impl->qvecsin); CeedChkBackend(ierr);
225   ierr = CeedCalloc(CEED_FIELD_MAX, &impl->qvecsout); CeedChkBackend(ierr);
226 
227   impl->numein = numinputfields; impl->numeout = numoutputfields;
228 
229   // Set up infield and outfield evecs and qvecs
230   // Infields
231   ierr = CeedOperatorSetupFields_Cuda(qf, op, true,
232                                       impl->evecs, impl->qvecsin, 0,
233                                       numinputfields, Q, numelements);
234   CeedChkBackend(ierr);
235 
236   // Outfields
237   ierr = CeedOperatorSetupFields_Cuda(qf, op, false,
238                                       impl->evecs, impl->qvecsout,
239                                       numinputfields, numoutputfields, Q,
240                                       numelements); CeedChkBackend(ierr);
241 
242   ierr = CeedOperatorSetSetupDone(op); CeedChkBackend(ierr);
243   return CEED_ERROR_SUCCESS;
244 }
245 
246 //------------------------------------------------------------------------------
247 // Setup Operator Inputs
248 //------------------------------------------------------------------------------
249 static inline int CeedOperatorSetupInputs_Cuda(CeedInt numinputfields,
250     CeedQFunctionField *qfinputfields, CeedOperatorField *opinputfields,
251     CeedVector invec, const bool skipactive, CeedScalar *edata[2*CEED_FIELD_MAX],
252     CeedOperator_Cuda *impl, CeedRequest *request) {
253   CeedInt ierr;
254   CeedEvalMode emode;
255   CeedVector vec;
256   CeedElemRestriction Erestrict;
257 
258   for (CeedInt i = 0; i < numinputfields; i++) {
259     // Get input vector
260     ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr);
261     if (vec == CEED_VECTOR_ACTIVE) {
262       if (skipactive)
263         continue;
264       else
265         vec = invec;
266     }
267 
268     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
269     CeedChkBackend(ierr);
270     if (emode == CEED_EVAL_WEIGHT) { // Skip
271     } else {
272       // Get input vector
273       ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr);
274       // Get input element restriction
275       ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
276       CeedChkBackend(ierr);
277       if (vec == CEED_VECTOR_ACTIVE)
278         vec = invec;
279       // Restrict, if necessary
280       if (!impl->evecs[i]) {
281         // No restriction for this field; read data directly from vec.
282         ierr = CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE,
283                                       (const CeedScalar **) &edata[i]);
284         CeedChkBackend(ierr);
285       } else {
286         ierr = CeedElemRestrictionApply(Erestrict, CEED_NOTRANSPOSE, vec,
287                                         impl->evecs[i], request); CeedChkBackend(ierr);
288         // Get evec
289         ierr = CeedVectorGetArrayRead(impl->evecs[i], CEED_MEM_DEVICE,
290                                       (const CeedScalar **) &edata[i]);
291         CeedChkBackend(ierr);
292       }
293     }
294   }
295   return CEED_ERROR_SUCCESS;
296 }
297 
298 //------------------------------------------------------------------------------
299 // Input Basis Action
300 //------------------------------------------------------------------------------
301 static inline int CeedOperatorInputBasis_Cuda(CeedInt numelements,
302     CeedQFunctionField *qfinputfields, CeedOperatorField *opinputfields,
303     CeedInt numinputfields, const bool skipactive,
304     CeedScalar *edata[2*CEED_FIELD_MAX],CeedOperator_Cuda *impl) {
305   CeedInt ierr;
306   CeedInt elemsize, size;
307   CeedElemRestriction Erestrict;
308   CeedEvalMode emode;
309   CeedBasis basis;
310 
311   for (CeedInt i=0; i<numinputfields; i++) {
312     // Skip active input
313     if (skipactive) {
314       CeedVector vec;
315       ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr);
316       if (vec == CEED_VECTOR_ACTIVE)
317         continue;
318     }
319     // Get elemsize, emode, size
320     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
321     CeedChkBackend(ierr);
322     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
323     CeedChkBackend(ierr);
324     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
325     CeedChkBackend(ierr);
326     ierr = CeedQFunctionFieldGetSize(qfinputfields[i], &size); CeedChkBackend(ierr);
327     // Basis action
328     switch (emode) {
329     case CEED_EVAL_NONE:
330       ierr = CeedVectorSetArray(impl->qvecsin[i], CEED_MEM_DEVICE,
331                                 CEED_USE_POINTER, edata[i]); CeedChkBackend(ierr);
332       break;
333     case CEED_EVAL_INTERP:
334       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis);
335       CeedChkBackend(ierr);
336       ierr = CeedBasisApply(basis, numelements, CEED_NOTRANSPOSE,
337                             CEED_EVAL_INTERP, impl->evecs[i],
338                             impl->qvecsin[i]); CeedChkBackend(ierr);
339       break;
340     case CEED_EVAL_GRAD:
341       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis);
342       CeedChkBackend(ierr);
343       ierr = CeedBasisApply(basis, numelements, CEED_NOTRANSPOSE,
344                             CEED_EVAL_GRAD, impl->evecs[i],
345                             impl->qvecsin[i]); CeedChkBackend(ierr);
346       break;
347     case CEED_EVAL_WEIGHT:
348       break; // No action
349     case CEED_EVAL_DIV:
350       break; // TODO: Not implemented
351     case CEED_EVAL_CURL:
352       break; // TODO: Not implemented
353     }
354   }
355   return CEED_ERROR_SUCCESS;
356 }
357 
358 //------------------------------------------------------------------------------
359 // Restore Input Vectors
360 //------------------------------------------------------------------------------
361 static inline int CeedOperatorRestoreInputs_Cuda(CeedInt numinputfields,
362     CeedQFunctionField *qfinputfields, CeedOperatorField *opinputfields,
363     const bool skipactive, CeedScalar *edata[2*CEED_FIELD_MAX],
364     CeedOperator_Cuda *impl) {
365   CeedInt ierr;
366   CeedEvalMode emode;
367   CeedVector vec;
368 
369   for (CeedInt i = 0; i < numinputfields; i++) {
370     // Skip active input
371     if (skipactive) {
372       ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr);
373       if (vec == CEED_VECTOR_ACTIVE)
374         continue;
375     }
376     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
377     CeedChkBackend(ierr);
378     if (emode == CEED_EVAL_WEIGHT) { // Skip
379     } else {
380       if (!impl->evecs[i]) {  // This was a skiprestrict case
381         ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr);
382         ierr = CeedVectorRestoreArrayRead(vec,
383                                           (const CeedScalar **)&edata[i]);
384         CeedChkBackend(ierr);
385       } else {
386         ierr = CeedVectorRestoreArrayRead(impl->evecs[i],
387                                           (const CeedScalar **) &edata[i]);
388         CeedChkBackend(ierr);
389       }
390     }
391   }
392   return CEED_ERROR_SUCCESS;
393 }
394 
//------------------------------------------------------------------------------
// Apply and add to output
//------------------------------------------------------------------------------
// Computes outvec += A(invec): restricts inputs, applies input bases, runs the
// QFunction at all quadrature points, applies transposed output bases, and
// restricts (transpose, accumulating) into the output vector.
static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector invec,
                                     CeedVector outvec, CeedRequest *request) {
  int ierr;
  CeedOperator_Cuda *impl;
  ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr);
  CeedQFunction qf;
  ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr);
  CeedInt Q, numelements, elemsize, numinputfields, numoutputfields, size;
  ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr);
  ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr);
  CeedChkBackend(ierr); // NOTE(review): redundant re-check of the same ierr
  CeedOperatorField *opinputfields, *opoutputfields;
  ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields,
                               &numoutputfields, &opoutputfields);
  CeedChkBackend(ierr);
  CeedQFunctionField *qfinputfields, *qfoutputfields;
  ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields);
  CeedChkBackend(ierr);
  CeedEvalMode emode;
  CeedVector vec;
  CeedBasis basis;
  CeedElemRestriction Erestrict;
  // edata[0..numinputfields) holds borrowed input pointers,
  // edata[numinputfields..) holds borrowed output pointers
  CeedScalar *edata[2*CEED_FIELD_MAX] = {0};

  // Setup (no-op after the first call)
  ierr = CeedOperatorSetup_Cuda(op); CeedChkBackend(ierr);

  // Input Evecs and Restriction
  ierr = CeedOperatorSetupInputs_Cuda(numinputfields, qfinputfields,
                                      opinputfields, invec, false, edata,
                                      impl, request); CeedChkBackend(ierr);

  // Input basis apply if needed
  ierr = CeedOperatorInputBasis_Cuda(numelements, qfinputfields, opinputfields,
                                     numinputfields, false, edata, impl);
  CeedChkBackend(ierr);

  // Output pointers, as necessary
  for (CeedInt i = 0; i < numoutputfields; i++) {
    ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
    CeedChkBackend(ierr);
    if (emode == CEED_EVAL_NONE) {
      // Set the output Q-Vector to use the E-Vector data directly, so the
      // QFunction writes straight into the E-vector (restored below)
      ierr = CeedVectorGetArrayWrite(impl->evecs[i + impl->numein], CEED_MEM_DEVICE,
                                     &edata[i + numinputfields]); CeedChkBackend(ierr);
      ierr = CeedVectorSetArray(impl->qvecsout[i], CEED_MEM_DEVICE,
                                CEED_USE_POINTER, edata[i + numinputfields]);
      CeedChkBackend(ierr);
    }
  }

  // Q function: evaluated at all numelements * Q quadrature points at once
  ierr = CeedQFunctionApply(qf, numelements * Q, impl->qvecsin, impl->qvecsout);
  CeedChkBackend(ierr);

  // Output basis apply if needed
  for (CeedInt i = 0; i < numoutputfields; i++) {
    // Get elemsize, emode, size
    // NOTE(review): elemsize and size are fetched but unused below
    ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
    CeedChkBackend(ierr);
    ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
    CeedChkBackend(ierr);
    ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
    CeedChkBackend(ierr);
    ierr = CeedQFunctionFieldGetSize(qfoutputfields[i], &size);
    CeedChkBackend(ierr);
    // Basis action (transpose: quadrature points back to nodes)
    switch (emode) {
    case CEED_EVAL_NONE:
      break; // QFunction already wrote into the E-vector
    case CEED_EVAL_INTERP:
      ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis);
      CeedChkBackend(ierr);
      ierr = CeedBasisApply(basis, numelements, CEED_TRANSPOSE,
                            CEED_EVAL_INTERP, impl->qvecsout[i],
                            impl->evecs[i + impl->numein]); CeedChkBackend(ierr);
      break;
    case CEED_EVAL_GRAD:
      ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis);
      CeedChkBackend(ierr);
      ierr = CeedBasisApply(basis, numelements, CEED_TRANSPOSE,
                            CEED_EVAL_GRAD, impl->qvecsout[i],
                            impl->evecs[i + impl->numein]); CeedChkBackend(ierr);
      break;
    // LCOV_EXCL_START
    case CEED_EVAL_WEIGHT: {
      Ceed ceed;
      ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
      return CeedError(ceed, CEED_ERROR_BACKEND,
                       "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
      break; // Should not occur
    }
    case CEED_EVAL_DIV:
      break; // TODO: Not implemented
    case CEED_EVAL_CURL:
      break; // TODO: Not implemented
      // LCOV_EXCL_STOP
    }
  }

  // Output restriction
  for (CeedInt i = 0; i < numoutputfields; i++) {
    // Restore evec borrowed for direct QFunction output above
    ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
    CeedChkBackend(ierr);
    if (emode == CEED_EVAL_NONE) {
      ierr = CeedVectorRestoreArray(impl->evecs[i+impl->numein],
                                    &edata[i + numinputfields]);
      CeedChkBackend(ierr);
    }
    // Get output vector
    ierr = CeedOperatorFieldGetVector(opoutputfields[i], &vec);
    CeedChkBackend(ierr);
    // Restrict
    ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
    CeedChkBackend(ierr);
    // Active
    if (vec == CEED_VECTOR_ACTIVE)
      vec = outvec;

    // Transpose restriction accumulates (adds) into vec
    ierr = CeedElemRestrictionApply(Erestrict, CEED_TRANSPOSE,
                                    impl->evecs[i + impl->numein], vec,
                                    request); CeedChkBackend(ierr);
  }

  // Restore input arrays
  ierr = CeedOperatorRestoreInputs_Cuda(numinputfields, qfinputfields,
                                        opinputfields, false, edata, impl);
  CeedChkBackend(ierr);
  return CEED_ERROR_SUCCESS;
}
529 
530 //------------------------------------------------------------------------------
531 // Core code for assembling linear QFunction
532 //------------------------------------------------------------------------------
533 static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op,
534     bool build_objects, CeedVector *assembled, CeedElemRestriction *rstr,
535     CeedRequest *request) {
536   int ierr;
537   CeedOperator_Cuda *impl;
538   ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr);
539   CeedQFunction qf;
540   ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr);
541   CeedInt Q, numelements, numinputfields, numoutputfields, size;
542   CeedSize q_size;
543   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr);
544   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr);
545   CeedChkBackend(ierr);
546   CeedOperatorField *opinputfields, *opoutputfields;
547   ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields,
548                                &numoutputfields, &opoutputfields);
549   CeedChkBackend(ierr);
550   CeedQFunctionField *qfinputfields, *qfoutputfields;
551   ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields);
552   CeedChkBackend(ierr);
553   CeedVector vec;
554   CeedInt numactivein = impl->qfnumactivein, numactiveout = impl->qfnumactiveout;
555   CeedVector *activein = impl->qfactivein;
556   CeedScalar *a, *tmp;
557   Ceed ceed, ceedparent;
558   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
559   ierr = CeedGetOperatorFallbackParentCeed(ceed, &ceedparent);
560   CeedChkBackend(ierr);
561   ceedparent = ceedparent ? ceedparent : ceed;
562   CeedScalar *edata[2*CEED_FIELD_MAX];
563 
564   // Setup
565   ierr = CeedOperatorSetup_Cuda(op); CeedChkBackend(ierr);
566 
567   // Check for identity
568   bool identityqf;
569   ierr = CeedQFunctionIsIdentity(qf, &identityqf); CeedChkBackend(ierr);
570   if (identityqf)
571     // LCOV_EXCL_START
572     return CeedError(ceed, CEED_ERROR_BACKEND,
573                      "Assembling identity QFunctions not supported");
574   // LCOV_EXCL_STOP
575 
576   // Input Evecs and Restriction
577   ierr = CeedOperatorSetupInputs_Cuda(numinputfields, qfinputfields,
578                                       opinputfields, NULL, true, edata,
579                                       impl, request); CeedChkBackend(ierr);
580 
581   // Count number of active input fields
582   if (!numactivein) {
583     for (CeedInt i=0; i<numinputfields; i++) {
584       // Get input vector
585       ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr);
586       // Check if active input
587       if (vec == CEED_VECTOR_ACTIVE) {
588         ierr = CeedQFunctionFieldGetSize(qfinputfields[i], &size); CeedChkBackend(ierr);
589         ierr = CeedVectorSetValue(impl->qvecsin[i], 0.0); CeedChkBackend(ierr);
590         ierr = CeedVectorGetArray(impl->qvecsin[i], CEED_MEM_DEVICE, &tmp);
591         CeedChkBackend(ierr);
592         ierr = CeedRealloc(numactivein + size, &activein); CeedChkBackend(ierr);
593         for (CeedInt field = 0; field < size; field++) {
594           q_size = (CeedSize)Q*numelements;
595           ierr = CeedVectorCreate(ceed, q_size, &activein[numactivein+field]);
596           CeedChkBackend(ierr);
597           ierr = CeedVectorSetArray(activein[numactivein+field], CEED_MEM_DEVICE,
598                                     CEED_USE_POINTER, &tmp[field*Q*numelements]);
599           CeedChkBackend(ierr);
600         }
601         numactivein += size;
602         ierr = CeedVectorRestoreArray(impl->qvecsin[i], &tmp); CeedChkBackend(ierr);
603       }
604     }
605     impl->qfnumactivein = numactivein;
606     impl->qfactivein = activein;
607   }
608 
609   // Count number of active output fields
610   if (!numactiveout) {
611     for (CeedInt i=0; i<numoutputfields; i++) {
612       // Get output vector
613       ierr = CeedOperatorFieldGetVector(opoutputfields[i], &vec);
614       CeedChkBackend(ierr);
615       // Check if active output
616       if (vec == CEED_VECTOR_ACTIVE) {
617         ierr = CeedQFunctionFieldGetSize(qfoutputfields[i], &size);
618         CeedChkBackend(ierr);
619         numactiveout += size;
620       }
621     }
622     impl->qfnumactiveout = numactiveout;
623   }
624 
625   // Check sizes
626   if (!numactivein || !numactiveout)
627     // LCOV_EXCL_START
628     return CeedError(ceed, CEED_ERROR_BACKEND,
629                      "Cannot assemble QFunction without active inputs "
630                      "and outputs");
631   // LCOV_EXCL_STOP
632 
633   // Build objects if needed
634   if (build_objects) {
635     // Create output restriction
636     CeedInt strides[3] = {1, numelements*Q, Q}; /* *NOPAD* */
637     ierr = CeedElemRestrictionCreateStrided(ceedparent, numelements, Q,
638                                             numactivein*numactiveout,
639                                             numactivein*numactiveout*numelements*Q,
640                                             strides, rstr); CeedChkBackend(ierr);
641     // Create assembled vector
642     CeedSize l_size = (CeedSize)numelements*Q*numactivein*numactiveout;
643     ierr = CeedVectorCreate(ceedparent, l_size, assembled); CeedChkBackend(ierr);
644   }
645   ierr = CeedVectorSetValue(*assembled, 0.0); CeedChkBackend(ierr);
646   ierr = CeedVectorGetArray(*assembled, CEED_MEM_DEVICE, &a);
647   CeedChkBackend(ierr);
648 
649   // Input basis apply
650   ierr = CeedOperatorInputBasis_Cuda(numelements, qfinputfields, opinputfields,
651                                      numinputfields, true, edata, impl);
652   CeedChkBackend(ierr);
653 
654   // Assemble QFunction
655   for (CeedInt in=0; in<numactivein; in++) {
656     // Set Inputs
657     ierr = CeedVectorSetValue(activein[in], 1.0); CeedChkBackend(ierr);
658     if (numactivein > 1) {
659       ierr = CeedVectorSetValue(activein[(in+numactivein-1)%numactivein],
660                                 0.0); CeedChkBackend(ierr);
661     }
662     // Set Outputs
663     for (CeedInt out=0; out<numoutputfields; out++) {
664       // Get output vector
665       ierr = CeedOperatorFieldGetVector(opoutputfields[out], &vec);
666       CeedChkBackend(ierr);
667       // Check if active output
668       if (vec == CEED_VECTOR_ACTIVE) {
669         CeedVectorSetArray(impl->qvecsout[out], CEED_MEM_DEVICE,
670                            CEED_USE_POINTER, a); CeedChkBackend(ierr);
671         ierr = CeedQFunctionFieldGetSize(qfoutputfields[out], &size);
672         CeedChkBackend(ierr);
673         a += size*Q*numelements; // Advance the pointer by the size of the output
674       }
675     }
676     // Apply QFunction
677     ierr = CeedQFunctionApply(qf, Q*numelements, impl->qvecsin, impl->qvecsout);
678     CeedChkBackend(ierr);
679   }
680 
681   // Un-set output Qvecs to prevent accidental overwrite of Assembled
682   for (CeedInt out=0; out<numoutputfields; out++) {
683     // Get output vector
684     ierr = CeedOperatorFieldGetVector(opoutputfields[out], &vec);
685     CeedChkBackend(ierr);
686     // Check if active output
687     if (vec == CEED_VECTOR_ACTIVE) {
688       ierr = CeedVectorTakeArray(impl->qvecsout[out], CEED_MEM_DEVICE, NULL);
689       CeedChkBackend(ierr);
690     }
691   }
692 
693   // Restore input arrays
694   ierr = CeedOperatorRestoreInputs_Cuda(numinputfields, qfinputfields,
695                                         opinputfields, true, edata, impl);
696   CeedChkBackend(ierr);
697 
698   // Restore output
699   ierr = CeedVectorRestoreArray(*assembled, &a); CeedChkBackend(ierr);
700 
701   return CEED_ERROR_SUCCESS;
702 }
703 
704 //------------------------------------------------------------------------------
705 // Assemble Linear QFunction
706 //------------------------------------------------------------------------------
707 static int CeedOperatorLinearAssembleQFunction_Cuda(CeedOperator op,
708     CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) {
709   return CeedOperatorLinearAssembleQFunctionCore_Cuda(op, true, assembled, rstr,
710          request);
711 }
712 
713 //------------------------------------------------------------------------------
714 // Update Assembled Linear QFunction
715 //------------------------------------------------------------------------------
716 static int CeedOperatorLinearAssembleQFunctionUpdate_Cuda(CeedOperator op,
717     CeedVector assembled, CeedElemRestriction rstr, CeedRequest *request) {
718   return CeedOperatorLinearAssembleQFunctionCore_Cuda(op, false, &assembled,
719          &rstr, request);
720 }
721 
722 //------------------------------------------------------------------------------
723 // Diagonal assembly kernels
724 //------------------------------------------------------------------------------
725 // *INDENT-OFF*
// JiT source for operator diagonal assembly. The QUOTE macro stringizes this
// block (comments are removed by the preprocessor before stringization);
// NUMEMODEIN, NUMEMODEOUT, NNODES, NQPTS, and NCOMP are injected as
// compile-time defines by CeedCompileCuda in the diagonal setup routine.
static const char *diagonalkernels = QUOTE(

// Device-side mirror of the host CeedEvalMode enum, used to interpret the
// emodein/emodeout arrays passed to the kernels
typedef enum {
  /// Perform no evaluation (either because there is no data or it is already at
  /// quadrature points)
  CEED_EVAL_NONE   = 0,
  /// Interpolate from nodes to quadrature points
  CEED_EVAL_INTERP = 1,
  /// Evaluate gradients at quadrature points from input in a nodal basis
  CEED_EVAL_GRAD   = 2,
  /// Evaluate divergence at quadrature points from input in a nodal basis
  CEED_EVAL_DIV    = 4,
  /// Evaluate curl at quadrature points from input in a nodal basis
  CEED_EVAL_CURL   = 8,
  /// Using no input, evaluate quadrature weights on the reference element
  CEED_EVAL_WEIGHT = 16,
} CeedEvalMode;

//------------------------------------------------------------------------------
// Get Basis Emode Pointer
//------------------------------------------------------------------------------
// Selects the basis matrix corresponding to an eval mode: identity for NONE,
// interp for INTERP, grad for GRAD; other modes are rejected earlier, during
// QFunction assembly, and leave *basisptr untouched
extern "C" __device__ void CeedOperatorGetBasisPointer_Cuda(const CeedScalar **basisptr,
    CeedEvalMode emode, const CeedScalar *identity, const CeedScalar *interp,
    const CeedScalar *grad) {
  switch (emode) {
  case CEED_EVAL_NONE:
    *basisptr = identity;
    break;
  case CEED_EVAL_INTERP:
    *basisptr = interp;
    break;
  case CEED_EVAL_GRAD:
    *basisptr = grad;
    break;
  case CEED_EVAL_WEIGHT:
  case CEED_EVAL_DIV:
  case CEED_EVAL_CURL:
    break; // Caught by QF Assembly
  }
}

//------------------------------------------------------------------------------
// Core code for diagonal assembly
//------------------------------------------------------------------------------
// Computes the diagonal (or NCOMP x NCOMP point-block diagonal) of B^T D B per
// element.  Thread layout: blockDim.x == NNODES (one thread per element node),
// blockDim.z elements per block, grid-strided over elements.
// assembledqfarray layout, slowest --> fastest:
//   emode_in, comp_in, emode_out, comp_out, elem, qpt
__device__ void diagonalCore(const CeedInt nelem,
    const CeedScalar maxnorm, const bool pointBlock,
    const CeedScalar *identity,
    const CeedScalar *interpin, const CeedScalar *gradin,
    const CeedScalar *interpout, const CeedScalar *gradout,
    const CeedEvalMode *emodein, const CeedEvalMode *emodeout,
    const CeedScalar *__restrict__ assembledqfarray,
    CeedScalar *__restrict__ elemdiagarray) {
  const int tid = threadIdx.x; // running with P threads, tid is evec node
  // Entries at or below maxnorm*1e-12 are treated as numerical zero and skipped
  const CeedScalar qfvaluebound = maxnorm*1e-12;

  // Compute the diagonal of B^T D B
  // Each element
  for (CeedInt e = blockIdx.x*blockDim.z + threadIdx.z; e < nelem;
       e += gridDim.x*blockDim.z) {
    // dout/din track which dimension slice of the gradient matrix to use;
    // they advance by one each time a GRAD mode is encountered
    CeedInt dout = -1;
    // Each basis eval mode pair
    for (CeedInt eout = 0; eout < NUMEMODEOUT; eout++) {
      const CeedScalar *bt = NULL;
      if (emodeout[eout] == CEED_EVAL_GRAD)
        dout += 1;
      CeedOperatorGetBasisPointer_Cuda(&bt, emodeout[eout], identity, interpout,
                                      &gradout[dout*NQPTS*NNODES]);
      CeedInt din = -1;
      for (CeedInt ein = 0; ein < NUMEMODEIN; ein++) {
        const CeedScalar *b = NULL;
        if (emodein[ein] == CEED_EVAL_GRAD)
          din += 1;
        CeedOperatorGetBasisPointer_Cuda(&b, emodein[ein], identity, interpin,
                                        &gradin[din*NQPTS*NNODES]);
        // Each component
        for (CeedInt compOut = 0; compOut < NCOMP; compOut++) {
          // Each qpoint/node pair
          if (pointBlock) {
            // Point Block Diagonal
            for (CeedInt compIn = 0; compIn < NCOMP; compIn++) {
              CeedScalar evalue = 0.;
              for (CeedInt q = 0; q < NQPTS; q++) {
                const CeedScalar qfvalue =
                  assembledqfarray[((((ein*NCOMP+compIn)*NUMEMODEOUT+eout)*
                                     NCOMP+compOut)*nelem+e)*NQPTS+q];
                // NOTE(review): abs() on CeedScalar relies on CUDA device
                // overloads for floating point -- confirm fabs() is not needed
                if (abs(qfvalue) > qfvaluebound)
                  evalue += bt[q*NNODES+tid] * qfvalue * b[q*NNODES+tid];
              }
              elemdiagarray[((compOut*NCOMP+compIn)*nelem+e)*NNODES+tid] += evalue;
            }
          } else {
            // Diagonal Only (compIn == compOut slice of the point block)
            CeedScalar evalue = 0.;
            for (CeedInt q = 0; q < NQPTS; q++) {
              const CeedScalar qfvalue =
                assembledqfarray[((((ein*NCOMP+compOut)*NUMEMODEOUT+eout)*
                                   NCOMP+compOut)*nelem+e)*NQPTS+q];
              if (abs(qfvalue) > qfvaluebound)
                evalue += bt[q*NNODES+tid] * qfvalue * b[q*NNODES+tid];
            }
            elemdiagarray[(compOut*nelem+e)*NNODES+tid] += evalue;
          }
        }
      }
    }
  }
}

//------------------------------------------------------------------------------
// Linear diagonal
//------------------------------------------------------------------------------
// Entry point for scalar diagonal assembly (pointBlock = false)
extern "C" __global__ void linearDiagonal(const CeedInt nelem,
    const CeedScalar maxnorm, const CeedScalar *identity,
    const CeedScalar *interpin, const CeedScalar *gradin,
    const CeedScalar *interpout, const CeedScalar *gradout,
    const CeedEvalMode *emodein, const CeedEvalMode *emodeout,
    const CeedScalar *__restrict__ assembledqfarray,
    CeedScalar *__restrict__ elemdiagarray) {
  diagonalCore(nelem, maxnorm, false, identity, interpin, gradin, interpout,
               gradout, emodein, emodeout, assembledqfarray, elemdiagarray);
}

//------------------------------------------------------------------------------
// Linear point block diagonal
//------------------------------------------------------------------------------
// Entry point for NCOMP x NCOMP point-block diagonal assembly (pointBlock = true)
extern "C" __global__ void linearPointBlockDiagonal(const CeedInt nelem,
    const CeedScalar maxnorm, const CeedScalar *identity,
    const CeedScalar *interpin, const CeedScalar *gradin,
    const CeedScalar *interpout, const CeedScalar *gradout,
    const CeedEvalMode *emodein, const CeedEvalMode *emodeout,
    const CeedScalar *__restrict__ assembledqfarray,
    CeedScalar *__restrict__ elemdiagarray) {
  diagonalCore(nelem, maxnorm, true, identity, interpin, gradin, interpout,
               gradout, emodein, emodeout, assembledqfarray, elemdiagarray);
}

);
863 // *INDENT-ON*
864 
865 //------------------------------------------------------------------------------
866 // Create point block restriction
867 //------------------------------------------------------------------------------
868 static int CreatePBRestriction(CeedElemRestriction rstr,
869                                CeedElemRestriction *pbRstr) {
870   int ierr;
871   Ceed ceed;
872   ierr = CeedElemRestrictionGetCeed(rstr, &ceed); CeedChkBackend(ierr);
873   const CeedInt *offsets;
874   ierr = CeedElemRestrictionGetOffsets(rstr, CEED_MEM_HOST, &offsets);
875   CeedChkBackend(ierr);
876 
877   // Expand offsets
878   CeedInt nelem, ncomp, elemsize, compstride, max = 1, *pbOffsets;
879   ierr = CeedElemRestrictionGetNumElements(rstr, &nelem); CeedChkBackend(ierr);
880   ierr = CeedElemRestrictionGetNumComponents(rstr, &ncomp); CeedChkBackend(ierr);
881   ierr = CeedElemRestrictionGetElementSize(rstr, &elemsize); CeedChkBackend(ierr);
882   ierr = CeedElemRestrictionGetCompStride(rstr, &compstride);
883   CeedChkBackend(ierr);
884   CeedInt shift = ncomp;
885   if (compstride != 1)
886     shift *= ncomp;
887   ierr = CeedCalloc(nelem*elemsize, &pbOffsets); CeedChkBackend(ierr);
888   for (CeedInt i = 0; i < nelem*elemsize; i++) {
889     pbOffsets[i] = offsets[i]*shift;
890     if (pbOffsets[i] > max)
891       max = pbOffsets[i];
892   }
893 
894   // Create new restriction
895   ierr = CeedElemRestrictionCreate(ceed, nelem, elemsize, ncomp*ncomp, 1,
896                                    max + ncomp*ncomp, CEED_MEM_HOST,
897                                    CEED_OWN_POINTER, pbOffsets, pbRstr);
898   CeedChkBackend(ierr);
899 
900   // Cleanup
901   ierr = CeedElemRestrictionRestoreOffsets(rstr, &offsets); CeedChkBackend(ierr);
902 
903   return CEED_ERROR_SUCCESS;
904 }
905 
906 //------------------------------------------------------------------------------
907 // Assemble diagonal setup
908 //------------------------------------------------------------------------------
909 static inline int CeedOperatorAssembleDiagonalSetup_Cuda(CeedOperator op,
910     const bool pointBlock) {
911   int ierr;
912   Ceed ceed;
913   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
914   CeedQFunction qf;
915   ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr);
916   CeedInt numinputfields, numoutputfields;
917   ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields);
918   CeedChkBackend(ierr);
919 
920   // Determine active input basis
921   CeedOperatorField *opfields;
922   CeedQFunctionField *qffields;
923   ierr = CeedOperatorGetFields(op, NULL, &opfields, NULL, NULL);
924   CeedChkBackend(ierr);
925   ierr = CeedQFunctionGetFields(qf, NULL, &qffields, NULL, NULL);
926   CeedChkBackend(ierr);
927   CeedInt numemodein = 0, ncomp = 0, dim = 1;
928   CeedEvalMode *emodein = NULL;
929   CeedBasis basisin = NULL;
930   CeedElemRestriction rstrin = NULL;
931   for (CeedInt i = 0; i < numinputfields; i++) {
932     CeedVector vec;
933     ierr = CeedOperatorFieldGetVector(opfields[i], &vec); CeedChkBackend(ierr);
934     if (vec == CEED_VECTOR_ACTIVE) {
935       CeedElemRestriction rstr;
936       ierr = CeedOperatorFieldGetBasis(opfields[i], &basisin); CeedChkBackend(ierr);
937       ierr = CeedBasisGetNumComponents(basisin, &ncomp); CeedChkBackend(ierr);
938       ierr = CeedBasisGetDimension(basisin, &dim); CeedChkBackend(ierr);
939       ierr = CeedOperatorFieldGetElemRestriction(opfields[i], &rstr);
940       CeedChkBackend(ierr);
941       if (rstrin && rstrin != rstr)
942         // LCOV_EXCL_START
943         return CeedError(ceed, CEED_ERROR_BACKEND,
944                          "Multi-field non-composite operator diagonal assembly not supported");
945       // LCOV_EXCL_STOP
946       rstrin = rstr;
947       CeedEvalMode emode;
948       ierr = CeedQFunctionFieldGetEvalMode(qffields[i], &emode);
949       CeedChkBackend(ierr);
950       switch (emode) {
951       case CEED_EVAL_NONE:
952       case CEED_EVAL_INTERP:
953         ierr = CeedRealloc(numemodein + 1, &emodein); CeedChkBackend(ierr);
954         emodein[numemodein] = emode;
955         numemodein += 1;
956         break;
957       case CEED_EVAL_GRAD:
958         ierr = CeedRealloc(numemodein + dim, &emodein); CeedChkBackend(ierr);
959         for (CeedInt d = 0; d < dim; d++)
960           emodein[numemodein+d] = emode;
961         numemodein += dim;
962         break;
963       case CEED_EVAL_WEIGHT:
964       case CEED_EVAL_DIV:
965       case CEED_EVAL_CURL:
966         break; // Caught by QF Assembly
967       }
968     }
969   }
970 
971   // Determine active output basis
972   ierr = CeedOperatorGetFields(op, NULL, NULL, NULL, &opfields);
973   CeedChkBackend(ierr);
974   ierr = CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qffields);
975   CeedChkBackend(ierr);
976   CeedInt numemodeout = 0;
977   CeedEvalMode *emodeout = NULL;
978   CeedBasis basisout = NULL;
979   CeedElemRestriction rstrout = NULL;
980   for (CeedInt i = 0; i < numoutputfields; i++) {
981     CeedVector vec;
982     ierr = CeedOperatorFieldGetVector(opfields[i], &vec); CeedChkBackend(ierr);
983     if (vec == CEED_VECTOR_ACTIVE) {
984       CeedElemRestriction rstr;
985       ierr = CeedOperatorFieldGetBasis(opfields[i], &basisout); CeedChkBackend(ierr);
986       ierr = CeedOperatorFieldGetElemRestriction(opfields[i], &rstr);
987       CeedChkBackend(ierr);
988       if (rstrout && rstrout != rstr)
989         // LCOV_EXCL_START
990         return CeedError(ceed, CEED_ERROR_BACKEND,
991                          "Multi-field non-composite operator diagonal assembly not supported");
992       // LCOV_EXCL_STOP
993       rstrout = rstr;
994       CeedEvalMode emode;
995       ierr = CeedQFunctionFieldGetEvalMode(qffields[i], &emode); CeedChkBackend(ierr);
996       switch (emode) {
997       case CEED_EVAL_NONE:
998       case CEED_EVAL_INTERP:
999         ierr = CeedRealloc(numemodeout + 1, &emodeout); CeedChkBackend(ierr);
1000         emodeout[numemodeout] = emode;
1001         numemodeout += 1;
1002         break;
1003       case CEED_EVAL_GRAD:
1004         ierr = CeedRealloc(numemodeout + dim, &emodeout); CeedChkBackend(ierr);
1005         for (CeedInt d = 0; d < dim; d++)
1006           emodeout[numemodeout+d] = emode;
1007         numemodeout += dim;
1008         break;
1009       case CEED_EVAL_WEIGHT:
1010       case CEED_EVAL_DIV:
1011       case CEED_EVAL_CURL:
1012         break; // Caught by QF Assembly
1013       }
1014     }
1015   }
1016 
1017   // Operator data struct
1018   CeedOperator_Cuda *impl;
1019   ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr);
1020   ierr = CeedCalloc(1, &impl->diag); CeedChkBackend(ierr);
1021   CeedOperatorDiag_Cuda *diag = impl->diag;
1022   diag->basisin = basisin;
1023   diag->basisout = basisout;
1024   diag->h_emodein = emodein;
1025   diag->h_emodeout = emodeout;
1026   diag->numemodein = numemodein;
1027   diag->numemodeout = numemodeout;
1028 
1029   // Assemble kernel
1030   CeedInt nnodes, nqpts;
1031   ierr = CeedBasisGetNumNodes(basisin, &nnodes); CeedChkBackend(ierr);
1032   ierr = CeedBasisGetNumQuadraturePoints(basisin, &nqpts); CeedChkBackend(ierr);
1033   diag->nnodes = nnodes;
1034   ierr = CeedCompileCuda(ceed, diagonalkernels, &diag->module, 5,
1035                          "NUMEMODEIN", numemodein,
1036                          "NUMEMODEOUT", numemodeout,
1037                          "NNODES", nnodes,
1038                          "NQPTS", nqpts,
1039                          "NCOMP", ncomp
1040                         ); CeedChk_Cu(ceed, ierr);
1041   ierr = CeedGetKernelCuda(ceed, diag->module, "linearDiagonal",
1042                            &diag->linearDiagonal); CeedChk_Cu(ceed, ierr);
1043   ierr = CeedGetKernelCuda(ceed, diag->module, "linearPointBlockDiagonal",
1044                            &diag->linearPointBlock);
1045   CeedChk_Cu(ceed, ierr);
1046 
1047   // Basis matrices
1048   const CeedInt qBytes = nqpts * sizeof(CeedScalar);
1049   const CeedInt iBytes = qBytes * nnodes;
1050   const CeedInt gBytes = qBytes * nnodes * dim;
1051   const CeedInt eBytes = sizeof(CeedEvalMode);
1052   const CeedScalar *interpin, *interpout, *gradin, *gradout;
1053 
1054   // CEED_EVAL_NONE
1055   CeedScalar *identity = NULL;
1056   bool evalNone = false;
1057   for (CeedInt i=0; i<numemodein; i++)
1058     evalNone = evalNone || (emodein[i] == CEED_EVAL_NONE);
1059   for (CeedInt i=0; i<numemodeout; i++)
1060     evalNone = evalNone || (emodeout[i] == CEED_EVAL_NONE);
1061   if (evalNone) {
1062     ierr = CeedCalloc(nqpts*nnodes, &identity); CeedChkBackend(ierr);
1063     for (CeedInt i=0; i<(nnodes<nqpts?nnodes:nqpts); i++)
1064       identity[i*nnodes+i] = 1.0;
1065     ierr = cudaMalloc((void **)&diag->d_identity, iBytes); CeedChk_Cu(ceed, ierr);
1066     ierr = cudaMemcpy(diag->d_identity, identity, iBytes,
1067                       cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
1068   }
1069 
1070   // CEED_EVAL_INTERP
1071   ierr = CeedBasisGetInterp(basisin, &interpin); CeedChkBackend(ierr);
1072   ierr = cudaMalloc((void **)&diag->d_interpin, iBytes); CeedChk_Cu(ceed, ierr);
1073   ierr = cudaMemcpy(diag->d_interpin, interpin, iBytes,
1074                     cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
1075   ierr = CeedBasisGetInterp(basisout, &interpout); CeedChkBackend(ierr);
1076   ierr = cudaMalloc((void **)&diag->d_interpout, iBytes); CeedChk_Cu(ceed, ierr);
1077   ierr = cudaMemcpy(diag->d_interpout, interpout, iBytes,
1078                     cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
1079 
1080   // CEED_EVAL_GRAD
1081   ierr = CeedBasisGetGrad(basisin, &gradin); CeedChkBackend(ierr);
1082   ierr = cudaMalloc((void **)&diag->d_gradin, gBytes); CeedChk_Cu(ceed, ierr);
1083   ierr = cudaMemcpy(diag->d_gradin, gradin, gBytes,
1084                     cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
1085   ierr = CeedBasisGetGrad(basisout, &gradout); CeedChkBackend(ierr);
1086   ierr = cudaMalloc((void **)&diag->d_gradout, gBytes); CeedChk_Cu(ceed, ierr);
1087   ierr = cudaMemcpy(diag->d_gradout, gradout, gBytes,
1088                     cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
1089 
1090   // Arrays of emodes
1091   ierr = cudaMalloc((void **)&diag->d_emodein, numemodein * eBytes);
1092   CeedChk_Cu(ceed, ierr);
1093   ierr = cudaMemcpy(diag->d_emodein, emodein, numemodein * eBytes,
1094                     cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
1095   ierr = cudaMalloc((void **)&diag->d_emodeout, numemodeout * eBytes);
1096   CeedChk_Cu(ceed, ierr);
1097   ierr = cudaMemcpy(diag->d_emodeout, emodeout, numemodeout * eBytes,
1098                     cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
1099 
1100   // Restriction
1101   diag->diagrstr = rstrout;
1102 
1103   return CEED_ERROR_SUCCESS;
1104 }
1105 
1106 //------------------------------------------------------------------------------
1107 // Assemble diagonal common code
1108 //------------------------------------------------------------------------------
// Shared implementation for single-operator diagonal assembly.  Assembles the
// QFunction at quadrature points, lazily builds the diagonal kernels/cached
// data, launches the (point-block) diagonal kernel, and applies the transpose
// restriction to place element diagonals into `assembled`.
// NOTE(review): the public entry points are named *Add*Diagonal, so the
// transpose restriction apply is expected to accumulate -- confirm.
static inline int CeedOperatorAssembleDiagonalCore_Cuda(CeedOperator op,
    CeedVector assembled, CeedRequest *request, const bool pointBlock) {
  int ierr;
  Ceed ceed;
  ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
  CeedOperator_Cuda *impl;
  ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr);

  // Assemble QFunction; the returned restriction describing its layout is not
  // needed here and is released immediately
  CeedVector assembledqf;
  CeedElemRestriction rstr;
  ierr = CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembledqf,
         &rstr, request); CeedChkBackend(ierr);
  ierr = CeedElemRestrictionDestroy(&rstr); CeedChkBackend(ierr);
  // Max norm is used by the kernel to discard numerically-zero entries
  CeedScalar maxnorm = 0;
  ierr = CeedVectorNorm(assembledqf, CEED_NORM_MAX, &maxnorm);
  CeedChkBackend(ierr);

  // Setup (first call only; result cached on impl->diag)
  if (!impl->diag) {
    ierr = CeedOperatorAssembleDiagonalSetup_Cuda(op, pointBlock);
    CeedChkBackend(ierr);
  }
  CeedOperatorDiag_Cuda *diag = impl->diag;
  assert(diag != NULL);

  // Restriction: build the point-block variant lazily on first use
  if (pointBlock && !diag->pbdiagrstr) {
    CeedElemRestriction pbdiagrstr;
    ierr = CreatePBRestriction(diag->diagrstr, &pbdiagrstr); CeedChkBackend(ierr);
    diag->pbdiagrstr = pbdiagrstr;
  }
  CeedElemRestriction diagrstr = pointBlock ? diag->pbdiagrstr : diag->diagrstr;

  // Create diagonal vector (cached; zeroed on every call before accumulation)
  CeedVector elemdiag = pointBlock ? diag->pbelemdiag : diag->elemdiag;
  if (!elemdiag) {
    ierr = CeedElemRestrictionCreateVector(diagrstr, NULL, &elemdiag);
    CeedChkBackend(ierr);
    if (pointBlock)
      diag->pbelemdiag = elemdiag;
    else
      diag->elemdiag = elemdiag;
  }
  ierr = CeedVectorSetValue(elemdiag, 0.0); CeedChkBackend(ierr);

  // Assemble element operator diagonals: get device pointers for the kernel
  CeedScalar *elemdiagarray;
  const CeedScalar *assembledqfarray;
  ierr = CeedVectorGetArray(elemdiag, CEED_MEM_DEVICE, &elemdiagarray);
  CeedChkBackend(ierr);
  ierr = CeedVectorGetArrayRead(assembledqf, CEED_MEM_DEVICE, &assembledqfarray);
  CeedChkBackend(ierr);
  CeedInt nelem;
  ierr = CeedElemRestrictionGetNumElements(diagrstr, &nelem);
  CeedChkBackend(ierr);

  // Compute the diagonal of B^T D B
  // grid = ceil(nelem / elemsPerBlock); block is nnodes x 1 x elemsPerBlock
  int elemsPerBlock = 1;
  int grid = nelem/elemsPerBlock+((nelem/elemsPerBlock*elemsPerBlock<nelem)?1:0);
  void *args[] = {(void *) &nelem, (void *) &maxnorm, &diag->d_identity,
                  &diag->d_interpin, &diag->d_gradin, &diag->d_interpout,
                  &diag->d_gradout, &diag->d_emodein, &diag->d_emodeout,
                  &assembledqfarray, &elemdiagarray
                 };
  if (pointBlock) {
    ierr = CeedRunKernelDimCuda(ceed, diag->linearPointBlock, grid,
                                diag->nnodes, 1, elemsPerBlock, args);
    CeedChkBackend(ierr);
  } else {
    ierr = CeedRunKernelDimCuda(ceed, diag->linearDiagonal, grid,
                                diag->nnodes, 1, elemsPerBlock, args);
    CeedChkBackend(ierr);
  }

  // Restore arrays
  ierr = CeedVectorRestoreArray(elemdiag, &elemdiagarray); CeedChkBackend(ierr);
  ierr = CeedVectorRestoreArrayRead(assembledqf, &assembledqfarray);
  CeedChkBackend(ierr);

  // Assemble local operator diagonal
  ierr = CeedElemRestrictionApply(diagrstr, CEED_TRANSPOSE, elemdiag,
                                  assembled, request); CeedChkBackend(ierr);

  // Cleanup
  ierr = CeedVectorDestroy(&assembledqf); CeedChkBackend(ierr);

  return CEED_ERROR_SUCCESS;
}
1198 
1199 //------------------------------------------------------------------------------
1200 // Assemble composite diagonal common code
1201 //------------------------------------------------------------------------------
1202 static inline int CeedOperatorLinearAssembleAddDiagonalCompositeCore_Cuda(
1203   CeedOperator op, CeedVector assembled, CeedRequest *request,
1204   const bool pointBlock) {
1205   int ierr;
1206   CeedInt numSub;
1207   CeedOperator *subOperators;
1208   ierr = CeedOperatorGetNumSub(op, &numSub); CeedChkBackend(ierr);
1209   ierr = CeedOperatorGetSubList(op, &subOperators); CeedChkBackend(ierr);
1210   for (CeedInt i = 0; i < numSub; i++) {
1211     ierr = CeedOperatorAssembleDiagonalCore_Cuda(subOperators[i], assembled,
1212            request, pointBlock); CeedChkBackend(ierr);
1213   }
1214   return CEED_ERROR_SUCCESS;
1215 }
1216 
1217 //------------------------------------------------------------------------------
1218 // Assemble Linear Diagonal
1219 //------------------------------------------------------------------------------
1220 static int CeedOperatorLinearAssembleAddDiagonal_Cuda(CeedOperator op,
1221     CeedVector assembled, CeedRequest *request) {
1222   int ierr;
1223   bool isComposite;
1224   ierr = CeedOperatorIsComposite(op, &isComposite); CeedChkBackend(ierr);
1225   if (isComposite) {
1226     return CeedOperatorLinearAssembleAddDiagonalCompositeCore_Cuda(op, assembled,
1227            request, false);
1228   } else {
1229     return CeedOperatorAssembleDiagonalCore_Cuda(op, assembled, request, false);
1230   }
1231 }
1232 
1233 //------------------------------------------------------------------------------
1234 // Assemble Linear Point Block Diagonal
1235 //------------------------------------------------------------------------------
1236 static int CeedOperatorLinearAssembleAddPointBlockDiagonal_Cuda(CeedOperator op,
1237     CeedVector assembled, CeedRequest *request) {
1238   int ierr;
1239   bool isComposite;
1240   ierr = CeedOperatorIsComposite(op, &isComposite); CeedChkBackend(ierr);
1241   if (isComposite) {
1242     return CeedOperatorLinearAssembleAddDiagonalCompositeCore_Cuda(op, assembled,
1243            request, true);
1244   } else {
1245     return CeedOperatorAssembleDiagonalCore_Cuda(op, assembled, request, true);
1246   }
1247 }
1248 
1249 //------------------------------------------------------------------------------
1250 // Matrix assembly kernel for low-order elements (2D thread block)
1251 //------------------------------------------------------------------------------
1252 // *INDENT-OFF*
// JiT source for full matrix assembly, low-order path: a 2D thread block
// covers every (row i, column l) of each element's NNODES x NNODES output
// block.  BLOCK_SIZE, NNODES, NQPTS, NCOMP, NELEM, NUMEMODEIN, NUMEMODEOUT
// are compile-time defines supplied at JiT compilation.
static const char *assemblykernel = QUOTE(
extern "C" __launch_bounds__(BLOCK_SIZE)
           __global__ void linearAssemble(const CeedScalar *B_in, const CeedScalar *B_out,
                   const CeedScalar *__restrict__ qf_array,
                   CeedScalar *__restrict__ values_array) {

  // This kernel assumes B_in and B_out have the same number of quadrature points and
  // basis points.
  // TODO: expand to more general cases
  const int i = threadIdx.x; // The output row index of each B^TDB operation
  const int l = threadIdx.y; // The output column index of each B^TDB operation
                             // such that we have (Bout^T)_ij D_jk Bin_kl = C_il

  // Strides for final output ordering, determined by the reference (interface) implementation of
  // the symbolic assembly, slowest --> fastest: element, comp_in, comp_out, node_row, node_col
  const CeedInt comp_out_stride = NNODES * NNODES;
  const CeedInt comp_in_stride = comp_out_stride * NCOMP;
  const CeedInt e_stride = comp_in_stride * NCOMP;
  // Strides for QF array, slowest --> fastest:  emode_in, comp_in, emode_out, comp_out, elem, qpt
  const CeedInt qe_stride = NQPTS;
  const CeedInt qcomp_out_stride = NELEM * qe_stride;
  const CeedInt qemode_out_stride = qcomp_out_stride * NCOMP;
  const CeedInt qcomp_in_stride = qemode_out_stride * NUMEMODEOUT;
  const CeedInt qemode_in_stride = qcomp_in_stride * NCOMP;

  // Loop over each element (if necessary); grid-strided over blockDim.z
  for (CeedInt e = blockIdx.x*blockDim.z + threadIdx.z; e < NELEM;
         e += gridDim.x*blockDim.z) {
    for (CeedInt comp_in = 0; comp_in < NCOMP; comp_in++) {
      for (CeedInt comp_out = 0; comp_out < NCOMP; comp_out++) {
        // Accumulate C_il over all eval-mode pairs before writing once
        CeedScalar result = 0.0;
        CeedInt qf_index_comp = qcomp_in_stride * comp_in + qcomp_out_stride * comp_out + qe_stride * e;
        for (CeedInt emode_in = 0; emode_in < NUMEMODEIN; emode_in++) {
          CeedInt b_in_index = emode_in * NQPTS * NNODES;
          for (CeedInt emode_out = 0; emode_out < NUMEMODEOUT; emode_out++) {
             CeedInt b_out_index = emode_out * NQPTS * NNODES;
             CeedInt qf_index = qf_index_comp + qemode_out_stride * emode_out + qemode_in_stride * emode_in;
             // Perform the B^T D B operation for this 'chunk' of D (the qf_array)
            for (CeedInt j = 0; j < NQPTS; j++) {
              result += B_out[b_out_index + j * NNODES  + i] * qf_array[qf_index + j] * B_in[b_in_index + j * NNODES + l];
            }

          }// end of emode_out
        } // end of emode_in
        CeedInt val_index = comp_in_stride * comp_in + comp_out_stride * comp_out + e_stride * e + NNODES * i + l;
        values_array[val_index] = result;
      } // end of out component
    } // end of in component
  } // end of element loop
}
);
1304 
1305 //------------------------------------------------------------------------------
1306 // Fallback kernel for larger orders (1D thread block)
1307 //------------------------------------------------------------------------------
// JiT source for full matrix assembly, fallback path for larger elements:
// a 1D thread block covers only the output column l, and each thread loops
// over the row index i serially (avoids exceeding block-size limits when
// NNODES*NNODES is large).  Same defines and output layout as the 2D kernel.
static const char *assemblykernelbigelem = QUOTE(
extern "C" __launch_bounds__(BLOCK_SIZE)
           __global__ void linearAssemble(const CeedScalar *B_in, const CeedScalar *B_out,
                   const CeedScalar *__restrict__ qf_array,
                   CeedScalar *__restrict__ values_array) {

  // This kernel assumes B_in and B_out have the same number of quadrature points and
  // basis points.
  // TODO: expand to more general cases
  const int l = threadIdx.x; // The output column index of each B^TDB operation
                             // such that we have (Bout^T)_ij D_jk Bin_kl = C_il

  // Strides for final output ordering, determined by the reference (interface) implementation of
  // the symbolic assembly, slowest --> fastest: element, comp_in, comp_out, node_row, node_col
  const CeedInt comp_out_stride = NNODES * NNODES;
  const CeedInt comp_in_stride = comp_out_stride * NCOMP;
  const CeedInt e_stride = comp_in_stride * NCOMP;
  // Strides for QF array, slowest --> fastest:  emode_in, comp_in, emode_out, comp_out, elem, qpt
  const CeedInt qe_stride = NQPTS;
  const CeedInt qcomp_out_stride = NELEM * qe_stride;
  const CeedInt qemode_out_stride = qcomp_out_stride * NCOMP;
  const CeedInt qcomp_in_stride = qemode_out_stride * NUMEMODEOUT;
  const CeedInt qemode_in_stride = qcomp_in_stride * NCOMP;

    // Loop over each element (if necessary); grid-strided over blockDim.z
  for (CeedInt e = blockIdx.x*blockDim.z + threadIdx.z; e < NELEM;
         e += gridDim.x*blockDim.z) {
    for (CeedInt comp_in = 0; comp_in < NCOMP; comp_in++) {
      for (CeedInt comp_out = 0; comp_out < NCOMP; comp_out++) {
        // Serial loop over the row index (2D kernel uses threadIdx.y instead)
        for (CeedInt i = 0; i < NNODES; i++) {
          CeedScalar result = 0.0;
          CeedInt qf_index_comp = qcomp_in_stride * comp_in + qcomp_out_stride * comp_out + qe_stride * e;
          for (CeedInt emode_in = 0; emode_in < NUMEMODEIN; emode_in++) {
            CeedInt b_in_index = emode_in * NQPTS * NNODES;
            for (CeedInt emode_out = 0; emode_out < NUMEMODEOUT; emode_out++) {
               CeedInt b_out_index = emode_out * NQPTS * NNODES;
               CeedInt qf_index = qf_index_comp + qemode_out_stride * emode_out + qemode_in_stride * emode_in;
               // Perform the B^T D B operation for this 'chunk' of D (the qf_array)
              for (CeedInt j = 0; j < NQPTS; j++) {
                result += B_out[b_out_index + j * NNODES  + i] * qf_array[qf_index + j] * B_in[b_in_index + j * NNODES + l];
              }

            }// end of emode_out
          } // end of emode_in
          CeedInt val_index = comp_in_stride * comp_in + comp_out_stride * comp_out + e_stride * e + NNODES * i + l;
          values_array[val_index] = result;
        } // end of loop over element node index, i
      } // end of out component
    } // end of in component
  } // end of element loop
}
);
1361 
1362 //------------------------------------------------------------------------------
1363 // Single operator assembly setup
1364 //------------------------------------------------------------------------------
1365 static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op) {
1366   int ierr;
1367   Ceed ceed;
1368   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
1369   CeedOperator_Cuda *impl;
1370   ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr);
1371 
1372   // Get intput and output fields
1373   CeedInt num_input_fields, num_output_fields;
1374   CeedOperatorField *input_fields;
1375   CeedOperatorField *output_fields;
1376   ierr = CeedOperatorGetFields(op, &num_input_fields, &input_fields,
1377                                &num_output_fields, &output_fields); CeedChk(ierr);
1378 
1379   // Determine active input basis eval mode
1380   CeedQFunction qf;
1381   ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr);
1382   CeedQFunctionField *qf_fields;
1383   ierr = CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL); CeedChk(ierr);
1384   // Note that the kernel will treat each dimension of a gradient action separately;
1385   // i.e., when an active input has a CEED_EVAL_GRAD mode, num_emode_in will increment
1386   // by dim.  However, for the purposes of loading the B matrices, it will be treated
1387   // as one mode, and we will load/copy the entire gradient matrix at once, so
1388   // num_B_in_mats_to_load will be incremented by 1.
1389   CeedInt num_emode_in = 0, dim = 1, num_B_in_mats_to_load = 0, size_B_in = 0;
1390   CeedEvalMode *eval_mode_in = NULL; //will be of size num_B_in_mats_load
1391   CeedBasis basis_in = NULL;
1392   CeedInt nqpts, esize;
1393   CeedElemRestriction rstr_in = NULL;
1394   for (CeedInt i=0; i<num_input_fields; i++) {
1395     CeedVector vec;
1396     ierr = CeedOperatorFieldGetVector(input_fields[i], &vec); CeedChk(ierr);
1397     if (vec == CEED_VECTOR_ACTIVE) {
1398       ierr = CeedOperatorFieldGetBasis(input_fields[i], &basis_in);
1399       CeedChk(ierr);
1400       ierr = CeedBasisGetDimension(basis_in, &dim); CeedChk(ierr);
1401       ierr = CeedBasisGetNumQuadraturePoints(basis_in, &nqpts); CeedChk(ierr);
1402       ierr = CeedOperatorFieldGetElemRestriction(input_fields[i], &rstr_in);
1403       ierr = CeedElemRestrictionGetElementSize(rstr_in, &esize); CeedChk(ierr);
1404       CeedChk(ierr);
1405       CeedEvalMode eval_mode;
1406       ierr = CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode);
1407       CeedChk(ierr);
1408       if (eval_mode != CEED_EVAL_NONE) {
1409         ierr = CeedRealloc(num_B_in_mats_to_load + 1, &eval_mode_in); CeedChk(ierr);
1410         eval_mode_in[num_B_in_mats_to_load] = eval_mode;
1411         num_B_in_mats_to_load += 1;
1412         if (eval_mode == CEED_EVAL_GRAD) {
1413           num_emode_in += dim;
1414           size_B_in += dim * esize * nqpts;
1415         } else {
1416           num_emode_in +=1;
1417           size_B_in += esize * nqpts;
1418         }
1419       }
1420     }
1421   }
1422 
1423   // Determine active output basis; basis_out and rstr_out only used if same as input, TODO
1424   ierr = CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields); CeedChk(ierr);
1425   CeedInt num_emode_out = 0, num_B_out_mats_to_load = 0, size_B_out = 0;
1426   CeedEvalMode *eval_mode_out = NULL;
1427   CeedBasis basis_out = NULL;
1428   CeedElemRestriction rstr_out = NULL;
1429   for (CeedInt i=0; i<num_output_fields; i++) {
1430     CeedVector vec;
1431     ierr = CeedOperatorFieldGetVector(output_fields[i], &vec); CeedChk(ierr);
1432     if (vec == CEED_VECTOR_ACTIVE) {
1433       ierr = CeedOperatorFieldGetBasis(output_fields[i], &basis_out);
1434       CeedChk(ierr);
1435       ierr = CeedOperatorFieldGetElemRestriction(output_fields[i], &rstr_out);
1436       CeedChk(ierr);
1437       if (rstr_out && rstr_out != rstr_in)
1438         // LCOV_EXCL_START
1439         return CeedError(ceed, CEED_ERROR_BACKEND,
1440                          "Multi-field non-composite operator assembly not supported");
1441       // LCOV_EXCL_STOP
1442       CeedEvalMode eval_mode;
1443       ierr = CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode);
1444       CeedChk(ierr);
1445       if (eval_mode != CEED_EVAL_NONE) {
1446         ierr = CeedRealloc(num_B_out_mats_to_load + 1, &eval_mode_out); CeedChk(ierr);
1447         eval_mode_out[num_B_out_mats_to_load] = eval_mode;
1448         num_B_out_mats_to_load += 1;
1449         if (eval_mode == CEED_EVAL_GRAD) {
1450           num_emode_out += dim;
1451           size_B_out += dim * esize * nqpts;
1452         } else {
1453           num_emode_out +=1;
1454           size_B_out += esize * nqpts;
1455         }
1456       }
1457     }
1458   }
1459 
1460   if (num_emode_in == 0 || num_emode_out == 0)
1461     // LCOV_EXCL_START
1462     return CeedError(ceed, CEED_ERROR_UNSUPPORTED,
1463                      "Cannot assemble operator without inputs/outputs");
1464   // LCOV_EXCL_STOP
1465 
1466   CeedInt nelem, ncomp;
1467   ierr = CeedElemRestrictionGetNumElements(rstr_in, &nelem); CeedChk(ierr);
1468   ierr = CeedElemRestrictionGetNumComponents(rstr_in, &ncomp); CeedChk(ierr);
1469 
1470   ierr = CeedCalloc(1, &impl->asmb); CeedChkBackend(ierr);
1471   CeedOperatorAssemble_Cuda *asmb = impl->asmb;
1472   asmb->nelem = nelem;
1473 
1474   // Compile kernels
1475   int elemsPerBlock = 1;
1476   asmb->elemsPerBlock = elemsPerBlock;
1477   CeedInt block_size = esize * esize * elemsPerBlock;
1478   Ceed_Cuda *cuda_data;
1479   ierr = CeedGetData(ceed, &cuda_data); CeedChkBackend(ierr);
1480   if (block_size > cuda_data->device_prop.maxThreadsPerBlock) {
1481     // Use fallback kernel with 1D threadblock
1482     block_size = esize * elemsPerBlock;
1483     asmb->block_size_x = esize;
1484     asmb->block_size_y = 1;
1485     ierr = CeedCompileCuda(ceed, assemblykernelbigelem, &asmb->module, 7,
1486                            "NELEM", nelem,
1487                            "NUMEMODEIN", num_emode_in,
1488                            "NUMEMODEOUT", num_emode_out,
1489                            "NQPTS", nqpts,
1490                            "NNODES", esize,
1491                            "BLOCK_SIZE", block_size,
1492                            "NCOMP", ncomp
1493                           ); CeedChk_Cu(ceed, ierr);
1494   } else {  // Use kernel with 2D threadblock
1495     asmb->block_size_x = esize;
1496     asmb->block_size_y = esize;
1497     ierr = CeedCompileCuda(ceed, assemblykernel, &asmb->module, 7,
1498                            "NELEM", nelem,
1499                            "NUMEMODEIN", num_emode_in,
1500                            "NUMEMODEOUT", num_emode_out,
1501                            "NQPTS", nqpts,
1502                            "NNODES", esize,
1503                            "BLOCK_SIZE", block_size,
1504                            "NCOMP", ncomp
1505                           ); CeedChk_Cu(ceed, ierr);
1506   }
1507   ierr = CeedGetKernelCuda(ceed, asmb->module, "linearAssemble",
1508                            &asmb->linearAssemble); CeedChk_Cu(ceed, ierr);
1509 
1510   // Build 'full' B matrices (not 1D arrays used for tensor-product matrices)
1511   const CeedScalar *interp_in, *grad_in;
1512   ierr = CeedBasisGetInterp(basis_in, &interp_in); CeedChkBackend(ierr);
1513   ierr = CeedBasisGetGrad(basis_in, &grad_in); CeedChkBackend(ierr);
1514 
1515   // Load into B_in, in order that they will be used in eval_mode
1516   const CeedInt inBytes = size_B_in * sizeof(CeedScalar);
1517   CeedInt mat_start = 0;
1518   ierr = cudaMalloc((void **) &asmb->d_B_in, inBytes); CeedChk_Cu(ceed, ierr);
1519   for (int i = 0; i < num_B_in_mats_to_load; i++) {
1520     CeedEvalMode eval_mode = eval_mode_in[i];
1521     if (eval_mode == CEED_EVAL_INTERP) {
1522       ierr = cudaMemcpy(&asmb->d_B_in[mat_start], interp_in,
1523                         esize * nqpts * sizeof(CeedScalar),
1524                         cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
1525       mat_start += esize * nqpts;
1526     } else if (eval_mode == CEED_EVAL_GRAD) {
1527       ierr = cudaMemcpy(asmb->d_B_in, grad_in,
1528                         dim * esize * nqpts * sizeof(CeedScalar),
1529                         cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
1530       mat_start += dim * esize * nqpts;
1531     }
1532   }
1533 
1534   const CeedScalar *interp_out, *grad_out;
1535   // Note that this function currently assumes 1 basis, so this should always be true
1536   // for now
1537   if (basis_out == basis_in) {
1538     interp_out = interp_in;
1539     grad_out = grad_in;
1540   } else {
1541     ierr = CeedBasisGetInterp(basis_out, &interp_out); CeedChkBackend(ierr);
1542     ierr = CeedBasisGetGrad(basis_out, &grad_out); CeedChkBackend(ierr);
1543   }
1544 
1545   // Load into B_out, in order that they will be used in eval_mode
1546   const CeedInt outBytes = size_B_out * sizeof(CeedScalar);
1547   mat_start = 0;
1548   ierr = cudaMalloc((void **) &asmb->d_B_out, outBytes); CeedChk_Cu(ceed, ierr);
1549   for (int i = 0; i < num_B_out_mats_to_load; i++) {
1550     CeedEvalMode eval_mode = eval_mode_out[i];
1551     if (eval_mode == CEED_EVAL_INTERP) {
1552       ierr = cudaMemcpy(&asmb->d_B_out[mat_start], interp_out,
1553                         esize * nqpts * sizeof(CeedScalar),
1554                         cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
1555       mat_start += esize * nqpts;
1556     } else if (eval_mode == CEED_EVAL_GRAD) {
1557       ierr = cudaMemcpy(&asmb->d_B_out[mat_start], grad_out,
1558                         dim * esize * nqpts * sizeof(CeedScalar),
1559                         cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
1560       mat_start += dim * esize * nqpts;
1561     }
1562   }
1563   return CEED_ERROR_SUCCESS;
1564 }
1565 
1566 //------------------------------------------------------------------------------
1567 // Single operator assembly
1568 //------------------------------------------------------------------------------
1569 static int CeedSingleOperatorAssemble_Cuda(CeedOperator op, CeedInt offset,
1570     CeedVector values) {
1571 
1572   int ierr;
1573   Ceed ceed;
1574   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
1575   CeedOperator_Cuda *impl;
1576   ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr);
1577 
1578   // Setup
1579   if (!impl->asmb) {
1580     ierr = CeedSingleOperatorAssembleSetup_Cuda(op);
1581     CeedChkBackend(ierr);
1582   }
1583 
1584   // Assemble QFunction
1585   CeedVector assembled_qf;
1586   CeedElemRestriction rstr_q;
1587   ierr = CeedOperatorLinearAssembleQFunctionBuildOrUpdate(
1588            op, &assembled_qf, &rstr_q, CEED_REQUEST_IMMEDIATE); CeedChk(ierr);
1589   ierr = CeedElemRestrictionDestroy(&rstr_q); CeedChkBackend(ierr);
1590   CeedScalar *values_array;
1591   ierr = CeedVectorGetArrayWrite(values, CEED_MEM_DEVICE, &values_array);
1592   CeedChkBackend(ierr);
1593   values_array += offset;
1594   const CeedScalar *qf_array;
1595   ierr = CeedVectorGetArrayRead(assembled_qf, CEED_MEM_DEVICE, &qf_array);
1596   CeedChkBackend(ierr);
1597 
1598   // Compute B^T D B
1599   const CeedInt nelem = impl->asmb->nelem;
1600   const CeedInt elemsPerBlock = impl->asmb->elemsPerBlock;
1601   const CeedInt grid = nelem/elemsPerBlock+((
1602                          nelem/elemsPerBlock*elemsPerBlock<nelem)?1:0);
1603   void *args[] = {&impl->asmb->d_B_in, &impl->asmb->d_B_out,
1604                   &qf_array, &values_array
1605                  };
1606   ierr = CeedRunKernelDimCuda(ceed, impl->asmb->linearAssemble, grid,
1607                               impl->asmb->block_size_x, impl->asmb->block_size_y,
1608                               elemsPerBlock, args);
1609   CeedChkBackend(ierr);
1610 
1611 
1612   // Restore arrays
1613   ierr = CeedVectorRestoreArray(values, &values_array); CeedChkBackend(ierr);
1614   ierr = CeedVectorRestoreArrayRead(assembled_qf, &qf_array);
1615   CeedChkBackend(ierr);
1616 
1617   // Cleanup
1618   ierr = CeedVectorDestroy(&assembled_qf); CeedChkBackend(ierr);
1619 
1620   return CEED_ERROR_SUCCESS;
1621 }
1622 
1623 //------------------------------------------------------------------------------
1624 // Assemble matrix data for COO matrix of assembled operator.
1625 // The sparsity pattern is set by CeedOperatorLinearAssembleSymbolic.
1626 //
1627 // Note that this (and other assembly routines) currently assume only one
1628 // active input restriction/basis per operator (could have multiple basis eval
1629 // modes).
1630 // TODO: allow multiple active input restrictions/basis objects
1631 //------------------------------------------------------------------------------
1632 int CeedOperatorLinearAssemble_Cuda(CeedOperator op, CeedVector values) {
1633 
1634   // As done in the default implementation, loop through suboperators
1635   // for composite operators, or call single operator assembly otherwise
1636   bool is_composite;
1637   CeedInt ierr;
1638   ierr = CeedOperatorIsComposite(op, &is_composite); CeedChk(ierr);
1639 
1640   CeedElemRestriction rstr;
1641   CeedInt num_elem, elem_size, num_comp;
1642 
1643   CeedInt offset = 0;
1644   if (is_composite) {
1645     CeedInt num_suboperators;
1646     ierr = CeedOperatorGetNumSub(op, &num_suboperators); CeedChk(ierr);
1647     CeedOperator *sub_operators;
1648     ierr = CeedOperatorGetSubList(op, &sub_operators); CeedChk(ierr);
1649     for (int k = 0; k < num_suboperators; ++k) {
1650       ierr = CeedSingleOperatorAssemble_Cuda(sub_operators[k], offset, values);
1651       CeedChk(ierr);
1652       ierr = CeedOperatorGetActiveElemRestriction(sub_operators[k], &rstr);
1653       CeedChk(ierr);
1654       ierr = CeedElemRestrictionGetNumElements(rstr, &num_elem); CeedChk(ierr);
1655       ierr = CeedElemRestrictionGetElementSize(rstr, &elem_size); CeedChk(ierr);
1656       ierr = CeedElemRestrictionGetNumComponents(rstr, &num_comp); CeedChk(ierr);
1657       offset += elem_size*num_comp * elem_size*num_comp * num_elem;
1658     }
1659   } else {
1660     ierr = CeedSingleOperatorAssemble_Cuda(op, offset, values); CeedChk(ierr);
1661   }
1662 
1663   return CEED_ERROR_SUCCESS;
1664 }
1665 //------------------------------------------------------------------------------
1666 // Create operator
1667 //------------------------------------------------------------------------------
1668 int CeedOperatorCreate_Cuda(CeedOperator op) {
1669   int ierr;
1670   Ceed ceed;
1671   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
1672   CeedOperator_Cuda *impl;
1673 
1674   ierr = CeedCalloc(1, &impl); CeedChkBackend(ierr);
1675   ierr = CeedOperatorSetData(op, impl); CeedChkBackend(ierr);
1676 
1677   ierr = CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction",
1678                                 CeedOperatorLinearAssembleQFunction_Cuda);
1679   CeedChkBackend(ierr);
1680   ierr = CeedSetBackendFunction(ceed, "Operator", op,
1681                                 "LinearAssembleQFunctionUpdate",
1682                                 CeedOperatorLinearAssembleQFunctionUpdate_Cuda);
1683   CeedChkBackend(ierr);
1684   ierr = CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal",
1685                                 CeedOperatorLinearAssembleAddDiagonal_Cuda);
1686   CeedChkBackend(ierr);
1687   ierr = CeedSetBackendFunction(ceed, "Operator", op,
1688                                 "LinearAssembleAddPointBlockDiagonal",
1689                                 CeedOperatorLinearAssembleAddPointBlockDiagonal_Cuda);
1690   CeedChkBackend(ierr);
1691   ierr = CeedSetBackendFunction(ceed, "Operator", op,
1692                                 "LinearAssemble", CeedOperatorLinearAssemble_Cuda);
1693   CeedChkBackend(ierr);
1694   ierr = CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd",
1695                                 CeedOperatorApplyAdd_Cuda); CeedChkBackend(ierr);
1696   ierr = CeedSetBackendFunction(ceed, "Operator", op, "Destroy",
1697                                 CeedOperatorDestroy_Cuda); CeedChkBackend(ierr);
1698   return CEED_ERROR_SUCCESS;
1699 }
1700 
1701 //------------------------------------------------------------------------------
1702 // Composite Operator Create
1703 //------------------------------------------------------------------------------
1704 int CeedCompositeOperatorCreate_Cuda(CeedOperator op) {
1705   int ierr;
1706   Ceed ceed;
1707   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
1708 
1709   ierr = CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal",
1710                                 CeedOperatorLinearAssembleAddDiagonal_Cuda);
1711   CeedChkBackend(ierr);
1712   ierr = CeedSetBackendFunction(ceed, "Operator", op,
1713                                 "LinearAssembleAddPointBlockDiagonal",
1714                                 CeedOperatorLinearAssembleAddPointBlockDiagonal_Cuda);
1715   CeedChkBackend(ierr);
1716   ierr = CeedSetBackendFunction(ceed, "Operator", op,
1717                                 "LinearAssemble", CeedOperatorLinearAssemble_Cuda);
1718   CeedChkBackend(ierr);
1719   return CEED_ERROR_SUCCESS;
1720 }
1721 //------------------------------------------------------------------------------
1722