xref: /libCEED/backends/cuda-ref/ceed-cuda-ref-operator.c (revision 9a3a46e1d4300a3ad2fa9b1d35445254cd604d16)
1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #include <ceed/ceed.h>
9 #include <ceed/backend.h>
10 #include <ceed/jit-tools.h>
11 #include <assert.h>
12 #include <cuda.h>
13 #include <cuda_runtime.h>
14 #include <stdbool.h>
15 #include <string.h>
16 #include "ceed-cuda-ref.h"
17 #include "../cuda/ceed-cuda-compile.h"
18 
19 //------------------------------------------------------------------------------
20 // Destroy operator
21 //------------------------------------------------------------------------------
22 static int CeedOperatorDestroy_Cuda(CeedOperator op) {
23   int ierr;
24   CeedOperator_Cuda *impl;
25   ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr);
26 
27   // Apply data
28   for (CeedInt i = 0; i < impl->numein + impl->numeout; i++) {
29     ierr = CeedVectorDestroy(&impl->evecs[i]); CeedChkBackend(ierr);
30   }
31   ierr = CeedFree(&impl->evecs); CeedChkBackend(ierr);
32 
33   for (CeedInt i = 0; i < impl->numein; i++) {
34     ierr = CeedVectorDestroy(&impl->qvecsin[i]); CeedChkBackend(ierr);
35   }
36   ierr = CeedFree(&impl->qvecsin); CeedChkBackend(ierr);
37 
38   for (CeedInt i = 0; i < impl->numeout; i++) {
39     ierr = CeedVectorDestroy(&impl->qvecsout[i]); CeedChkBackend(ierr);
40   }
41   ierr = CeedFree(&impl->qvecsout); CeedChkBackend(ierr);
42 
43   // QFunction assembly data
44   for (CeedInt i=0; i<impl->qfnumactivein; i++) {
45     ierr = CeedVectorDestroy(&impl->qfactivein[i]); CeedChkBackend(ierr);
46   }
47   ierr = CeedFree(&impl->qfactivein); CeedChkBackend(ierr);
48 
49   // Diag data
50   if (impl->diag) {
51     Ceed ceed;
52     ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
53     CeedChk_Cu(ceed, cuModuleUnload(impl->diag->module));
54     ierr = CeedFree(&impl->diag->h_emodein); CeedChkBackend(ierr);
55     ierr = CeedFree(&impl->diag->h_emodeout); CeedChkBackend(ierr);
56     ierr = cudaFree(impl->diag->d_emodein); CeedChk_Cu(ceed, ierr);
57     ierr = cudaFree(impl->diag->d_emodeout); CeedChk_Cu(ceed, ierr);
58     ierr = cudaFree(impl->diag->d_identity); CeedChk_Cu(ceed, ierr);
59     ierr = cudaFree(impl->diag->d_interpin); CeedChk_Cu(ceed, ierr);
60     ierr = cudaFree(impl->diag->d_interpout); CeedChk_Cu(ceed, ierr);
61     ierr = cudaFree(impl->diag->d_gradin); CeedChk_Cu(ceed, ierr);
62     ierr = cudaFree(impl->diag->d_gradout); CeedChk_Cu(ceed, ierr);
63     ierr = CeedElemRestrictionDestroy(&impl->diag->pbdiagrstr);
64     CeedChkBackend(ierr);
65     ierr = CeedVectorDestroy(&impl->diag->elemdiag); CeedChkBackend(ierr);
66     ierr = CeedVectorDestroy(&impl->diag->pbelemdiag); CeedChkBackend(ierr);
67   }
68   ierr = CeedFree(&impl->diag); CeedChkBackend(ierr);
69 
70   if (impl->asmb) {
71     Ceed ceed;
72     ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
73     CeedChk_Cu(ceed, cuModuleUnload(impl->asmb->module));
74     ierr = cudaFree(impl->asmb->d_B_in); CeedChk_Cu(ceed, ierr);
75     ierr = cudaFree(impl->asmb->d_B_out); CeedChk_Cu(ceed, ierr);
76   }
77   ierr = CeedFree(&impl->asmb); CeedChkBackend(ierr);
78 
79   ierr = CeedFree(&impl); CeedChkBackend(ierr);
80   return CEED_ERROR_SUCCESS;
81 }
82 
83 //------------------------------------------------------------------------------
84 // Setup infields or outfields
85 //------------------------------------------------------------------------------
86 static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op,
87                                         bool isinput, CeedVector *evecs,
88                                         CeedVector *qvecs, CeedInt starte,
89                                         CeedInt numfields, CeedInt Q,
90                                         CeedInt numelements) {
91   CeedInt dim, ierr, size;
92   CeedSize q_size;
93   Ceed ceed;
94   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
95   CeedBasis basis;
96   CeedElemRestriction Erestrict;
97   CeedOperatorField *opfields;
98   CeedQFunctionField *qffields;
99   CeedVector fieldvec;
100   bool strided;
101   bool skiprestrict;
102 
103   if (isinput) {
104     ierr = CeedOperatorGetFields(op, NULL, &opfields, NULL, NULL);
105     CeedChkBackend(ierr);
106     ierr = CeedQFunctionGetFields(qf, NULL, &qffields, NULL, NULL);
107     CeedChkBackend(ierr);
108   } else {
109     ierr = CeedOperatorGetFields(op, NULL, NULL, NULL, &opfields);
110     CeedChkBackend(ierr);
111     ierr = CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qffields);
112     CeedChkBackend(ierr);
113   }
114 
115   // Loop over fields
116   for (CeedInt i = 0; i < numfields; i++) {
117     CeedEvalMode emode;
118     ierr = CeedQFunctionFieldGetEvalMode(qffields[i], &emode); CeedChkBackend(ierr);
119 
120     strided = false;
121     skiprestrict = false;
122     if (emode != CEED_EVAL_WEIGHT) {
123       ierr = CeedOperatorFieldGetElemRestriction(opfields[i], &Erestrict);
124       CeedChkBackend(ierr);
125 
126       // Check whether this field can skip the element restriction:
127       // must be passive input, with emode NONE, and have a strided restriction with
128       // CEED_STRIDES_BACKEND.
129 
130       // First, check whether the field is input or output:
131       if (isinput) {
132         // Check for passive input:
133         ierr = CeedOperatorFieldGetVector(opfields[i], &fieldvec); CeedChkBackend(ierr);
134         if (fieldvec != CEED_VECTOR_ACTIVE) {
135           // Check emode
136           if (emode == CEED_EVAL_NONE) {
137             // Check for strided restriction
138             ierr = CeedElemRestrictionIsStrided(Erestrict, &strided);
139             CeedChkBackend(ierr);
140             if (strided) {
141               // Check if vector is already in preferred backend ordering
142               ierr = CeedElemRestrictionHasBackendStrides(Erestrict,
143                      &skiprestrict); CeedChkBackend(ierr);
144             }
145           }
146         }
147       }
148       if (skiprestrict) {
149         // We do not need an E-Vector, but will use the input field vector's data
150         // directly in the operator application.
151         evecs[i + starte] = NULL;
152       } else {
153         ierr = CeedElemRestrictionCreateVector(Erestrict, NULL,
154                                                &evecs[i + starte]);
155         CeedChkBackend(ierr);
156       }
157     }
158 
159     switch (emode) {
160     case CEED_EVAL_NONE:
161       ierr = CeedQFunctionFieldGetSize(qffields[i], &size); CeedChkBackend(ierr);
162       q_size = (CeedSize)numelements * Q * size;
163       ierr = CeedVectorCreate(ceed, q_size, &qvecs[i]); CeedChkBackend(ierr);
164       break;
165     case CEED_EVAL_INTERP:
166       ierr = CeedQFunctionFieldGetSize(qffields[i], &size); CeedChkBackend(ierr);
167       q_size = (CeedSize)numelements * Q * size;
168       ierr = CeedVectorCreate(ceed, q_size, &qvecs[i]); CeedChkBackend(ierr);
169       break;
170     case CEED_EVAL_GRAD:
171       ierr = CeedOperatorFieldGetBasis(opfields[i], &basis); CeedChkBackend(ierr);
172       ierr = CeedQFunctionFieldGetSize(qffields[i], &size); CeedChkBackend(ierr);
173       ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
174       q_size = (CeedSize)numelements * Q * size;
175       ierr = CeedVectorCreate(ceed, q_size, &qvecs[i]); CeedChkBackend(ierr);
176       break;
177     case CEED_EVAL_WEIGHT: // Only on input fields
178       ierr = CeedOperatorFieldGetBasis(opfields[i], &basis); CeedChkBackend(ierr);
179       q_size = (CeedSize)numelements * Q;
180       ierr = CeedVectorCreate(ceed, q_size, &qvecs[i]); CeedChkBackend(ierr);
181       ierr = CeedBasisApply(basis, numelements, CEED_NOTRANSPOSE,
182                             CEED_EVAL_WEIGHT, NULL, qvecs[i]); CeedChkBackend(ierr);
183       break;
184     case CEED_EVAL_DIV:
185       break; // TODO: Not implemented
186     case CEED_EVAL_CURL:
187       break; // TODO: Not implemented
188     }
189   }
190   return CEED_ERROR_SUCCESS;
191 }
192 
193 //------------------------------------------------------------------------------
194 // CeedOperator needs to connect all the named fields (be they active or passive)
195 //   to the named inputs and outputs of its CeedQFunction.
196 //------------------------------------------------------------------------------
197 static int CeedOperatorSetup_Cuda(CeedOperator op) {
198   int ierr;
199   bool setupdone;
200   ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChkBackend(ierr);
201   if (setupdone)
202     return CEED_ERROR_SUCCESS;
203   Ceed ceed;
204   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
205   CeedOperator_Cuda *impl;
206   ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr);
207   CeedQFunction qf;
208   ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr);
209   CeedInt Q, numelements, numinputfields, numoutputfields;
210   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr);
211   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr);
212   CeedChkBackend(ierr);
213   CeedOperatorField *opinputfields, *opoutputfields;
214   ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields,
215                                &numoutputfields, &opoutputfields);
216   CeedChkBackend(ierr);
217   CeedQFunctionField *qfinputfields, *qfoutputfields;
218   ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields);
219   CeedChkBackend(ierr);
220 
221   // Allocate
222   ierr = CeedCalloc(numinputfields + numoutputfields, &impl->evecs);
223   CeedChkBackend(ierr);
224 
225   ierr = CeedCalloc(CEED_FIELD_MAX, &impl->qvecsin); CeedChkBackend(ierr);
226   ierr = CeedCalloc(CEED_FIELD_MAX, &impl->qvecsout); CeedChkBackend(ierr);
227 
228   impl->numein = numinputfields; impl->numeout = numoutputfields;
229 
230   // Set up infield and outfield evecs and qvecs
231   // Infields
232   ierr = CeedOperatorSetupFields_Cuda(qf, op, true,
233                                       impl->evecs, impl->qvecsin, 0,
234                                       numinputfields, Q, numelements);
235   CeedChkBackend(ierr);
236 
237   // Outfields
238   ierr = CeedOperatorSetupFields_Cuda(qf, op, false,
239                                       impl->evecs, impl->qvecsout,
240                                       numinputfields, numoutputfields, Q,
241                                       numelements); CeedChkBackend(ierr);
242 
243   ierr = CeedOperatorSetSetupDone(op); CeedChkBackend(ierr);
244   return CEED_ERROR_SUCCESS;
245 }
246 
247 //------------------------------------------------------------------------------
248 // Setup Operator Inputs
249 //------------------------------------------------------------------------------
250 static inline int CeedOperatorSetupInputs_Cuda(CeedInt numinputfields,
251     CeedQFunctionField *qfinputfields, CeedOperatorField *opinputfields,
252     CeedVector invec, const bool skipactive, CeedScalar *edata[2*CEED_FIELD_MAX],
253     CeedOperator_Cuda *impl, CeedRequest *request) {
254   CeedInt ierr;
255   CeedEvalMode emode;
256   CeedVector vec;
257   CeedElemRestriction Erestrict;
258 
259   for (CeedInt i = 0; i < numinputfields; i++) {
260     // Get input vector
261     ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr);
262     if (vec == CEED_VECTOR_ACTIVE) {
263       if (skipactive)
264         continue;
265       else
266         vec = invec;
267     }
268 
269     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
270     CeedChkBackend(ierr);
271     if (emode == CEED_EVAL_WEIGHT) { // Skip
272     } else {
273       // Get input vector
274       ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr);
275       // Get input element restriction
276       ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
277       CeedChkBackend(ierr);
278       if (vec == CEED_VECTOR_ACTIVE)
279         vec = invec;
280       // Restrict, if necessary
281       if (!impl->evecs[i]) {
282         // No restriction for this field; read data directly from vec.
283         ierr = CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE,
284                                       (const CeedScalar **) &edata[i]);
285         CeedChkBackend(ierr);
286       } else {
287         ierr = CeedElemRestrictionApply(Erestrict, CEED_NOTRANSPOSE, vec,
288                                         impl->evecs[i], request); CeedChkBackend(ierr);
289         // Get evec
290         ierr = CeedVectorGetArrayRead(impl->evecs[i], CEED_MEM_DEVICE,
291                                       (const CeedScalar **) &edata[i]);
292         CeedChkBackend(ierr);
293       }
294     }
295   }
296   return CEED_ERROR_SUCCESS;
297 }
298 
299 //------------------------------------------------------------------------------
300 // Input Basis Action
301 //------------------------------------------------------------------------------
302 static inline int CeedOperatorInputBasis_Cuda(CeedInt numelements,
303     CeedQFunctionField *qfinputfields, CeedOperatorField *opinputfields,
304     CeedInt numinputfields, const bool skipactive,
305     CeedScalar *edata[2*CEED_FIELD_MAX],CeedOperator_Cuda *impl) {
306   CeedInt ierr;
307   CeedInt elemsize, size;
308   CeedElemRestriction Erestrict;
309   CeedEvalMode emode;
310   CeedBasis basis;
311 
312   for (CeedInt i=0; i<numinputfields; i++) {
313     // Skip active input
314     if (skipactive) {
315       CeedVector vec;
316       ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr);
317       if (vec == CEED_VECTOR_ACTIVE)
318         continue;
319     }
320     // Get elemsize, emode, size
321     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
322     CeedChkBackend(ierr);
323     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
324     CeedChkBackend(ierr);
325     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
326     CeedChkBackend(ierr);
327     ierr = CeedQFunctionFieldGetSize(qfinputfields[i], &size); CeedChkBackend(ierr);
328     // Basis action
329     switch (emode) {
330     case CEED_EVAL_NONE:
331       ierr = CeedVectorSetArray(impl->qvecsin[i], CEED_MEM_DEVICE,
332                                 CEED_USE_POINTER, edata[i]); CeedChkBackend(ierr);
333       break;
334     case CEED_EVAL_INTERP:
335       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis);
336       CeedChkBackend(ierr);
337       ierr = CeedBasisApply(basis, numelements, CEED_NOTRANSPOSE,
338                             CEED_EVAL_INTERP, impl->evecs[i],
339                             impl->qvecsin[i]); CeedChkBackend(ierr);
340       break;
341     case CEED_EVAL_GRAD:
342       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis);
343       CeedChkBackend(ierr);
344       ierr = CeedBasisApply(basis, numelements, CEED_NOTRANSPOSE,
345                             CEED_EVAL_GRAD, impl->evecs[i],
346                             impl->qvecsin[i]); CeedChkBackend(ierr);
347       break;
348     case CEED_EVAL_WEIGHT:
349       break; // No action
350     case CEED_EVAL_DIV:
351       break; // TODO: Not implemented
352     case CEED_EVAL_CURL:
353       break; // TODO: Not implemented
354     }
355   }
356   return CEED_ERROR_SUCCESS;
357 }
358 
359 //------------------------------------------------------------------------------
360 // Restore Input Vectors
361 //------------------------------------------------------------------------------
362 static inline int CeedOperatorRestoreInputs_Cuda(CeedInt numinputfields,
363     CeedQFunctionField *qfinputfields, CeedOperatorField *opinputfields,
364     const bool skipactive, CeedScalar *edata[2*CEED_FIELD_MAX],
365     CeedOperator_Cuda *impl) {
366   CeedInt ierr;
367   CeedEvalMode emode;
368   CeedVector vec;
369 
370   for (CeedInt i = 0; i < numinputfields; i++) {
371     // Skip active input
372     if (skipactive) {
373       ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr);
374       if (vec == CEED_VECTOR_ACTIVE)
375         continue;
376     }
377     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
378     CeedChkBackend(ierr);
379     if (emode == CEED_EVAL_WEIGHT) { // Skip
380     } else {
381       if (!impl->evecs[i]) {  // This was a skiprestrict case
382         ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr);
383         ierr = CeedVectorRestoreArrayRead(vec,
384                                           (const CeedScalar **)&edata[i]);
385         CeedChkBackend(ierr);
386       } else {
387         ierr = CeedVectorRestoreArrayRead(impl->evecs[i],
388                                           (const CeedScalar **) &edata[i]);
389         CeedChkBackend(ierr);
390       }
391     }
392   }
393   return CEED_ERROR_SUCCESS;
394 }
395 
396 //------------------------------------------------------------------------------
397 // Apply and add to output
398 //------------------------------------------------------------------------------
399 static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector invec,
400                                      CeedVector outvec, CeedRequest *request) {
401   int ierr;
402   CeedOperator_Cuda *impl;
403   ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr);
404   CeedQFunction qf;
405   ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr);
406   CeedInt Q, numelements, elemsize, numinputfields, numoutputfields, size;
407   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr);
408   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr);
409   CeedChkBackend(ierr);
410   CeedOperatorField *opinputfields, *opoutputfields;
411   ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields,
412                                &numoutputfields, &opoutputfields);
413   CeedChkBackend(ierr);
414   CeedQFunctionField *qfinputfields, *qfoutputfields;
415   ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields);
416   CeedChkBackend(ierr);
417   CeedEvalMode emode;
418   CeedVector vec;
419   CeedBasis basis;
420   CeedElemRestriction Erestrict;
421   CeedScalar *edata[2*CEED_FIELD_MAX] = {0};
422 
423   // Setup
424   ierr = CeedOperatorSetup_Cuda(op); CeedChkBackend(ierr);
425 
426   // Input Evecs and Restriction
427   ierr = CeedOperatorSetupInputs_Cuda(numinputfields, qfinputfields,
428                                       opinputfields, invec, false, edata,
429                                       impl, request); CeedChkBackend(ierr);
430 
431   // Input basis apply if needed
432   ierr = CeedOperatorInputBasis_Cuda(numelements, qfinputfields, opinputfields,
433                                      numinputfields, false, edata, impl);
434   CeedChkBackend(ierr);
435 
436   // Output pointers, as necessary
437   for (CeedInt i = 0; i < numoutputfields; i++) {
438     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
439     CeedChkBackend(ierr);
440     if (emode == CEED_EVAL_NONE) {
441       // Set the output Q-Vector to use the E-Vector data directly.
442       ierr = CeedVectorGetArrayWrite(impl->evecs[i + impl->numein], CEED_MEM_DEVICE,
443                                      &edata[i + numinputfields]); CeedChkBackend(ierr);
444       ierr = CeedVectorSetArray(impl->qvecsout[i], CEED_MEM_DEVICE,
445                                 CEED_USE_POINTER, edata[i + numinputfields]);
446       CeedChkBackend(ierr);
447     }
448   }
449 
450   // Q function
451   ierr = CeedQFunctionApply(qf, numelements * Q, impl->qvecsin, impl->qvecsout);
452   CeedChkBackend(ierr);
453 
454   // Output basis apply if needed
455   for (CeedInt i = 0; i < numoutputfields; i++) {
456     // Get elemsize, emode, size
457     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
458     CeedChkBackend(ierr);
459     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
460     CeedChkBackend(ierr);
461     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
462     CeedChkBackend(ierr);
463     ierr = CeedQFunctionFieldGetSize(qfoutputfields[i], &size);
464     CeedChkBackend(ierr);
465     // Basis action
466     switch (emode) {
467     case CEED_EVAL_NONE:
468       break;
469     case CEED_EVAL_INTERP:
470       ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis);
471       CeedChkBackend(ierr);
472       ierr = CeedBasisApply(basis, numelements, CEED_TRANSPOSE,
473                             CEED_EVAL_INTERP, impl->qvecsout[i],
474                             impl->evecs[i + impl->numein]); CeedChkBackend(ierr);
475       break;
476     case CEED_EVAL_GRAD:
477       ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis);
478       CeedChkBackend(ierr);
479       ierr = CeedBasisApply(basis, numelements, CEED_TRANSPOSE,
480                             CEED_EVAL_GRAD, impl->qvecsout[i],
481                             impl->evecs[i + impl->numein]); CeedChkBackend(ierr);
482       break;
483     // LCOV_EXCL_START
484     case CEED_EVAL_WEIGHT: {
485       Ceed ceed;
486       ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
487       return CeedError(ceed, CEED_ERROR_BACKEND,
488                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
489       break; // Should not occur
490     }
491     case CEED_EVAL_DIV:
492       break; // TODO: Not implemented
493     case CEED_EVAL_CURL:
494       break; // TODO: Not implemented
495       // LCOV_EXCL_STOP
496     }
497   }
498 
499   // Output restriction
500   for (CeedInt i = 0; i < numoutputfields; i++) {
501     // Restore evec
502     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
503     CeedChkBackend(ierr);
504     if (emode == CEED_EVAL_NONE) {
505       ierr = CeedVectorRestoreArray(impl->evecs[i+impl->numein],
506                                     &edata[i + numinputfields]);
507       CeedChkBackend(ierr);
508     }
509     // Get output vector
510     ierr = CeedOperatorFieldGetVector(opoutputfields[i], &vec);
511     CeedChkBackend(ierr);
512     // Restrict
513     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
514     CeedChkBackend(ierr);
515     // Active
516     if (vec == CEED_VECTOR_ACTIVE)
517       vec = outvec;
518 
519     ierr = CeedElemRestrictionApply(Erestrict, CEED_TRANSPOSE,
520                                     impl->evecs[i + impl->numein], vec,
521                                     request); CeedChkBackend(ierr);
522   }
523 
524   // Restore input arrays
525   ierr = CeedOperatorRestoreInputs_Cuda(numinputfields, qfinputfields,
526                                         opinputfields, false, edata, impl);
527   CeedChkBackend(ierr);
528   return CEED_ERROR_SUCCESS;
529 }
530 
531 //------------------------------------------------------------------------------
532 // Core code for assembling linear QFunction
533 //------------------------------------------------------------------------------
534 static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op,
535     bool build_objects, CeedVector *assembled, CeedElemRestriction *rstr,
536     CeedRequest *request) {
537   int ierr;
538   CeedOperator_Cuda *impl;
539   ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr);
540   CeedQFunction qf;
541   ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr);
542   CeedInt Q, numelements, numinputfields, numoutputfields, size;
543   CeedSize q_size;
544   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr);
545   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr);
546   CeedChkBackend(ierr);
547   CeedOperatorField *opinputfields, *opoutputfields;
548   ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields,
549                                &numoutputfields, &opoutputfields);
550   CeedChkBackend(ierr);
551   CeedQFunctionField *qfinputfields, *qfoutputfields;
552   ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields);
553   CeedChkBackend(ierr);
554   CeedVector vec;
555   CeedInt numactivein = impl->qfnumactivein, numactiveout = impl->qfnumactiveout;
556   CeedVector *activein = impl->qfactivein;
557   CeedScalar *a, *tmp;
558   Ceed ceed, ceedparent;
559   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
560   ierr = CeedGetOperatorFallbackParentCeed(ceed, &ceedparent);
561   CeedChkBackend(ierr);
562   ceedparent = ceedparent ? ceedparent : ceed;
563   CeedScalar *edata[2*CEED_FIELD_MAX];
564 
565   // Setup
566   ierr = CeedOperatorSetup_Cuda(op); CeedChkBackend(ierr);
567 
568   // Check for identity
569   bool identityqf;
570   ierr = CeedQFunctionIsIdentity(qf, &identityqf); CeedChkBackend(ierr);
571   if (identityqf)
572     // LCOV_EXCL_START
573     return CeedError(ceed, CEED_ERROR_BACKEND,
574                      "Assembling identity QFunctions not supported");
575   // LCOV_EXCL_STOP
576 
577   // Input Evecs and Restriction
578   ierr = CeedOperatorSetupInputs_Cuda(numinputfields, qfinputfields,
579                                       opinputfields, NULL, true, edata,
580                                       impl, request); CeedChkBackend(ierr);
581 
582   // Count number of active input fields
583   if (!numactivein) {
584     for (CeedInt i=0; i<numinputfields; i++) {
585       // Get input vector
586       ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr);
587       // Check if active input
588       if (vec == CEED_VECTOR_ACTIVE) {
589         ierr = CeedQFunctionFieldGetSize(qfinputfields[i], &size); CeedChkBackend(ierr);
590         ierr = CeedVectorSetValue(impl->qvecsin[i], 0.0); CeedChkBackend(ierr);
591         ierr = CeedVectorGetArray(impl->qvecsin[i], CEED_MEM_DEVICE, &tmp);
592         CeedChkBackend(ierr);
593         ierr = CeedRealloc(numactivein + size, &activein); CeedChkBackend(ierr);
594         for (CeedInt field = 0; field < size; field++) {
595           q_size = (CeedSize)Q*numelements;
596           ierr = CeedVectorCreate(ceed, q_size, &activein[numactivein+field]);
597           CeedChkBackend(ierr);
598           ierr = CeedVectorSetArray(activein[numactivein+field], CEED_MEM_DEVICE,
599                                     CEED_USE_POINTER, &tmp[field*Q*numelements]);
600           CeedChkBackend(ierr);
601         }
602         numactivein += size;
603         ierr = CeedVectorRestoreArray(impl->qvecsin[i], &tmp); CeedChkBackend(ierr);
604       }
605     }
606     impl->qfnumactivein = numactivein;
607     impl->qfactivein = activein;
608   }
609 
610   // Count number of active output fields
611   if (!numactiveout) {
612     for (CeedInt i=0; i<numoutputfields; i++) {
613       // Get output vector
614       ierr = CeedOperatorFieldGetVector(opoutputfields[i], &vec);
615       CeedChkBackend(ierr);
616       // Check if active output
617       if (vec == CEED_VECTOR_ACTIVE) {
618         ierr = CeedQFunctionFieldGetSize(qfoutputfields[i], &size);
619         CeedChkBackend(ierr);
620         numactiveout += size;
621       }
622     }
623     impl->qfnumactiveout = numactiveout;
624   }
625 
626   // Check sizes
627   if (!numactivein || !numactiveout)
628     // LCOV_EXCL_START
629     return CeedError(ceed, CEED_ERROR_BACKEND,
630                      "Cannot assemble QFunction without active inputs "
631                      "and outputs");
632   // LCOV_EXCL_STOP
633 
634   // Build objects if needed
635   if (build_objects) {
636     // Create output restriction
637     CeedInt strides[3] = {1, numelements*Q, Q}; /* *NOPAD* */
638     ierr = CeedElemRestrictionCreateStrided(ceedparent, numelements, Q,
639                                             numactivein*numactiveout,
640                                             numactivein*numactiveout*numelements*Q,
641                                             strides, rstr); CeedChkBackend(ierr);
642     // Create assembled vector
643     CeedSize l_size = (CeedSize)numelements*Q*numactivein*numactiveout;
644     ierr = CeedVectorCreate(ceedparent, l_size, assembled); CeedChkBackend(ierr);
645   }
646   ierr = CeedVectorSetValue(*assembled, 0.0); CeedChkBackend(ierr);
647   ierr = CeedVectorGetArray(*assembled, CEED_MEM_DEVICE, &a);
648   CeedChkBackend(ierr);
649 
650   // Input basis apply
651   ierr = CeedOperatorInputBasis_Cuda(numelements, qfinputfields, opinputfields,
652                                      numinputfields, true, edata, impl);
653   CeedChkBackend(ierr);
654 
655   // Assemble QFunction
656   for (CeedInt in=0; in<numactivein; in++) {
657     // Set Inputs
658     ierr = CeedVectorSetValue(activein[in], 1.0); CeedChkBackend(ierr);
659     if (numactivein > 1) {
660       ierr = CeedVectorSetValue(activein[(in+numactivein-1)%numactivein],
661                                 0.0); CeedChkBackend(ierr);
662     }
663     // Set Outputs
664     for (CeedInt out=0; out<numoutputfields; out++) {
665       // Get output vector
666       ierr = CeedOperatorFieldGetVector(opoutputfields[out], &vec);
667       CeedChkBackend(ierr);
668       // Check if active output
669       if (vec == CEED_VECTOR_ACTIVE) {
670         CeedVectorSetArray(impl->qvecsout[out], CEED_MEM_DEVICE,
671                            CEED_USE_POINTER, a); CeedChkBackend(ierr);
672         ierr = CeedQFunctionFieldGetSize(qfoutputfields[out], &size);
673         CeedChkBackend(ierr);
674         a += size*Q*numelements; // Advance the pointer by the size of the output
675       }
676     }
677     // Apply QFunction
678     ierr = CeedQFunctionApply(qf, Q*numelements, impl->qvecsin, impl->qvecsout);
679     CeedChkBackend(ierr);
680   }
681 
682   // Un-set output Qvecs to prevent accidental overwrite of Assembled
683   for (CeedInt out=0; out<numoutputfields; out++) {
684     // Get output vector
685     ierr = CeedOperatorFieldGetVector(opoutputfields[out], &vec);
686     CeedChkBackend(ierr);
687     // Check if active output
688     if (vec == CEED_VECTOR_ACTIVE) {
689       ierr = CeedVectorTakeArray(impl->qvecsout[out], CEED_MEM_DEVICE, NULL);
690       CeedChkBackend(ierr);
691     }
692   }
693 
694   // Restore input arrays
695   ierr = CeedOperatorRestoreInputs_Cuda(numinputfields, qfinputfields,
696                                         opinputfields, true, edata, impl);
697   CeedChkBackend(ierr);
698 
699   // Restore output
700   ierr = CeedVectorRestoreArray(*assembled, &a); CeedChkBackend(ierr);
701 
702   return CEED_ERROR_SUCCESS;
703 }
704 
705 //------------------------------------------------------------------------------
706 // Assemble Linear QFunction
707 //------------------------------------------------------------------------------
708 static int CeedOperatorLinearAssembleQFunction_Cuda(CeedOperator op,
709     CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) {
710   return CeedOperatorLinearAssembleQFunctionCore_Cuda(op, true, assembled, rstr,
711          request);
712 }
713 
714 //------------------------------------------------------------------------------
715 // Update Assembled Linear QFunction
716 //------------------------------------------------------------------------------
717 static int CeedOperatorLinearAssembleQFunctionUpdate_Cuda(CeedOperator op,
718     CeedVector assembled, CeedElemRestriction rstr, CeedRequest *request) {
719   return CeedOperatorLinearAssembleQFunctionCore_Cuda(op, false, &assembled,
720          &rstr, request);
721 }
722 
723 //------------------------------------------------------------------------------
724 // Create point block restriction
725 //------------------------------------------------------------------------------
726 static int CreatePBRestriction(CeedElemRestriction rstr,
727                                CeedElemRestriction *pbRstr) {
728   int ierr;
729   Ceed ceed;
730   ierr = CeedElemRestrictionGetCeed(rstr, &ceed); CeedChkBackend(ierr);
731   const CeedInt *offsets;
732   ierr = CeedElemRestrictionGetOffsets(rstr, CEED_MEM_HOST, &offsets);
733   CeedChkBackend(ierr);
734 
735   // Expand offsets
736   CeedInt nelem, ncomp, elemsize, compstride, max = 1, *pbOffsets;
737   ierr = CeedElemRestrictionGetNumElements(rstr, &nelem); CeedChkBackend(ierr);
738   ierr = CeedElemRestrictionGetNumComponents(rstr, &ncomp); CeedChkBackend(ierr);
739   ierr = CeedElemRestrictionGetElementSize(rstr, &elemsize); CeedChkBackend(ierr);
740   ierr = CeedElemRestrictionGetCompStride(rstr, &compstride);
741   CeedChkBackend(ierr);
742   CeedInt shift = ncomp;
743   if (compstride != 1)
744     shift *= ncomp;
745   ierr = CeedCalloc(nelem*elemsize, &pbOffsets); CeedChkBackend(ierr);
746   for (CeedInt i = 0; i < nelem*elemsize; i++) {
747     pbOffsets[i] = offsets[i]*shift;
748     if (pbOffsets[i] > max)
749       max = pbOffsets[i];
750   }
751 
752   // Create new restriction
753   ierr = CeedElemRestrictionCreate(ceed, nelem, elemsize, ncomp*ncomp, 1,
754                                    max + ncomp*ncomp, CEED_MEM_HOST,
755                                    CEED_OWN_POINTER, pbOffsets, pbRstr);
756   CeedChkBackend(ierr);
757 
758   // Cleanup
759   ierr = CeedElemRestrictionRestoreOffsets(rstr, &offsets); CeedChkBackend(ierr);
760 
761   return CEED_ERROR_SUCCESS;
762 }
763 
764 //------------------------------------------------------------------------------
765 // Assemble diagonal setup
766 //------------------------------------------------------------------------------
767 static inline int CeedOperatorAssembleDiagonalSetup_Cuda(CeedOperator op,
768     const bool pointBlock) {
769   int ierr;
770   Ceed ceed;
771   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
772   CeedQFunction qf;
773   ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr);
774   CeedInt numinputfields, numoutputfields;
775   ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields);
776   CeedChkBackend(ierr);
777 
778   // Determine active input basis
779   CeedOperatorField *opfields;
780   CeedQFunctionField *qffields;
781   ierr = CeedOperatorGetFields(op, NULL, &opfields, NULL, NULL);
782   CeedChkBackend(ierr);
783   ierr = CeedQFunctionGetFields(qf, NULL, &qffields, NULL, NULL);
784   CeedChkBackend(ierr);
785   CeedInt numemodein = 0, ncomp = 0, dim = 1;
786   CeedEvalMode *emodein = NULL;
787   CeedBasis basisin = NULL;
788   CeedElemRestriction rstrin = NULL;
789   for (CeedInt i = 0; i < numinputfields; i++) {
790     CeedVector vec;
791     ierr = CeedOperatorFieldGetVector(opfields[i], &vec); CeedChkBackend(ierr);
792     if (vec == CEED_VECTOR_ACTIVE) {
793       CeedElemRestriction rstr;
794       ierr = CeedOperatorFieldGetBasis(opfields[i], &basisin); CeedChkBackend(ierr);
795       ierr = CeedBasisGetNumComponents(basisin, &ncomp); CeedChkBackend(ierr);
796       ierr = CeedBasisGetDimension(basisin, &dim); CeedChkBackend(ierr);
797       ierr = CeedOperatorFieldGetElemRestriction(opfields[i], &rstr);
798       CeedChkBackend(ierr);
799       if (rstrin && rstrin != rstr)
800         // LCOV_EXCL_START
801         return CeedError(ceed, CEED_ERROR_BACKEND,
802                          "Multi-field non-composite operator diagonal assembly not supported");
803       // LCOV_EXCL_STOP
804       rstrin = rstr;
805       CeedEvalMode emode;
806       ierr = CeedQFunctionFieldGetEvalMode(qffields[i], &emode);
807       CeedChkBackend(ierr);
808       switch (emode) {
809       case CEED_EVAL_NONE:
810       case CEED_EVAL_INTERP:
811         ierr = CeedRealloc(numemodein + 1, &emodein); CeedChkBackend(ierr);
812         emodein[numemodein] = emode;
813         numemodein += 1;
814         break;
815       case CEED_EVAL_GRAD:
816         ierr = CeedRealloc(numemodein + dim, &emodein); CeedChkBackend(ierr);
817         for (CeedInt d = 0; d < dim; d++)
818           emodein[numemodein+d] = emode;
819         numemodein += dim;
820         break;
821       case CEED_EVAL_WEIGHT:
822       case CEED_EVAL_DIV:
823       case CEED_EVAL_CURL:
824         break; // Caught by QF Assembly
825       }
826     }
827   }
828 
829   // Determine active output basis
830   ierr = CeedOperatorGetFields(op, NULL, NULL, NULL, &opfields);
831   CeedChkBackend(ierr);
832   ierr = CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qffields);
833   CeedChkBackend(ierr);
834   CeedInt numemodeout = 0;
835   CeedEvalMode *emodeout = NULL;
836   CeedBasis basisout = NULL;
837   CeedElemRestriction rstrout = NULL;
838   for (CeedInt i = 0; i < numoutputfields; i++) {
839     CeedVector vec;
840     ierr = CeedOperatorFieldGetVector(opfields[i], &vec); CeedChkBackend(ierr);
841     if (vec == CEED_VECTOR_ACTIVE) {
842       CeedElemRestriction rstr;
843       ierr = CeedOperatorFieldGetBasis(opfields[i], &basisout); CeedChkBackend(ierr);
844       ierr = CeedOperatorFieldGetElemRestriction(opfields[i], &rstr);
845       CeedChkBackend(ierr);
846       if (rstrout && rstrout != rstr)
847         // LCOV_EXCL_START
848         return CeedError(ceed, CEED_ERROR_BACKEND,
849                          "Multi-field non-composite operator diagonal assembly not supported");
850       // LCOV_EXCL_STOP
851       rstrout = rstr;
852       CeedEvalMode emode;
853       ierr = CeedQFunctionFieldGetEvalMode(qffields[i], &emode); CeedChkBackend(ierr);
854       switch (emode) {
855       case CEED_EVAL_NONE:
856       case CEED_EVAL_INTERP:
857         ierr = CeedRealloc(numemodeout + 1, &emodeout); CeedChkBackend(ierr);
858         emodeout[numemodeout] = emode;
859         numemodeout += 1;
860         break;
861       case CEED_EVAL_GRAD:
862         ierr = CeedRealloc(numemodeout + dim, &emodeout); CeedChkBackend(ierr);
863         for (CeedInt d = 0; d < dim; d++)
864           emodeout[numemodeout+d] = emode;
865         numemodeout += dim;
866         break;
867       case CEED_EVAL_WEIGHT:
868       case CEED_EVAL_DIV:
869       case CEED_EVAL_CURL:
870         break; // Caught by QF Assembly
871       }
872     }
873   }
874 
875   // Operator data struct
876   CeedOperator_Cuda *impl;
877   ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr);
878   ierr = CeedCalloc(1, &impl->diag); CeedChkBackend(ierr);
879   CeedOperatorDiag_Cuda *diag = impl->diag;
880   diag->basisin = basisin;
881   diag->basisout = basisout;
882   diag->h_emodein = emodein;
883   diag->h_emodeout = emodeout;
884   diag->numemodein = numemodein;
885   diag->numemodeout = numemodeout;
886 
887   // Assemble kernel
888   char *diagonal_kernel_path, *diagonal_kernel_source;
889   ierr = CeedGetJitAbsolutePath(ceed,
890                                 "ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h",
891                                 &diagonal_kernel_path); CeedChkBackend(ierr);
892   CeedDebug256(ceed, 2, "----- Loading Diagonal Assembly Kernel Source -----\n");
893   ierr = CeedLoadSourceToBuffer(ceed, diagonal_kernel_path,
894                                 &diagonal_kernel_source);
895   CeedChkBackend(ierr);
896   CeedDebug256(ceed, 2,
897                "----- Loading Diagonal Assembly Source Complete! -----\n");
898   CeedInt nnodes, nqpts;
899   ierr = CeedBasisGetNumNodes(basisin, &nnodes); CeedChkBackend(ierr);
900   ierr = CeedBasisGetNumQuadraturePoints(basisin, &nqpts); CeedChkBackend(ierr);
901   diag->nnodes = nnodes;
902   ierr = CeedCompileCuda(ceed, diagonal_kernel_source, &diag->module, 5,
903                          "NUMEMODEIN", numemodein,
904                          "NUMEMODEOUT", numemodeout,
905                          "NNODES", nnodes,
906                          "NQPTS", nqpts,
907                          "NCOMP", ncomp
908                         ); CeedChk_Cu(ceed, ierr);
909   ierr = CeedGetKernelCuda(ceed, diag->module, "linearDiagonal",
910                            &diag->linearDiagonal); CeedChk_Cu(ceed, ierr);
911   ierr = CeedGetKernelCuda(ceed, diag->module, "linearPointBlockDiagonal",
912                            &diag->linearPointBlock);
913   CeedChk_Cu(ceed, ierr);
914   ierr = CeedFree(&diagonal_kernel_path); CeedChkBackend(ierr);
915   ierr = CeedFree(&diagonal_kernel_source); CeedChkBackend(ierr);
916 
917   // Basis matrices
918   const CeedInt qBytes = nqpts * sizeof(CeedScalar);
919   const CeedInt iBytes = qBytes * nnodes;
920   const CeedInt gBytes = qBytes * nnodes * dim;
921   const CeedInt eBytes = sizeof(CeedEvalMode);
922   const CeedScalar *interpin, *interpout, *gradin, *gradout;
923 
924   // CEED_EVAL_NONE
925   CeedScalar *identity = NULL;
926   bool evalNone = false;
927   for (CeedInt i=0; i<numemodein; i++)
928     evalNone = evalNone || (emodein[i] == CEED_EVAL_NONE);
929   for (CeedInt i=0; i<numemodeout; i++)
930     evalNone = evalNone || (emodeout[i] == CEED_EVAL_NONE);
931   if (evalNone) {
932     ierr = CeedCalloc(nqpts*nnodes, &identity); CeedChkBackend(ierr);
933     for (CeedInt i=0; i<(nnodes<nqpts?nnodes:nqpts); i++)
934       identity[i*nnodes+i] = 1.0;
935     ierr = cudaMalloc((void **)&diag->d_identity, iBytes); CeedChk_Cu(ceed, ierr);
936     ierr = cudaMemcpy(diag->d_identity, identity, iBytes,
937                       cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
938   }
939 
940   // CEED_EVAL_INTERP
941   ierr = CeedBasisGetInterp(basisin, &interpin); CeedChkBackend(ierr);
942   ierr = cudaMalloc((void **)&diag->d_interpin, iBytes); CeedChk_Cu(ceed, ierr);
943   ierr = cudaMemcpy(diag->d_interpin, interpin, iBytes,
944                     cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
945   ierr = CeedBasisGetInterp(basisout, &interpout); CeedChkBackend(ierr);
946   ierr = cudaMalloc((void **)&diag->d_interpout, iBytes); CeedChk_Cu(ceed, ierr);
947   ierr = cudaMemcpy(diag->d_interpout, interpout, iBytes,
948                     cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
949 
950   // CEED_EVAL_GRAD
951   ierr = CeedBasisGetGrad(basisin, &gradin); CeedChkBackend(ierr);
952   ierr = cudaMalloc((void **)&diag->d_gradin, gBytes); CeedChk_Cu(ceed, ierr);
953   ierr = cudaMemcpy(diag->d_gradin, gradin, gBytes,
954                     cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
955   ierr = CeedBasisGetGrad(basisout, &gradout); CeedChkBackend(ierr);
956   ierr = cudaMalloc((void **)&diag->d_gradout, gBytes); CeedChk_Cu(ceed, ierr);
957   ierr = cudaMemcpy(diag->d_gradout, gradout, gBytes,
958                     cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
959 
960   // Arrays of emodes
961   ierr = cudaMalloc((void **)&diag->d_emodein, numemodein * eBytes);
962   CeedChk_Cu(ceed, ierr);
963   ierr = cudaMemcpy(diag->d_emodein, emodein, numemodein * eBytes,
964                     cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
965   ierr = cudaMalloc((void **)&diag->d_emodeout, numemodeout * eBytes);
966   CeedChk_Cu(ceed, ierr);
967   ierr = cudaMemcpy(diag->d_emodeout, emodeout, numemodeout * eBytes,
968                     cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
969 
970   // Restriction
971   diag->diagrstr = rstrout;
972 
973   return CEED_ERROR_SUCCESS;
974 }
975 
976 //------------------------------------------------------------------------------
977 // Assemble diagonal common code
978 //------------------------------------------------------------------------------
979 static inline int CeedOperatorAssembleDiagonalCore_Cuda(CeedOperator op,
980     CeedVector assembled, CeedRequest *request, const bool pointBlock) {
981   int ierr;
982   Ceed ceed;
983   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
984   CeedOperator_Cuda *impl;
985   ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr);
986 
987   // Assemble QFunction
988   CeedVector assembledqf;
989   CeedElemRestriction rstr;
990   ierr = CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembledqf,
991          &rstr, request); CeedChkBackend(ierr);
992   ierr = CeedElemRestrictionDestroy(&rstr); CeedChkBackend(ierr);
993   CeedScalar maxnorm = 0;
994   ierr = CeedVectorNorm(assembledqf, CEED_NORM_MAX, &maxnorm);
995   CeedChkBackend(ierr);
996 
997   // Setup
998   if (!impl->diag) {
999     ierr = CeedOperatorAssembleDiagonalSetup_Cuda(op, pointBlock);
1000     CeedChkBackend(ierr);
1001   }
1002   CeedOperatorDiag_Cuda *diag = impl->diag;
1003   assert(diag != NULL);
1004 
1005   // Restriction
1006   if (pointBlock && !diag->pbdiagrstr) {
1007     CeedElemRestriction pbdiagrstr;
1008     ierr = CreatePBRestriction(diag->diagrstr, &pbdiagrstr); CeedChkBackend(ierr);
1009     diag->pbdiagrstr = pbdiagrstr;
1010   }
1011   CeedElemRestriction diagrstr = pointBlock ? diag->pbdiagrstr : diag->diagrstr;
1012 
1013   // Create diagonal vector
1014   CeedVector elemdiag = pointBlock ? diag->pbelemdiag : diag->elemdiag;
1015   if (!elemdiag) {
1016     ierr = CeedElemRestrictionCreateVector(diagrstr, NULL, &elemdiag);
1017     CeedChkBackend(ierr);
1018     if (pointBlock)
1019       diag->pbelemdiag = elemdiag;
1020     else
1021       diag->elemdiag = elemdiag;
1022   }
1023   ierr = CeedVectorSetValue(elemdiag, 0.0); CeedChkBackend(ierr);
1024 
1025   // Assemble element operator diagonals
1026   CeedScalar *elemdiagarray;
1027   const CeedScalar *assembledqfarray;
1028   ierr = CeedVectorGetArray(elemdiag, CEED_MEM_DEVICE, &elemdiagarray);
1029   CeedChkBackend(ierr);
1030   ierr = CeedVectorGetArrayRead(assembledqf, CEED_MEM_DEVICE, &assembledqfarray);
1031   CeedChkBackend(ierr);
1032   CeedInt nelem;
1033   ierr = CeedElemRestrictionGetNumElements(diagrstr, &nelem);
1034   CeedChkBackend(ierr);
1035 
1036   // Compute the diagonal of B^T D B
1037   int elemsPerBlock = 1;
1038   int grid = nelem/elemsPerBlock+((nelem/elemsPerBlock*elemsPerBlock<nelem)?1:0);
1039   void *args[] = {(void *) &nelem, (void *) &maxnorm, &diag->d_identity,
1040                   &diag->d_interpin, &diag->d_gradin, &diag->d_interpout,
1041                   &diag->d_gradout, &diag->d_emodein, &diag->d_emodeout,
1042                   &assembledqfarray, &elemdiagarray
1043                  };
1044   if (pointBlock) {
1045     ierr = CeedRunKernelDimCuda(ceed, diag->linearPointBlock, grid,
1046                                 diag->nnodes, 1, elemsPerBlock, args);
1047     CeedChkBackend(ierr);
1048   } else {
1049     ierr = CeedRunKernelDimCuda(ceed, diag->linearDiagonal, grid,
1050                                 diag->nnodes, 1, elemsPerBlock, args);
1051     CeedChkBackend(ierr);
1052   }
1053 
1054   // Restore arrays
1055   ierr = CeedVectorRestoreArray(elemdiag, &elemdiagarray); CeedChkBackend(ierr);
1056   ierr = CeedVectorRestoreArrayRead(assembledqf, &assembledqfarray);
1057   CeedChkBackend(ierr);
1058 
1059   // Assemble local operator diagonal
1060   ierr = CeedElemRestrictionApply(diagrstr, CEED_TRANSPOSE, elemdiag,
1061                                   assembled, request); CeedChkBackend(ierr);
1062 
1063   // Cleanup
1064   ierr = CeedVectorDestroy(&assembledqf); CeedChkBackend(ierr);
1065 
1066   return CEED_ERROR_SUCCESS;
1067 }
1068 
1069 //------------------------------------------------------------------------------
1070 // Assemble Linear Diagonal
1071 //------------------------------------------------------------------------------
1072 static int CeedOperatorLinearAssembleAddDiagonal_Cuda(CeedOperator op,
1073     CeedVector assembled, CeedRequest *request) {
1074   int ierr = CeedOperatorAssembleDiagonalCore_Cuda(op, assembled, request, false);
1075   CeedChkBackend(ierr);
1076   return CEED_ERROR_SUCCESS;
1077 }
1078 
1079 //------------------------------------------------------------------------------
1080 // Assemble Linear Point Block Diagonal
1081 //------------------------------------------------------------------------------
1082 static int CeedOperatorLinearAssembleAddPointBlockDiagonal_Cuda(CeedOperator op,
1083     CeedVector assembled, CeedRequest *request) {
1084   int ierr = CeedOperatorAssembleDiagonalCore_Cuda(op, assembled, request, true);
1085   CeedChkBackend(ierr);
1086   return CEED_ERROR_SUCCESS;
1087 }
1088 
1089 //------------------------------------------------------------------------------
1090 // Single operator assembly setup
1091 //------------------------------------------------------------------------------
1092 static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op) {
1093   int ierr;
1094   Ceed ceed;
1095   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
1096   CeedOperator_Cuda *impl;
1097   ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr);
1098 
1099   // Get intput and output fields
1100   CeedInt num_input_fields, num_output_fields;
1101   CeedOperatorField *input_fields;
1102   CeedOperatorField *output_fields;
1103   ierr = CeedOperatorGetFields(op, &num_input_fields, &input_fields,
1104                                &num_output_fields, &output_fields); CeedChkBackend(ierr);
1105 
1106   // Determine active input basis eval mode
1107   CeedQFunction qf;
1108   ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr);
1109   CeedQFunctionField *qf_fields;
1110   ierr = CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL);
1111   CeedChkBackend(ierr);
1112   // Note that the kernel will treat each dimension of a gradient action separately;
1113   // i.e., when an active input has a CEED_EVAL_GRAD mode, num_emode_in will increment
1114   // by dim.  However, for the purposes of loading the B matrices, it will be treated
1115   // as one mode, and we will load/copy the entire gradient matrix at once, so
1116   // num_B_in_mats_to_load will be incremented by 1.
1117   CeedInt num_emode_in = 0, dim = 1, num_B_in_mats_to_load = 0, size_B_in = 0;
1118   CeedEvalMode *eval_mode_in = NULL; //will be of size num_B_in_mats_load
1119   CeedBasis basis_in = NULL;
1120   CeedInt nqpts = 0, esize = 0;
1121   CeedElemRestriction rstr_in = NULL;
1122   for (CeedInt i=0; i<num_input_fields; i++) {
1123     CeedVector vec;
1124     ierr = CeedOperatorFieldGetVector(input_fields[i], &vec); CeedChkBackend(ierr);
1125     if (vec == CEED_VECTOR_ACTIVE) {
1126       ierr = CeedOperatorFieldGetBasis(input_fields[i], &basis_in);
1127       CeedChkBackend(ierr);
1128       ierr = CeedBasisGetDimension(basis_in, &dim); CeedChkBackend(ierr);
1129       ierr = CeedBasisGetNumQuadraturePoints(basis_in, &nqpts); CeedChkBackend(ierr);
1130       ierr = CeedOperatorFieldGetElemRestriction(input_fields[i], &rstr_in);
1131       CeedChkBackend(ierr);
1132       ierr = CeedElemRestrictionGetElementSize(rstr_in, &esize); CeedChkBackend(ierr);
1133       CeedEvalMode eval_mode;
1134       ierr = CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode);
1135       CeedChkBackend(ierr);
1136       if (eval_mode != CEED_EVAL_NONE) {
1137         ierr = CeedRealloc(num_B_in_mats_to_load + 1, &eval_mode_in);
1138         CeedChkBackend(ierr);
1139         eval_mode_in[num_B_in_mats_to_load] = eval_mode;
1140         num_B_in_mats_to_load += 1;
1141         if (eval_mode == CEED_EVAL_GRAD) {
1142           num_emode_in += dim;
1143           size_B_in += dim * esize * nqpts;
1144         } else {
1145           num_emode_in +=1;
1146           size_B_in += esize * nqpts;
1147         }
1148       }
1149     }
1150   }
1151 
1152   // Determine active output basis; basis_out and rstr_out only used if same as input, TODO
1153   ierr = CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields);
1154   CeedChkBackend(ierr);
1155   CeedInt num_emode_out = 0, num_B_out_mats_to_load = 0, size_B_out = 0;
1156   CeedEvalMode *eval_mode_out = NULL;
1157   CeedBasis basis_out = NULL;
1158   CeedElemRestriction rstr_out = NULL;
1159   for (CeedInt i=0; i<num_output_fields; i++) {
1160     CeedVector vec;
1161     ierr = CeedOperatorFieldGetVector(output_fields[i], &vec); CeedChkBackend(ierr);
1162     if (vec == CEED_VECTOR_ACTIVE) {
1163       ierr = CeedOperatorFieldGetBasis(output_fields[i], &basis_out);
1164       CeedChkBackend(ierr);
1165       ierr = CeedOperatorFieldGetElemRestriction(output_fields[i], &rstr_out);
1166       CeedChkBackend(ierr);
1167       if (rstr_out && rstr_out != rstr_in)
1168         // LCOV_EXCL_START
1169         return CeedError(ceed, CEED_ERROR_BACKEND,
1170                          "Multi-field non-composite operator assembly not supported");
1171       // LCOV_EXCL_STOP
1172       CeedEvalMode eval_mode;
1173       ierr = CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode);
1174       CeedChkBackend(ierr);
1175       if (eval_mode != CEED_EVAL_NONE) {
1176         ierr = CeedRealloc(num_B_out_mats_to_load + 1, &eval_mode_out);
1177         CeedChkBackend(ierr);
1178         eval_mode_out[num_B_out_mats_to_load] = eval_mode;
1179         num_B_out_mats_to_load += 1;
1180         if (eval_mode == CEED_EVAL_GRAD) {
1181           num_emode_out += dim;
1182           size_B_out += dim * esize * nqpts;
1183         } else {
1184           num_emode_out +=1;
1185           size_B_out += esize * nqpts;
1186         }
1187       }
1188     }
1189   }
1190 
1191   if (num_emode_in == 0 || num_emode_out == 0)
1192     // LCOV_EXCL_START
1193     return CeedError(ceed, CEED_ERROR_UNSUPPORTED,
1194                      "Cannot assemble operator without inputs/outputs");
1195   // LCOV_EXCL_STOP
1196 
1197   CeedInt nelem, ncomp;
1198   ierr = CeedElemRestrictionGetNumElements(rstr_in, &nelem); CeedChkBackend(ierr);
1199   ierr = CeedElemRestrictionGetNumComponents(rstr_in, &ncomp);
1200   CeedChkBackend(ierr);
1201 
1202   ierr = CeedCalloc(1, &impl->asmb); CeedChkBackend(ierr);
1203   CeedOperatorAssemble_Cuda *asmb = impl->asmb;
1204   asmb->nelem = nelem;
1205 
1206   // Compile kernels
1207   int elemsPerBlock = 1;
1208   asmb->elemsPerBlock = elemsPerBlock;
1209   CeedInt block_size = esize * esize * elemsPerBlock;
1210   Ceed_Cuda *cuda_data;
1211   ierr = CeedGetData(ceed, &cuda_data); CeedChkBackend(ierr);
1212   char *assembly_kernel_path, *assembly_kernel_source;
1213   ierr = CeedGetJitAbsolutePath(ceed,
1214                                 "ceed/jit-source/cuda/cuda-ref-operator-assemble.h",
1215                                 &assembly_kernel_path); CeedChkBackend(ierr);
1216   CeedDebug256(ceed, 2, "----- Loading Assembly Kernel Source -----\n");
1217   ierr = CeedLoadSourceToBuffer(ceed, assembly_kernel_path,
1218                                 &assembly_kernel_source);
1219   CeedChkBackend(ierr);
1220   CeedDebug256(ceed, 2, "----- Loading Assembly Source Complete! -----\n");
1221   bool fallback = block_size > cuda_data->device_prop.maxThreadsPerBlock;
1222   if (fallback) {
1223     // Use fallback kernel with 1D threadblock
1224     block_size = esize * elemsPerBlock;
1225     asmb->block_size_x = esize;
1226     asmb->block_size_y = 1;
1227   } else {  // Use kernel with 2D threadblock
1228     asmb->block_size_x = esize;
1229     asmb->block_size_y = esize;
1230   }
1231   ierr = CeedCompileCuda(ceed, assembly_kernel_source, &asmb->module, 7,
1232                          "NELEM", nelem,
1233                          "NUMEMODEIN", num_emode_in,
1234                          "NUMEMODEOUT", num_emode_out,
1235                          "NQPTS", nqpts,
1236                          "NNODES", esize,
1237                          "BLOCK_SIZE", block_size,
1238                          "NCOMP", ncomp
1239                         ); CeedChk_Cu(ceed, ierr);
1240   ierr = CeedGetKernelCuda(ceed, asmb->module,
1241                            fallback ? "linearAssembleFallback" : "linearAssemble",
1242                            &asmb->linearAssemble); CeedChk_Cu(ceed, ierr);
1243   ierr = CeedFree(&assembly_kernel_path); CeedChkBackend(ierr);
1244   ierr = CeedFree(&assembly_kernel_source); CeedChkBackend(ierr);
1245 
1246   // Build 'full' B matrices (not 1D arrays used for tensor-product matrices)
1247   const CeedScalar *interp_in, *grad_in;
1248   ierr = CeedBasisGetInterp(basis_in, &interp_in); CeedChkBackend(ierr);
1249   ierr = CeedBasisGetGrad(basis_in, &grad_in); CeedChkBackend(ierr);
1250 
1251   // Load into B_in, in order that they will be used in eval_mode
1252   const CeedInt inBytes = size_B_in * sizeof(CeedScalar);
1253   CeedInt mat_start = 0;
1254   ierr = cudaMalloc((void **) &asmb->d_B_in, inBytes); CeedChk_Cu(ceed, ierr);
1255   for (int i = 0; i < num_B_in_mats_to_load; i++) {
1256     CeedEvalMode eval_mode = eval_mode_in[i];
1257     if (eval_mode == CEED_EVAL_INTERP) {
1258       ierr = cudaMemcpy(&asmb->d_B_in[mat_start], interp_in,
1259                         esize * nqpts * sizeof(CeedScalar),
1260                         cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
1261       mat_start += esize * nqpts;
1262     } else if (eval_mode == CEED_EVAL_GRAD) {
1263       ierr = cudaMemcpy(&asmb->d_B_in[mat_start], grad_in,
1264                         dim * esize * nqpts * sizeof(CeedScalar),
1265                         cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
1266       mat_start += dim * esize * nqpts;
1267     }
1268   }
1269 
1270   const CeedScalar *interp_out, *grad_out;
1271   // Note that this function currently assumes 1 basis, so this should always be true
1272   // for now
1273   if (basis_out == basis_in) {
1274     interp_out = interp_in;
1275     grad_out = grad_in;
1276   } else {
1277     ierr = CeedBasisGetInterp(basis_out, &interp_out); CeedChkBackend(ierr);
1278     ierr = CeedBasisGetGrad(basis_out, &grad_out); CeedChkBackend(ierr);
1279   }
1280 
1281   // Load into B_out, in order that they will be used in eval_mode
1282   const CeedInt outBytes = size_B_out * sizeof(CeedScalar);
1283   mat_start = 0;
1284   ierr = cudaMalloc((void **) &asmb->d_B_out, outBytes); CeedChk_Cu(ceed, ierr);
1285   for (int i = 0; i < num_B_out_mats_to_load; i++) {
1286     CeedEvalMode eval_mode = eval_mode_out[i];
1287     if (eval_mode == CEED_EVAL_INTERP) {
1288       ierr = cudaMemcpy(&asmb->d_B_out[mat_start], interp_out,
1289                         esize * nqpts * sizeof(CeedScalar),
1290                         cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
1291       mat_start += esize * nqpts;
1292     } else if (eval_mode == CEED_EVAL_GRAD) {
1293       ierr = cudaMemcpy(&asmb->d_B_out[mat_start], grad_out,
1294                         dim * esize * nqpts * sizeof(CeedScalar),
1295                         cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
1296       mat_start += dim * esize * nqpts;
1297     }
1298   }
1299   return CEED_ERROR_SUCCESS;
1300 }
1301 
1302 //------------------------------------------------------------------------------
1303 // Assemble matrix data for COO matrix of assembled operator.
1304 // The sparsity pattern is set by CeedOperatorLinearAssembleSymbolic.
1305 //
1306 // Note that this (and other assembly routines) currently assume only one
1307 // active input restriction/basis per operator (could have multiple basis eval
1308 // modes).
1309 // TODO: allow multiple active input restrictions/basis objects
1310 //------------------------------------------------------------------------------
1311 static int CeedSingleOperatorAssemble_Cuda(CeedOperator op, CeedInt offset,
1312     CeedVector values) {
1313 
1314   int ierr;
1315   Ceed ceed;
1316   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
1317   CeedOperator_Cuda *impl;
1318   ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr);
1319 
1320   // Setup
1321   if (!impl->asmb) {
1322     ierr = CeedSingleOperatorAssembleSetup_Cuda(op);
1323     CeedChkBackend(ierr);
1324     assert(impl->asmb != NULL);
1325   }
1326 
1327   // Assemble QFunction
1328   CeedVector assembled_qf;
1329   CeedElemRestriction rstr_q;
1330   ierr = CeedOperatorLinearAssembleQFunctionBuildOrUpdate(
1331            op, &assembled_qf, &rstr_q, CEED_REQUEST_IMMEDIATE); CeedChkBackend(ierr);
1332   ierr = CeedElemRestrictionDestroy(&rstr_q); CeedChkBackend(ierr);
1333   CeedScalar *values_array;
1334   ierr = CeedVectorGetArrayWrite(values, CEED_MEM_DEVICE, &values_array);
1335   CeedChkBackend(ierr);
1336   values_array += offset;
1337   const CeedScalar *qf_array;
1338   ierr = CeedVectorGetArrayRead(assembled_qf, CEED_MEM_DEVICE, &qf_array);
1339   CeedChkBackend(ierr);
1340 
1341   // Compute B^T D B
1342   const CeedInt nelem = impl->asmb->nelem;
1343   const CeedInt elemsPerBlock = impl->asmb->elemsPerBlock;
1344   const CeedInt grid = nelem/elemsPerBlock+((
1345                          nelem/elemsPerBlock*elemsPerBlock<nelem)?1:0);
1346   void *args[] = {&impl->asmb->d_B_in, &impl->asmb->d_B_out,
1347                   &qf_array, &values_array
1348                  };
1349   ierr = CeedRunKernelDimCuda(ceed, impl->asmb->linearAssemble, grid,
1350                               impl->asmb->block_size_x, impl->asmb->block_size_y,
1351                               elemsPerBlock, args);
1352   CeedChkBackend(ierr);
1353 
1354 
1355   // Restore arrays
1356   ierr = CeedVectorRestoreArray(values, &values_array); CeedChkBackend(ierr);
1357   ierr = CeedVectorRestoreArrayRead(assembled_qf, &qf_array);
1358   CeedChkBackend(ierr);
1359 
1360   // Cleanup
1361   ierr = CeedVectorDestroy(&assembled_qf); CeedChkBackend(ierr);
1362 
1363   return CEED_ERROR_SUCCESS;
1364 }
1365 
1366 //------------------------------------------------------------------------------
1367 // Create operator
1368 //------------------------------------------------------------------------------
1369 int CeedOperatorCreate_Cuda(CeedOperator op) {
1370   int ierr;
1371   Ceed ceed;
1372   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
1373   CeedOperator_Cuda *impl;
1374 
1375   ierr = CeedCalloc(1, &impl); CeedChkBackend(ierr);
1376   ierr = CeedOperatorSetData(op, impl); CeedChkBackend(ierr);
1377 
1378   ierr = CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction",
1379                                 CeedOperatorLinearAssembleQFunction_Cuda);
1380   CeedChkBackend(ierr);
1381   ierr = CeedSetBackendFunction(ceed, "Operator", op,
1382                                 "LinearAssembleQFunctionUpdate",
1383                                 CeedOperatorLinearAssembleQFunctionUpdate_Cuda);
1384   CeedChkBackend(ierr);
1385   ierr = CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal",
1386                                 CeedOperatorLinearAssembleAddDiagonal_Cuda);
1387   CeedChkBackend(ierr);
1388   ierr = CeedSetBackendFunction(ceed, "Operator", op,
1389                                 "LinearAssembleAddPointBlockDiagonal",
1390                                 CeedOperatorLinearAssembleAddPointBlockDiagonal_Cuda);
1391   CeedChkBackend(ierr);
1392   ierr = CeedSetBackendFunction(ceed, "Operator", op,
1393                                 "LinearAssembleSingle", CeedSingleOperatorAssemble_Cuda);
1394   CeedChkBackend(ierr);
1395   ierr = CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd",
1396                                 CeedOperatorApplyAdd_Cuda); CeedChkBackend(ierr);
1397   ierr = CeedSetBackendFunction(ceed, "Operator", op, "Destroy",
1398                                 CeedOperatorDestroy_Cuda); CeedChkBackend(ierr);
1399   return CEED_ERROR_SUCCESS;
1400 }
1401 
1402 //------------------------------------------------------------------------------
1403