xref: /libCEED/backends/cuda-ref/ceed-cuda-ref-operator.c (revision 3d8e882215d238700cdceb37404f76ca7fa24eaa)
1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #include <ceed/ceed.h>
9 #include <ceed/backend.h>
10 #include <assert.h>
11 #include <cuda.h>
12 #include <cuda_runtime.h>
13 #include <stdbool.h>
14 #include <string.h>
15 #include "ceed-cuda-ref.h"
16 #include "../cuda/ceed-cuda-compile.h"
17 
18 //------------------------------------------------------------------------------
19 // Destroy operator
20 //------------------------------------------------------------------------------
21 static int CeedOperatorDestroy_Cuda(CeedOperator op) {
22   int ierr;
23   CeedOperator_Cuda *impl;
24   ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr);
25 
26   // Apply data
27   for (CeedInt i = 0; i < impl->numein + impl->numeout; i++) {
28     ierr = CeedVectorDestroy(&impl->evecs[i]); CeedChkBackend(ierr);
29   }
30   ierr = CeedFree(&impl->evecs); CeedChkBackend(ierr);
31 
32   for (CeedInt i = 0; i < impl->numein; i++) {
33     ierr = CeedVectorDestroy(&impl->qvecsin[i]); CeedChkBackend(ierr);
34   }
35   ierr = CeedFree(&impl->qvecsin); CeedChkBackend(ierr);
36 
37   for (CeedInt i = 0; i < impl->numeout; i++) {
38     ierr = CeedVectorDestroy(&impl->qvecsout[i]); CeedChkBackend(ierr);
39   }
40   ierr = CeedFree(&impl->qvecsout); CeedChkBackend(ierr);
41 
42   // QFunction assembly data
43   for (CeedInt i=0; i<impl->qfnumactivein; i++) {
44     ierr = CeedVectorDestroy(&impl->qfactivein[i]); CeedChkBackend(ierr);
45   }
46   ierr = CeedFree(&impl->qfactivein); CeedChkBackend(ierr);
47 
48   // Diag data
49   if (impl->diag) {
50     Ceed ceed;
51     ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
52     CeedChk_Cu(ceed, cuModuleUnload(impl->diag->module));
53     ierr = CeedFree(&impl->diag->h_emodein); CeedChkBackend(ierr);
54     ierr = CeedFree(&impl->diag->h_emodeout); CeedChkBackend(ierr);
55     ierr = cudaFree(impl->diag->d_emodein); CeedChk_Cu(ceed, ierr);
56     ierr = cudaFree(impl->diag->d_emodeout); CeedChk_Cu(ceed, ierr);
57     ierr = cudaFree(impl->diag->d_identity); CeedChk_Cu(ceed, ierr);
58     ierr = cudaFree(impl->diag->d_interpin); CeedChk_Cu(ceed, ierr);
59     ierr = cudaFree(impl->diag->d_interpout); CeedChk_Cu(ceed, ierr);
60     ierr = cudaFree(impl->diag->d_gradin); CeedChk_Cu(ceed, ierr);
61     ierr = cudaFree(impl->diag->d_gradout); CeedChk_Cu(ceed, ierr);
62     ierr = CeedElemRestrictionDestroy(&impl->diag->pbdiagrstr);
63     CeedChkBackend(ierr);
64     ierr = CeedVectorDestroy(&impl->diag->elemdiag); CeedChkBackend(ierr);
65     ierr = CeedVectorDestroy(&impl->diag->pbelemdiag); CeedChkBackend(ierr);
66   }
67   ierr = CeedFree(&impl->diag); CeedChkBackend(ierr);
68 
69   ierr = CeedFree(&impl); CeedChkBackend(ierr);
70   return CEED_ERROR_SUCCESS;
71 }
72 
73 //------------------------------------------------------------------------------
74 // Setup infields or outfields
75 //------------------------------------------------------------------------------
76 static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op,
77                                         bool isinput, CeedVector *evecs,
78                                         CeedVector *qvecs, CeedInt starte,
79                                         CeedInt numfields, CeedInt Q,
80                                         CeedInt numelements) {
81   CeedInt dim, ierr, size;
82   Ceed ceed;
83   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
84   CeedBasis basis;
85   CeedElemRestriction Erestrict;
86   CeedOperatorField *opfields;
87   CeedQFunctionField *qffields;
88   CeedVector fieldvec;
89   bool strided;
90   bool skiprestrict;
91 
92   if (isinput) {
93     ierr = CeedOperatorGetFields(op, NULL, &opfields, NULL, NULL);
94     CeedChkBackend(ierr);
95     ierr = CeedQFunctionGetFields(qf, NULL, &qffields, NULL, NULL);
96     CeedChkBackend(ierr);
97   } else {
98     ierr = CeedOperatorGetFields(op, NULL, NULL, NULL, &opfields);
99     CeedChkBackend(ierr);
100     ierr = CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qffields);
101     CeedChkBackend(ierr);
102   }
103 
104   // Loop over fields
105   for (CeedInt i = 0; i < numfields; i++) {
106     CeedEvalMode emode;
107     ierr = CeedQFunctionFieldGetEvalMode(qffields[i], &emode); CeedChkBackend(ierr);
108 
109     strided = false;
110     skiprestrict = false;
111     if (emode != CEED_EVAL_WEIGHT) {
112       ierr = CeedOperatorFieldGetElemRestriction(opfields[i], &Erestrict);
113       CeedChkBackend(ierr);
114 
115       // Check whether this field can skip the element restriction:
116       // must be passive input, with emode NONE, and have a strided restriction with
117       // CEED_STRIDES_BACKEND.
118 
119       // First, check whether the field is input or output:
120       if (isinput) {
121         // Check for passive input:
122         ierr = CeedOperatorFieldGetVector(opfields[i], &fieldvec); CeedChkBackend(ierr);
123         if (fieldvec != CEED_VECTOR_ACTIVE) {
124           // Check emode
125           if (emode == CEED_EVAL_NONE) {
126             // Check for strided restriction
127             ierr = CeedElemRestrictionIsStrided(Erestrict, &strided);
128             CeedChkBackend(ierr);
129             if (strided) {
130               // Check if vector is already in preferred backend ordering
131               ierr = CeedElemRestrictionHasBackendStrides(Erestrict,
132                      &skiprestrict); CeedChkBackend(ierr);
133             }
134           }
135         }
136       }
137       if (skiprestrict) {
138         // We do not need an E-Vector, but will use the input field vector's data
139         // directly in the operator application.
140         evecs[i + starte] = NULL;
141       } else {
142         ierr = CeedElemRestrictionCreateVector(Erestrict, NULL,
143                                                &evecs[i + starte]);
144         CeedChkBackend(ierr);
145       }
146     }
147 
148     switch (emode) {
149     case CEED_EVAL_NONE:
150       ierr = CeedQFunctionFieldGetSize(qffields[i], &size); CeedChkBackend(ierr);
151       ierr = CeedVectorCreate(ceed, numelements * Q * size, &qvecs[i]);
152       CeedChkBackend(ierr);
153       break;
154     case CEED_EVAL_INTERP:
155       ierr = CeedQFunctionFieldGetSize(qffields[i], &size); CeedChkBackend(ierr);
156       ierr = CeedVectorCreate(ceed, numelements * Q * size, &qvecs[i]);
157       CeedChkBackend(ierr);
158       break;
159     case CEED_EVAL_GRAD:
160       ierr = CeedOperatorFieldGetBasis(opfields[i], &basis); CeedChkBackend(ierr);
161       ierr = CeedQFunctionFieldGetSize(qffields[i], &size); CeedChkBackend(ierr);
162       ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
163       ierr = CeedVectorCreate(ceed, numelements * Q * size, &qvecs[i]);
164       CeedChkBackend(ierr);
165       break;
166     case CEED_EVAL_WEIGHT: // Only on input fields
167       ierr = CeedOperatorFieldGetBasis(opfields[i], &basis); CeedChkBackend(ierr);
168       ierr = CeedVectorCreate(ceed, numelements * Q, &qvecs[i]); CeedChkBackend(ierr);
169       ierr = CeedBasisApply(basis, numelements, CEED_NOTRANSPOSE,
170                             CEED_EVAL_WEIGHT, NULL, qvecs[i]); CeedChkBackend(ierr);
171       break;
172     case CEED_EVAL_DIV:
173       break; // TODO: Not implemented
174     case CEED_EVAL_CURL:
175       break; // TODO: Not implemented
176     }
177   }
178   return CEED_ERROR_SUCCESS;
179 }
180 
181 //------------------------------------------------------------------------------
182 // CeedOperator needs to connect all the named fields (be they active or passive)
183 //   to the named inputs and outputs of its CeedQFunction.
184 //------------------------------------------------------------------------------
185 static int CeedOperatorSetup_Cuda(CeedOperator op) {
186   int ierr;
187   bool setupdone;
188   ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChkBackend(ierr);
189   if (setupdone)
190     return CEED_ERROR_SUCCESS;
191   Ceed ceed;
192   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
193   CeedOperator_Cuda *impl;
194   ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr);
195   CeedQFunction qf;
196   ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr);
197   CeedInt Q, numelements, numinputfields, numoutputfields;
198   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr);
199   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr);
200   CeedChkBackend(ierr);
201   CeedOperatorField *opinputfields, *opoutputfields;
202   ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields,
203                                &numoutputfields, &opoutputfields);
204   CeedChkBackend(ierr);
205   CeedQFunctionField *qfinputfields, *qfoutputfields;
206   ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields);
207   CeedChkBackend(ierr);
208 
209   // Allocate
210   ierr = CeedCalloc(numinputfields + numoutputfields, &impl->evecs);
211   CeedChkBackend(ierr);
212 
213   ierr = CeedCalloc(CEED_FIELD_MAX, &impl->qvecsin); CeedChkBackend(ierr);
214   ierr = CeedCalloc(CEED_FIELD_MAX, &impl->qvecsout); CeedChkBackend(ierr);
215 
216   impl->numein = numinputfields; impl->numeout = numoutputfields;
217 
218   // Set up infield and outfield evecs and qvecs
219   // Infields
220   ierr = CeedOperatorSetupFields_Cuda(qf, op, true,
221                                       impl->evecs, impl->qvecsin, 0,
222                                       numinputfields, Q, numelements);
223   CeedChkBackend(ierr);
224 
225   // Outfields
226   ierr = CeedOperatorSetupFields_Cuda(qf, op, false,
227                                       impl->evecs, impl->qvecsout,
228                                       numinputfields, numoutputfields, Q,
229                                       numelements); CeedChkBackend(ierr);
230 
231   ierr = CeedOperatorSetSetupDone(op); CeedChkBackend(ierr);
232   return CEED_ERROR_SUCCESS;
233 }
234 
235 //------------------------------------------------------------------------------
236 // Setup Operator Inputs
237 //------------------------------------------------------------------------------
238 static inline int CeedOperatorSetupInputs_Cuda(CeedInt numinputfields,
239     CeedQFunctionField *qfinputfields, CeedOperatorField *opinputfields,
240     CeedVector invec, const bool skipactive, CeedScalar *edata[2*CEED_FIELD_MAX],
241     CeedOperator_Cuda *impl, CeedRequest *request) {
242   CeedInt ierr;
243   CeedEvalMode emode;
244   CeedVector vec;
245   CeedElemRestriction Erestrict;
246 
247   for (CeedInt i = 0; i < numinputfields; i++) {
248     // Get input vector
249     ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr);
250     if (vec == CEED_VECTOR_ACTIVE) {
251       if (skipactive)
252         continue;
253       else
254         vec = invec;
255     }
256 
257     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
258     CeedChkBackend(ierr);
259     if (emode == CEED_EVAL_WEIGHT) { // Skip
260     } else {
261       // Get input vector
262       ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr);
263       // Get input element restriction
264       ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
265       CeedChkBackend(ierr);
266       if (vec == CEED_VECTOR_ACTIVE)
267         vec = invec;
268       // Restrict, if necessary
269       if (!impl->evecs[i]) {
270         // No restriction for this field; read data directly from vec.
271         ierr = CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE,
272                                       (const CeedScalar **) &edata[i]);
273         CeedChkBackend(ierr);
274       } else {
275         ierr = CeedElemRestrictionApply(Erestrict, CEED_NOTRANSPOSE, vec,
276                                         impl->evecs[i], request); CeedChkBackend(ierr);
277         // Get evec
278         ierr = CeedVectorGetArrayRead(impl->evecs[i], CEED_MEM_DEVICE,
279                                       (const CeedScalar **) &edata[i]);
280         CeedChkBackend(ierr);
281       }
282     }
283   }
284   return CEED_ERROR_SUCCESS;
285 }
286 
287 //------------------------------------------------------------------------------
288 // Input Basis Action
289 //------------------------------------------------------------------------------
290 static inline int CeedOperatorInputBasis_Cuda(CeedInt numelements,
291     CeedQFunctionField *qfinputfields, CeedOperatorField *opinputfields,
292     CeedInt numinputfields, const bool skipactive,
293     CeedScalar *edata[2*CEED_FIELD_MAX],CeedOperator_Cuda *impl) {
294   CeedInt ierr;
295   CeedInt elemsize, size;
296   CeedElemRestriction Erestrict;
297   CeedEvalMode emode;
298   CeedBasis basis;
299 
300   for (CeedInt i=0; i<numinputfields; i++) {
301     // Skip active input
302     if (skipactive) {
303       CeedVector vec;
304       ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr);
305       if (vec == CEED_VECTOR_ACTIVE)
306         continue;
307     }
308     // Get elemsize, emode, size
309     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
310     CeedChkBackend(ierr);
311     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
312     CeedChkBackend(ierr);
313     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
314     CeedChkBackend(ierr);
315     ierr = CeedQFunctionFieldGetSize(qfinputfields[i], &size); CeedChkBackend(ierr);
316     // Basis action
317     switch (emode) {
318     case CEED_EVAL_NONE:
319       ierr = CeedVectorSetArray(impl->qvecsin[i], CEED_MEM_DEVICE,
320                                 CEED_USE_POINTER, edata[i]); CeedChkBackend(ierr);
321       break;
322     case CEED_EVAL_INTERP:
323       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis);
324       CeedChkBackend(ierr);
325       ierr = CeedBasisApply(basis, numelements, CEED_NOTRANSPOSE,
326                             CEED_EVAL_INTERP, impl->evecs[i],
327                             impl->qvecsin[i]); CeedChkBackend(ierr);
328       break;
329     case CEED_EVAL_GRAD:
330       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis);
331       CeedChkBackend(ierr);
332       ierr = CeedBasisApply(basis, numelements, CEED_NOTRANSPOSE,
333                             CEED_EVAL_GRAD, impl->evecs[i],
334                             impl->qvecsin[i]); CeedChkBackend(ierr);
335       break;
336     case CEED_EVAL_WEIGHT:
337       break; // No action
338     case CEED_EVAL_DIV:
339       break; // TODO: Not implemented
340     case CEED_EVAL_CURL:
341       break; // TODO: Not implemented
342     }
343   }
344   return CEED_ERROR_SUCCESS;
345 }
346 
347 //------------------------------------------------------------------------------
348 // Restore Input Vectors
349 //------------------------------------------------------------------------------
350 static inline int CeedOperatorRestoreInputs_Cuda(CeedInt numinputfields,
351     CeedQFunctionField *qfinputfields, CeedOperatorField *opinputfields,
352     const bool skipactive, CeedScalar *edata[2*CEED_FIELD_MAX],
353     CeedOperator_Cuda *impl) {
354   CeedInt ierr;
355   CeedEvalMode emode;
356   CeedVector vec;
357 
358   for (CeedInt i = 0; i < numinputfields; i++) {
359     // Skip active input
360     if (skipactive) {
361       ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr);
362       if (vec == CEED_VECTOR_ACTIVE)
363         continue;
364     }
365     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
366     CeedChkBackend(ierr);
367     if (emode == CEED_EVAL_WEIGHT) { // Skip
368     } else {
369       if (!impl->evecs[i]) {  // This was a skiprestrict case
370         ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr);
371         ierr = CeedVectorRestoreArrayRead(vec,
372                                           (const CeedScalar **)&edata[i]);
373         CeedChkBackend(ierr);
374       } else {
375         ierr = CeedVectorRestoreArrayRead(impl->evecs[i],
376                                           (const CeedScalar **) &edata[i]);
377         CeedChkBackend(ierr);
378       }
379     }
380   }
381   return CEED_ERROR_SUCCESS;
382 }
383 
384 //------------------------------------------------------------------------------
385 // Apply and add to output
386 //------------------------------------------------------------------------------
387 static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector invec,
388                                      CeedVector outvec, CeedRequest *request) {
389   int ierr;
390   CeedOperator_Cuda *impl;
391   ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr);
392   CeedQFunction qf;
393   ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr);
394   CeedInt Q, numelements, elemsize, numinputfields, numoutputfields, size;
395   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr);
396   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr);
397   CeedChkBackend(ierr);
398   CeedOperatorField *opinputfields, *opoutputfields;
399   ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields,
400                                &numoutputfields, &opoutputfields);
401   CeedChkBackend(ierr);
402   CeedQFunctionField *qfinputfields, *qfoutputfields;
403   ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields);
404   CeedChkBackend(ierr);
405   CeedEvalMode emode;
406   CeedVector vec;
407   CeedBasis basis;
408   CeedElemRestriction Erestrict;
409   CeedScalar *edata[2*CEED_FIELD_MAX] = {0};
410 
411   // Setup
412   ierr = CeedOperatorSetup_Cuda(op); CeedChkBackend(ierr);
413 
414   // Input Evecs and Restriction
415   ierr = CeedOperatorSetupInputs_Cuda(numinputfields, qfinputfields,
416                                       opinputfields, invec, false, edata,
417                                       impl, request); CeedChkBackend(ierr);
418 
419   // Input basis apply if needed
420   ierr = CeedOperatorInputBasis_Cuda(numelements, qfinputfields, opinputfields,
421                                      numinputfields, false, edata, impl);
422   CeedChkBackend(ierr);
423 
424   // Output pointers, as necessary
425   for (CeedInt i = 0; i < numoutputfields; i++) {
426     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
427     CeedChkBackend(ierr);
428     if (emode == CEED_EVAL_NONE) {
429       // Set the output Q-Vector to use the E-Vector data directly.
430       ierr = CeedVectorGetArrayWrite(impl->evecs[i + impl->numein], CEED_MEM_DEVICE,
431                                      &edata[i + numinputfields]); CeedChkBackend(ierr);
432       ierr = CeedVectorSetArray(impl->qvecsout[i], CEED_MEM_DEVICE,
433                                 CEED_USE_POINTER, edata[i + numinputfields]);
434       CeedChkBackend(ierr);
435     }
436   }
437 
438   // Q function
439   ierr = CeedQFunctionApply(qf, numelements * Q, impl->qvecsin, impl->qvecsout);
440   CeedChkBackend(ierr);
441 
442   // Output basis apply if needed
443   for (CeedInt i = 0; i < numoutputfields; i++) {
444     // Get elemsize, emode, size
445     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
446     CeedChkBackend(ierr);
447     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
448     CeedChkBackend(ierr);
449     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
450     CeedChkBackend(ierr);
451     ierr = CeedQFunctionFieldGetSize(qfoutputfields[i], &size);
452     CeedChkBackend(ierr);
453     // Basis action
454     switch (emode) {
455     case CEED_EVAL_NONE:
456       break;
457     case CEED_EVAL_INTERP:
458       ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis);
459       CeedChkBackend(ierr);
460       ierr = CeedBasisApply(basis, numelements, CEED_TRANSPOSE,
461                             CEED_EVAL_INTERP, impl->qvecsout[i],
462                             impl->evecs[i + impl->numein]); CeedChkBackend(ierr);
463       break;
464     case CEED_EVAL_GRAD:
465       ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis);
466       CeedChkBackend(ierr);
467       ierr = CeedBasisApply(basis, numelements, CEED_TRANSPOSE,
468                             CEED_EVAL_GRAD, impl->qvecsout[i],
469                             impl->evecs[i + impl->numein]); CeedChkBackend(ierr);
470       break;
471     // LCOV_EXCL_START
472     case CEED_EVAL_WEIGHT: {
473       Ceed ceed;
474       ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
475       return CeedError(ceed, CEED_ERROR_BACKEND,
476                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
477       break; // Should not occur
478     }
479     case CEED_EVAL_DIV:
480       break; // TODO: Not implemented
481     case CEED_EVAL_CURL:
482       break; // TODO: Not implemented
483       // LCOV_EXCL_STOP
484     }
485   }
486 
487   // Output restriction
488   for (CeedInt i = 0; i < numoutputfields; i++) {
489     // Restore evec
490     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
491     CeedChkBackend(ierr);
492     if (emode == CEED_EVAL_NONE) {
493       ierr = CeedVectorRestoreArray(impl->evecs[i+impl->numein],
494                                     &edata[i + numinputfields]);
495       CeedChkBackend(ierr);
496     }
497     // Get output vector
498     ierr = CeedOperatorFieldGetVector(opoutputfields[i], &vec);
499     CeedChkBackend(ierr);
500     // Restrict
501     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
502     CeedChkBackend(ierr);
503     // Active
504     if (vec == CEED_VECTOR_ACTIVE)
505       vec = outvec;
506 
507     ierr = CeedElemRestrictionApply(Erestrict, CEED_TRANSPOSE,
508                                     impl->evecs[i + impl->numein], vec,
509                                     request); CeedChkBackend(ierr);
510   }
511 
512   // Restore input arrays
513   ierr = CeedOperatorRestoreInputs_Cuda(numinputfields, qfinputfields,
514                                         opinputfields, false, edata, impl);
515   CeedChkBackend(ierr);
516   return CEED_ERROR_SUCCESS;
517 }
518 
519 //------------------------------------------------------------------------------
520 // Core code for assembling linear QFunction
521 //------------------------------------------------------------------------------
522 static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op,
523     bool build_objects, CeedVector *assembled, CeedElemRestriction *rstr,
524     CeedRequest *request) {
525   int ierr;
526   CeedOperator_Cuda *impl;
527   ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr);
528   CeedQFunction qf;
529   ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr);
530   CeedInt Q, numelements, numinputfields, numoutputfields, size;
531   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr);
532   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr);
533   CeedChkBackend(ierr);
534   CeedOperatorField *opinputfields, *opoutputfields;
535   ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields,
536                                &numoutputfields, &opoutputfields);
537   CeedChkBackend(ierr);
538   CeedQFunctionField *qfinputfields, *qfoutputfields;
539   ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields);
540   CeedChkBackend(ierr);
541   CeedVector vec;
542   CeedInt numactivein = impl->qfnumactivein, numactiveout = impl->qfnumactiveout;
543   CeedVector *activein = impl->qfactivein;
544   CeedScalar *a, *tmp;
545   Ceed ceed, ceedparent;
546   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
547   ierr = CeedGetOperatorFallbackParentCeed(ceed, &ceedparent);
548   CeedChkBackend(ierr);
549   ceedparent = ceedparent ? ceedparent : ceed;
550   CeedScalar *edata[2*CEED_FIELD_MAX];
551 
552   // Setup
553   ierr = CeedOperatorSetup_Cuda(op); CeedChkBackend(ierr);
554 
555   // Check for identity
556   bool identityqf;
557   ierr = CeedQFunctionIsIdentity(qf, &identityqf); CeedChkBackend(ierr);
558   if (identityqf)
559     // LCOV_EXCL_START
560     return CeedError(ceed, CEED_ERROR_BACKEND,
561                      "Assembling identity QFunctions not supported");
562   // LCOV_EXCL_STOP
563 
564   // Input Evecs and Restriction
565   ierr = CeedOperatorSetupInputs_Cuda(numinputfields, qfinputfields,
566                                       opinputfields, NULL, true, edata,
567                                       impl, request); CeedChkBackend(ierr);
568 
569   // Count number of active input fields
570   if (!numactivein) {
571     for (CeedInt i=0; i<numinputfields; i++) {
572       // Get input vector
573       ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr);
574       // Check if active input
575       if (vec == CEED_VECTOR_ACTIVE) {
576         ierr = CeedQFunctionFieldGetSize(qfinputfields[i], &size); CeedChkBackend(ierr);
577         ierr = CeedVectorSetValue(impl->qvecsin[i], 0.0); CeedChkBackend(ierr);
578         ierr = CeedVectorGetArray(impl->qvecsin[i], CEED_MEM_DEVICE, &tmp);
579         CeedChkBackend(ierr);
580         ierr = CeedRealloc(numactivein + size, &activein); CeedChkBackend(ierr);
581         for (CeedInt field = 0; field < size; field++) {
582           ierr = CeedVectorCreate(ceed, Q*numelements,
583                                   &activein[numactivein+field]); CeedChkBackend(ierr);
584           ierr = CeedVectorSetArray(activein[numactivein+field], CEED_MEM_DEVICE,
585                                     CEED_USE_POINTER, &tmp[field*Q*numelements]);
586           CeedChkBackend(ierr);
587         }
588         numactivein += size;
589         ierr = CeedVectorRestoreArray(impl->qvecsin[i], &tmp); CeedChkBackend(ierr);
590       }
591     }
592     impl->qfnumactivein = numactivein;
593     impl->qfactivein = activein;
594   }
595 
596   // Count number of active output fields
597   if (!numactiveout) {
598     for (CeedInt i=0; i<numoutputfields; i++) {
599       // Get output vector
600       ierr = CeedOperatorFieldGetVector(opoutputfields[i], &vec);
601       CeedChkBackend(ierr);
602       // Check if active output
603       if (vec == CEED_VECTOR_ACTIVE) {
604         ierr = CeedQFunctionFieldGetSize(qfoutputfields[i], &size);
605         CeedChkBackend(ierr);
606         numactiveout += size;
607       }
608     }
609     impl->qfnumactiveout = numactiveout;
610   }
611 
612   // Check sizes
613   if (!numactivein || !numactiveout)
614     // LCOV_EXCL_START
615     return CeedError(ceed, CEED_ERROR_BACKEND,
616                      "Cannot assemble QFunction without active inputs "
617                      "and outputs");
618   // LCOV_EXCL_STOP
619 
620   // Build objects if needed
621   if (build_objects) {
622     // Create output restriction
623     CeedInt strides[3] = {1, numelements*Q, Q}; /* *NOPAD* */
624     ierr = CeedElemRestrictionCreateStrided(ceedparent, numelements, Q,
625                                             numactivein*numactiveout,
626                                             numactivein*numactiveout*numelements*Q,
627                                             strides, rstr); CeedChkBackend(ierr);
628     // Create assembled vector
629     ierr = CeedVectorCreate(ceedparent, numelements*Q*numactivein*numactiveout,
630                             assembled); CeedChkBackend(ierr);
631   }
632   ierr = CeedVectorSetValue(*assembled, 0.0); CeedChkBackend(ierr);
633   ierr = CeedVectorGetArray(*assembled, CEED_MEM_DEVICE, &a);
634   CeedChkBackend(ierr);
635 
636   // Input basis apply
637   ierr = CeedOperatorInputBasis_Cuda(numelements, qfinputfields, opinputfields,
638                                      numinputfields, true, edata, impl);
639   CeedChkBackend(ierr);
640 
641   // Assemble QFunction
642   for (CeedInt in=0; in<numactivein; in++) {
643     // Set Inputs
644     ierr = CeedVectorSetValue(activein[in], 1.0); CeedChkBackend(ierr);
645     if (numactivein > 1) {
646       ierr = CeedVectorSetValue(activein[(in+numactivein-1)%numactivein],
647                                 0.0); CeedChkBackend(ierr);
648     }
649     // Set Outputs
650     for (CeedInt out=0; out<numoutputfields; out++) {
651       // Get output vector
652       ierr = CeedOperatorFieldGetVector(opoutputfields[out], &vec);
653       CeedChkBackend(ierr);
654       // Check if active output
655       if (vec == CEED_VECTOR_ACTIVE) {
656         CeedVectorSetArray(impl->qvecsout[out], CEED_MEM_DEVICE,
657                            CEED_USE_POINTER, a); CeedChkBackend(ierr);
658         ierr = CeedQFunctionFieldGetSize(qfoutputfields[out], &size);
659         CeedChkBackend(ierr);
660         a += size*Q*numelements; // Advance the pointer by the size of the output
661       }
662     }
663     // Apply QFunction
664     ierr = CeedQFunctionApply(qf, Q*numelements, impl->qvecsin, impl->qvecsout);
665     CeedChkBackend(ierr);
666   }
667 
668   // Un-set output Qvecs to prevent accidental overwrite of Assembled
669   for (CeedInt out=0; out<numoutputfields; out++) {
670     // Get output vector
671     ierr = CeedOperatorFieldGetVector(opoutputfields[out], &vec);
672     CeedChkBackend(ierr);
673     // Check if active output
674     if (vec == CEED_VECTOR_ACTIVE) {
675       ierr = CeedVectorTakeArray(impl->qvecsout[out], CEED_MEM_DEVICE, NULL);
676       CeedChkBackend(ierr);
677     }
678   }
679 
680   // Restore input arrays
681   ierr = CeedOperatorRestoreInputs_Cuda(numinputfields, qfinputfields,
682                                         opinputfields, true, edata, impl);
683   CeedChkBackend(ierr);
684 
685   // Restore output
686   ierr = CeedVectorRestoreArray(*assembled, &a); CeedChkBackend(ierr);
687 
688   return CEED_ERROR_SUCCESS;
689 }
690 
691 //------------------------------------------------------------------------------
692 // Assemble Linear QFunction
693 //------------------------------------------------------------------------------
694 static int CeedOperatorLinearAssembleQFunction_Cuda(CeedOperator op,
695     CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) {
696   return CeedOperatorLinearAssembleQFunctionCore_Cuda(op, true, assembled, rstr,
697          request);
698 }
699 
700 //------------------------------------------------------------------------------
701 // Update Assembled Linear QFunction
702 //------------------------------------------------------------------------------
703 static int CeedOperatorLinearAssembleQFunctionUpdate_Cuda(CeedOperator op,
704     CeedVector assembled, CeedElemRestriction rstr, CeedRequest *request) {
705   return CeedOperatorLinearAssembleQFunctionCore_Cuda(op, false, &assembled,
706          &rstr, request);
707 }
708 
709 //------------------------------------------------------------------------------
710 // Diagonal assembly kernels
711 //------------------------------------------------------------------------------
712 // *INDENT-OFF*
713 static const char *diagonalkernels = QUOTE(
714 
715 typedef enum {
716   /// Perform no evaluation (either because there is no data or it is already at
717   /// quadrature points)
718   CEED_EVAL_NONE   = 0,
719   /// Interpolate from nodes to quadrature points
720   CEED_EVAL_INTERP = 1,
721   /// Evaluate gradients at quadrature points from input in a nodal basis
722   CEED_EVAL_GRAD   = 2,
723   /// Evaluate divergence at quadrature points from input in a nodal basis
724   CEED_EVAL_DIV    = 4,
725   /// Evaluate curl at quadrature points from input in a nodal basis
726   CEED_EVAL_CURL   = 8,
727   /// Using no input, evaluate quadrature weights on the reference element
728   CEED_EVAL_WEIGHT = 16,
729 } CeedEvalMode;
730 
731 //------------------------------------------------------------------------------
732 // Get Basis Emode Pointer
733 //------------------------------------------------------------------------------
734 extern "C" __device__ void CeedOperatorGetBasisPointer_Cuda(const CeedScalar **basisptr,
735     CeedEvalMode emode, const CeedScalar *identity, const CeedScalar *interp,
736     const CeedScalar *grad) {
737   switch (emode) {
738   case CEED_EVAL_NONE:
739     *basisptr = identity;
740     break;
741   case CEED_EVAL_INTERP:
742     *basisptr = interp;
743     break;
744   case CEED_EVAL_GRAD:
745     *basisptr = grad;
746     break;
747   case CEED_EVAL_WEIGHT:
748   case CEED_EVAL_DIV:
749   case CEED_EVAL_CURL:
750     break; // Caught by QF Assembly
751   }
752 }
753 
754 //------------------------------------------------------------------------------
755 // Core code for diagonal assembly
756 //------------------------------------------------------------------------------
757 __device__ void diagonalCore(const CeedInt nelem,
758     const CeedScalar maxnorm, const bool pointBlock,
759     const CeedScalar *identity,
760     const CeedScalar *interpin, const CeedScalar *gradin,
761     const CeedScalar *interpout, const CeedScalar *gradout,
762     const CeedEvalMode *emodein, const CeedEvalMode *emodeout,
763     const CeedScalar *__restrict__ assembledqfarray,
764     CeedScalar *__restrict__ elemdiagarray) {
765   const int tid = threadIdx.x; // running with P threads, tid is evec node
766   const CeedScalar qfvaluebound = maxnorm*1e-12;
767 
768   // Compute the diagonal of B^T D B
769   // Each element
770   for (CeedInt e = blockIdx.x*blockDim.z + threadIdx.z; e < nelem;
771        e += gridDim.x*blockDim.z) {
772     CeedInt dout = -1;
773     // Each basis eval mode pair
774     for (CeedInt eout = 0; eout < NUMEMODEOUT; eout++) {
775       const CeedScalar *bt = NULL;
776       if (emodeout[eout] == CEED_EVAL_GRAD)
777         dout += 1;
778       CeedOperatorGetBasisPointer_Cuda(&bt, emodeout[eout], identity, interpout,
779                                       &gradout[dout*NQPTS*NNODES]);
780       CeedInt din = -1;
781       for (CeedInt ein = 0; ein < NUMEMODEIN; ein++) {
782         const CeedScalar *b = NULL;
783         if (emodein[ein] == CEED_EVAL_GRAD)
784           din += 1;
785         CeedOperatorGetBasisPointer_Cuda(&b, emodein[ein], identity, interpin,
786                                         &gradin[din*NQPTS*NNODES]);
787         // Each component
788         for (CeedInt compOut = 0; compOut < NCOMP; compOut++) {
789           // Each qpoint/node pair
790           if (pointBlock) {
791             // Point Block Diagonal
792             for (CeedInt compIn = 0; compIn < NCOMP; compIn++) {
793               CeedScalar evalue = 0.;
794               for (CeedInt q = 0; q < NQPTS; q++) {
795                 const CeedScalar qfvalue =
796                   assembledqfarray[((((ein*NCOMP+compIn)*NUMEMODEOUT+eout)*
797                                      NCOMP+compOut)*nelem+e)*NQPTS+q];
798                 if (abs(qfvalue) > qfvaluebound)
799                   evalue += bt[q*NNODES+tid] * qfvalue * b[q*NNODES+tid];
800               }
801               elemdiagarray[((compOut*NCOMP+compIn)*nelem+e)*NNODES+tid] += evalue;
802             }
803           } else {
804             // Diagonal Only
805             CeedScalar evalue = 0.;
806             for (CeedInt q = 0; q < NQPTS; q++) {
807               const CeedScalar qfvalue =
808                 assembledqfarray[((((ein*NCOMP+compOut)*NUMEMODEOUT+eout)*
809                                    NCOMP+compOut)*nelem+e)*NQPTS+q];
810               if (abs(qfvalue) > qfvaluebound)
811                 evalue += bt[q*NNODES+tid] * qfvalue * b[q*NNODES+tid];
812             }
813             elemdiagarray[(compOut*nelem+e)*NNODES+tid] += evalue;
814           }
815         }
816       }
817     }
818   }
819 }
820 
821 //------------------------------------------------------------------------------
822 // Linear diagonal
823 //------------------------------------------------------------------------------
824 extern "C" __global__ void linearDiagonal(const CeedInt nelem,
825     const CeedScalar maxnorm, const CeedScalar *identity,
826     const CeedScalar *interpin, const CeedScalar *gradin,
827     const CeedScalar *interpout, const CeedScalar *gradout,
828     const CeedEvalMode *emodein, const CeedEvalMode *emodeout,
829     const CeedScalar *__restrict__ assembledqfarray,
830     CeedScalar *__restrict__ elemdiagarray) {
831   diagonalCore(nelem, maxnorm, false, identity, interpin, gradin, interpout,
832                gradout, emodein, emodeout, assembledqfarray, elemdiagarray);
833 }
834 
835 //------------------------------------------------------------------------------
836 // Linear point block diagonal
837 //------------------------------------------------------------------------------
838 extern "C" __global__ void linearPointBlockDiagonal(const CeedInt nelem,
839     const CeedScalar maxnorm, const CeedScalar *identity,
840     const CeedScalar *interpin, const CeedScalar *gradin,
841     const CeedScalar *interpout, const CeedScalar *gradout,
842     const CeedEvalMode *emodein, const CeedEvalMode *emodeout,
843     const CeedScalar *__restrict__ assembledqfarray,
844     CeedScalar *__restrict__ elemdiagarray) {
845   diagonalCore(nelem, maxnorm, true, identity, interpin, gradin, interpout,
846                gradout, emodein, emodeout, assembledqfarray, elemdiagarray);
847 }
848 
849 );
850 // *INDENT-ON*
851 
852 //------------------------------------------------------------------------------
853 // Create point block restriction
854 //------------------------------------------------------------------------------
855 static int CreatePBRestriction(CeedElemRestriction rstr,
856                                CeedElemRestriction *pbRstr) {
857   int ierr;
858   Ceed ceed;
859   ierr = CeedElemRestrictionGetCeed(rstr, &ceed); CeedChkBackend(ierr);
860   const CeedInt *offsets;
861   ierr = CeedElemRestrictionGetOffsets(rstr, CEED_MEM_HOST, &offsets);
862   CeedChkBackend(ierr);
863 
864   // Expand offsets
865   CeedInt nelem, ncomp, elemsize, compstride, max = 1, *pbOffsets;
866   ierr = CeedElemRestrictionGetNumElements(rstr, &nelem); CeedChkBackend(ierr);
867   ierr = CeedElemRestrictionGetNumComponents(rstr, &ncomp); CeedChkBackend(ierr);
868   ierr = CeedElemRestrictionGetElementSize(rstr, &elemsize); CeedChkBackend(ierr);
869   ierr = CeedElemRestrictionGetCompStride(rstr, &compstride);
870   CeedChkBackend(ierr);
871   CeedInt shift = ncomp;
872   if (compstride != 1)
873     shift *= ncomp;
874   ierr = CeedCalloc(nelem*elemsize, &pbOffsets); CeedChkBackend(ierr);
875   for (CeedInt i = 0; i < nelem*elemsize; i++) {
876     pbOffsets[i] = offsets[i]*shift;
877     if (pbOffsets[i] > max)
878       max = pbOffsets[i];
879   }
880 
881   // Create new restriction
882   ierr = CeedElemRestrictionCreate(ceed, nelem, elemsize, ncomp*ncomp, 1,
883                                    max + ncomp*ncomp, CEED_MEM_HOST,
884                                    CEED_OWN_POINTER, pbOffsets, pbRstr);
885   CeedChkBackend(ierr);
886 
887   // Cleanup
888   ierr = CeedElemRestrictionRestoreOffsets(rstr, &offsets); CeedChkBackend(ierr);
889 
890   return CEED_ERROR_SUCCESS;
891 }
892 
893 //------------------------------------------------------------------------------
894 // Assemble diagonal setup
895 //------------------------------------------------------------------------------
896 static inline int CeedOperatorAssembleDiagonalSetup_Cuda(CeedOperator op,
897     const bool pointBlock) {
898   int ierr;
899   Ceed ceed;
900   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
901   CeedQFunction qf;
902   ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr);
903   CeedInt numinputfields, numoutputfields;
904   ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields);
905   CeedChkBackend(ierr);
906 
907   // Determine active input basis
908   CeedOperatorField *opfields;
909   CeedQFunctionField *qffields;
910   ierr = CeedOperatorGetFields(op, NULL, &opfields, NULL, NULL);
911   CeedChkBackend(ierr);
912   ierr = CeedQFunctionGetFields(qf, NULL, &qffields, NULL, NULL);
913   CeedChkBackend(ierr);
914   CeedInt numemodein = 0, ncomp = 0, dim = 1;
915   CeedEvalMode *emodein = NULL;
916   CeedBasis basisin = NULL;
917   CeedElemRestriction rstrin = NULL;
918   for (CeedInt i = 0; i < numinputfields; i++) {
919     CeedVector vec;
920     ierr = CeedOperatorFieldGetVector(opfields[i], &vec); CeedChkBackend(ierr);
921     if (vec == CEED_VECTOR_ACTIVE) {
922       CeedElemRestriction rstr;
923       ierr = CeedOperatorFieldGetBasis(opfields[i], &basisin); CeedChkBackend(ierr);
924       ierr = CeedBasisGetNumComponents(basisin, &ncomp); CeedChkBackend(ierr);
925       ierr = CeedBasisGetDimension(basisin, &dim); CeedChkBackend(ierr);
926       ierr = CeedOperatorFieldGetElemRestriction(opfields[i], &rstr);
927       CeedChkBackend(ierr);
928       if (rstrin && rstrin != rstr)
929         // LCOV_EXCL_START
930         return CeedError(ceed, CEED_ERROR_BACKEND,
931                          "Multi-field non-composite operator diagonal assembly not supported");
932       // LCOV_EXCL_STOP
933       rstrin = rstr;
934       CeedEvalMode emode;
935       ierr = CeedQFunctionFieldGetEvalMode(qffields[i], &emode);
936       CeedChkBackend(ierr);
937       switch (emode) {
938       case CEED_EVAL_NONE:
939       case CEED_EVAL_INTERP:
940         ierr = CeedRealloc(numemodein + 1, &emodein); CeedChkBackend(ierr);
941         emodein[numemodein] = emode;
942         numemodein += 1;
943         break;
944       case CEED_EVAL_GRAD:
945         ierr = CeedRealloc(numemodein + dim, &emodein); CeedChkBackend(ierr);
946         for (CeedInt d = 0; d < dim; d++)
947           emodein[numemodein+d] = emode;
948         numemodein += dim;
949         break;
950       case CEED_EVAL_WEIGHT:
951       case CEED_EVAL_DIV:
952       case CEED_EVAL_CURL:
953         break; // Caught by QF Assembly
954       }
955     }
956   }
957 
958   // Determine active output basis
959   ierr = CeedOperatorGetFields(op, NULL, NULL, NULL, &opfields);
960   CeedChkBackend(ierr);
961   ierr = CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qffields);
962   CeedChkBackend(ierr);
963   CeedInt numemodeout = 0;
964   CeedEvalMode *emodeout = NULL;
965   CeedBasis basisout = NULL;
966   CeedElemRestriction rstrout = NULL;
967   for (CeedInt i = 0; i < numoutputfields; i++) {
968     CeedVector vec;
969     ierr = CeedOperatorFieldGetVector(opfields[i], &vec); CeedChkBackend(ierr);
970     if (vec == CEED_VECTOR_ACTIVE) {
971       CeedElemRestriction rstr;
972       ierr = CeedOperatorFieldGetBasis(opfields[i], &basisout); CeedChkBackend(ierr);
973       ierr = CeedOperatorFieldGetElemRestriction(opfields[i], &rstr);
974       CeedChkBackend(ierr);
975       if (rstrout && rstrout != rstr)
976         // LCOV_EXCL_START
977         return CeedError(ceed, CEED_ERROR_BACKEND,
978                          "Multi-field non-composite operator diagonal assembly not supported");
979       // LCOV_EXCL_STOP
980       rstrout = rstr;
981       CeedEvalMode emode;
982       ierr = CeedQFunctionFieldGetEvalMode(qffields[i], &emode); CeedChkBackend(ierr);
983       switch (emode) {
984       case CEED_EVAL_NONE:
985       case CEED_EVAL_INTERP:
986         ierr = CeedRealloc(numemodeout + 1, &emodeout); CeedChkBackend(ierr);
987         emodeout[numemodeout] = emode;
988         numemodeout += 1;
989         break;
990       case CEED_EVAL_GRAD:
991         ierr = CeedRealloc(numemodeout + dim, &emodeout); CeedChkBackend(ierr);
992         for (CeedInt d = 0; d < dim; d++)
993           emodeout[numemodeout+d] = emode;
994         numemodeout += dim;
995         break;
996       case CEED_EVAL_WEIGHT:
997       case CEED_EVAL_DIV:
998       case CEED_EVAL_CURL:
999         break; // Caught by QF Assembly
1000       }
1001     }
1002   }
1003 
1004   // Operator data struct
1005   CeedOperator_Cuda *impl;
1006   ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr);
1007   ierr = CeedCalloc(1, &impl->diag); CeedChkBackend(ierr);
1008   CeedOperatorDiag_Cuda *diag = impl->diag;
1009   diag->basisin = basisin;
1010   diag->basisout = basisout;
1011   diag->h_emodein = emodein;
1012   diag->h_emodeout = emodeout;
1013   diag->numemodein = numemodein;
1014   diag->numemodeout = numemodeout;
1015 
1016   // Assemble kernel
1017   CeedInt nnodes, nqpts;
1018   ierr = CeedBasisGetNumNodes(basisin, &nnodes); CeedChkBackend(ierr);
1019   ierr = CeedBasisGetNumQuadraturePoints(basisin, &nqpts); CeedChkBackend(ierr);
1020   diag->nnodes = nnodes;
1021   ierr = CeedCompileCuda(ceed, diagonalkernels, &diag->module, 5,
1022                          "NUMEMODEIN", numemodein,
1023                          "NUMEMODEOUT", numemodeout,
1024                          "NNODES", nnodes,
1025                          "NQPTS", nqpts,
1026                          "NCOMP", ncomp
1027                         ); CeedChk_Cu(ceed, ierr);
1028   ierr = CeedGetKernelCuda(ceed, diag->module, "linearDiagonal",
1029                            &diag->linearDiagonal); CeedChk_Cu(ceed, ierr);
1030   ierr = CeedGetKernelCuda(ceed, diag->module, "linearPointBlockDiagonal",
1031                            &diag->linearPointBlock);
1032   CeedChk_Cu(ceed, ierr);
1033 
1034   // Basis matrices
1035   const CeedInt qBytes = nqpts * sizeof(CeedScalar);
1036   const CeedInt iBytes = qBytes * nnodes;
1037   const CeedInt gBytes = qBytes * nnodes * dim;
1038   const CeedInt eBytes = sizeof(CeedEvalMode);
1039   const CeedScalar *interpin, *interpout, *gradin, *gradout;
1040 
1041   // CEED_EVAL_NONE
1042   CeedScalar *identity = NULL;
1043   bool evalNone = false;
1044   for (CeedInt i=0; i<numemodein; i++)
1045     evalNone = evalNone || (emodein[i] == CEED_EVAL_NONE);
1046   for (CeedInt i=0; i<numemodeout; i++)
1047     evalNone = evalNone || (emodeout[i] == CEED_EVAL_NONE);
1048   if (evalNone) {
1049     ierr = CeedCalloc(nqpts*nnodes, &identity); CeedChkBackend(ierr);
1050     for (CeedInt i=0; i<(nnodes<nqpts?nnodes:nqpts); i++)
1051       identity[i*nnodes+i] = 1.0;
1052     ierr = cudaMalloc((void **)&diag->d_identity, iBytes); CeedChk_Cu(ceed, ierr);
1053     ierr = cudaMemcpy(diag->d_identity, identity, iBytes,
1054                       cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
1055   }
1056 
1057   // CEED_EVAL_INTERP
1058   ierr = CeedBasisGetInterp(basisin, &interpin); CeedChkBackend(ierr);
1059   ierr = cudaMalloc((void **)&diag->d_interpin, iBytes); CeedChk_Cu(ceed, ierr);
1060   ierr = cudaMemcpy(diag->d_interpin, interpin, iBytes,
1061                     cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
1062   ierr = CeedBasisGetInterp(basisout, &interpout); CeedChkBackend(ierr);
1063   ierr = cudaMalloc((void **)&diag->d_interpout, iBytes); CeedChk_Cu(ceed, ierr);
1064   ierr = cudaMemcpy(diag->d_interpout, interpout, iBytes,
1065                     cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
1066 
1067   // CEED_EVAL_GRAD
1068   ierr = CeedBasisGetGrad(basisin, &gradin); CeedChkBackend(ierr);
1069   ierr = cudaMalloc((void **)&diag->d_gradin, gBytes); CeedChk_Cu(ceed, ierr);
1070   ierr = cudaMemcpy(diag->d_gradin, gradin, gBytes,
1071                     cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
1072   ierr = CeedBasisGetGrad(basisout, &gradout); CeedChkBackend(ierr);
1073   ierr = cudaMalloc((void **)&diag->d_gradout, gBytes); CeedChk_Cu(ceed, ierr);
1074   ierr = cudaMemcpy(diag->d_gradout, gradout, gBytes,
1075                     cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
1076 
1077   // Arrays of emodes
1078   ierr = cudaMalloc((void **)&diag->d_emodein, numemodein * eBytes);
1079   CeedChk_Cu(ceed, ierr);
1080   ierr = cudaMemcpy(diag->d_emodein, emodein, numemodein * eBytes,
1081                     cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
1082   ierr = cudaMalloc((void **)&diag->d_emodeout, numemodeout * eBytes);
1083   CeedChk_Cu(ceed, ierr);
1084   ierr = cudaMemcpy(diag->d_emodeout, emodeout, numemodeout * eBytes,
1085                     cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
1086 
1087   // Restriction
1088   diag->diagrstr = rstrout;
1089 
1090   return CEED_ERROR_SUCCESS;
1091 }
1092 
1093 //------------------------------------------------------------------------------
1094 // Assemble diagonal common code
1095 //------------------------------------------------------------------------------
1096 static inline int CeedOperatorAssembleDiagonalCore_Cuda(CeedOperator op,
1097     CeedVector assembled, CeedRequest *request, const bool pointBlock) {
1098   int ierr;
1099   Ceed ceed;
1100   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
1101   CeedOperator_Cuda *impl;
1102   ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr);
1103 
1104   // Assemble QFunction
1105   CeedVector assembledqf;
1106   CeedElemRestriction rstr;
1107   ierr = CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembledqf,
1108          &rstr, request); CeedChkBackend(ierr);
1109   ierr = CeedElemRestrictionDestroy(&rstr); CeedChkBackend(ierr);
1110   CeedScalar maxnorm = 0;
1111   ierr = CeedVectorNorm(assembledqf, CEED_NORM_MAX, &maxnorm);
1112   CeedChkBackend(ierr);
1113 
1114   // Setup
1115   if (!impl->diag) {
1116     ierr = CeedOperatorAssembleDiagonalSetup_Cuda(op, pointBlock);
1117     CeedChkBackend(ierr);
1118   }
1119   CeedOperatorDiag_Cuda *diag = impl->diag;
1120   assert(diag != NULL);
1121 
1122   // Restriction
1123   if (pointBlock && !diag->pbdiagrstr) {
1124     CeedElemRestriction pbdiagrstr;
1125     ierr = CreatePBRestriction(diag->diagrstr, &pbdiagrstr); CeedChkBackend(ierr);
1126     diag->pbdiagrstr = pbdiagrstr;
1127   }
1128   CeedElemRestriction diagrstr = pointBlock ? diag->pbdiagrstr : diag->diagrstr;
1129 
1130   // Create diagonal vector
1131   CeedVector elemdiag = pointBlock ? diag->pbelemdiag : diag->elemdiag;
1132   if (!elemdiag) {
1133     ierr = CeedElemRestrictionCreateVector(diagrstr, NULL, &elemdiag);
1134     CeedChkBackend(ierr);
1135     if (pointBlock)
1136       diag->pbelemdiag = elemdiag;
1137     else
1138       diag->elemdiag = elemdiag;
1139   }
1140   ierr = CeedVectorSetValue(elemdiag, 0.0); CeedChkBackend(ierr);
1141 
1142   // Assemble element operator diagonals
1143   CeedScalar *elemdiagarray;
1144   const CeedScalar *assembledqfarray;
1145   ierr = CeedVectorGetArray(elemdiag, CEED_MEM_DEVICE, &elemdiagarray);
1146   CeedChkBackend(ierr);
1147   ierr = CeedVectorGetArrayRead(assembledqf, CEED_MEM_DEVICE, &assembledqfarray);
1148   CeedChkBackend(ierr);
1149   CeedInt nelem;
1150   ierr = CeedElemRestrictionGetNumElements(diagrstr, &nelem);
1151   CeedChkBackend(ierr);
1152 
1153   // Compute the diagonal of B^T D B
1154   int elemsPerBlock = 1;
1155   int grid = nelem/elemsPerBlock+((nelem/elemsPerBlock*elemsPerBlock<nelem)?1:0);
1156   void *args[] = {(void *) &nelem, (void *) &maxnorm, &diag->d_identity,
1157                   &diag->d_interpin, &diag->d_gradin, &diag->d_interpout,
1158                   &diag->d_gradout, &diag->d_emodein, &diag->d_emodeout,
1159                   &assembledqfarray, &elemdiagarray
1160                  };
1161   if (pointBlock) {
1162     ierr = CeedRunKernelDimCuda(ceed, diag->linearPointBlock, grid,
1163                                 diag->nnodes, 1, elemsPerBlock, args);
1164     CeedChkBackend(ierr);
1165   } else {
1166     ierr = CeedRunKernelDimCuda(ceed, diag->linearDiagonal, grid,
1167                                 diag->nnodes, 1, elemsPerBlock, args);
1168     CeedChkBackend(ierr);
1169   }
1170 
1171   // Restore arrays
1172   ierr = CeedVectorRestoreArray(elemdiag, &elemdiagarray); CeedChkBackend(ierr);
1173   ierr = CeedVectorRestoreArrayRead(assembledqf, &assembledqfarray);
1174   CeedChkBackend(ierr);
1175 
1176   // Assemble local operator diagonal
1177   ierr = CeedElemRestrictionApply(diagrstr, CEED_TRANSPOSE, elemdiag,
1178                                   assembled, request); CeedChkBackend(ierr);
1179 
1180   // Cleanup
1181   ierr = CeedVectorDestroy(&assembledqf); CeedChkBackend(ierr);
1182 
1183   return CEED_ERROR_SUCCESS;
1184 }
1185 
1186 //------------------------------------------------------------------------------
1187 // Assemble composite diagonal common code
1188 //------------------------------------------------------------------------------
1189 static inline int CeedOperatorLinearAssembleAddDiagonalCompositeCore_Cuda(
1190   CeedOperator op, CeedVector assembled, CeedRequest *request,
1191   const bool pointBlock) {
1192   int ierr;
1193   CeedInt numSub;
1194   CeedOperator *subOperators;
1195   ierr = CeedOperatorGetNumSub(op, &numSub); CeedChkBackend(ierr);
1196   ierr = CeedOperatorGetSubList(op, &subOperators); CeedChkBackend(ierr);
1197   for (CeedInt i = 0; i < numSub; i++) {
1198     ierr = CeedOperatorAssembleDiagonalCore_Cuda(subOperators[i], assembled,
1199            request, pointBlock); CeedChkBackend(ierr);
1200   }
1201   return CEED_ERROR_SUCCESS;
1202 }
1203 
1204 //------------------------------------------------------------------------------
1205 // Assemble Linear Diagonal
1206 //------------------------------------------------------------------------------
1207 static int CeedOperatorLinearAssembleAddDiagonal_Cuda(CeedOperator op,
1208     CeedVector assembled, CeedRequest *request) {
1209   int ierr;
1210   bool isComposite;
1211   ierr = CeedOperatorIsComposite(op, &isComposite); CeedChkBackend(ierr);
1212   if (isComposite) {
1213     return CeedOperatorLinearAssembleAddDiagonalCompositeCore_Cuda(op, assembled,
1214            request, false);
1215   } else {
1216     return CeedOperatorAssembleDiagonalCore_Cuda(op, assembled, request, false);
1217   }
1218 }
1219 
1220 //------------------------------------------------------------------------------
1221 // Assemble Linear Point Block Diagonal
1222 //------------------------------------------------------------------------------
1223 static int CeedOperatorLinearAssembleAddPointBlockDiagonal_Cuda(CeedOperator op,
1224     CeedVector assembled, CeedRequest *request) {
1225   int ierr;
1226   bool isComposite;
1227   ierr = CeedOperatorIsComposite(op, &isComposite); CeedChkBackend(ierr);
1228   if (isComposite) {
1229     return CeedOperatorLinearAssembleAddDiagonalCompositeCore_Cuda(op, assembled,
1230            request, true);
1231   } else {
1232     return CeedOperatorAssembleDiagonalCore_Cuda(op, assembled, request, true);
1233   }
1234 }
1235 
1236 //------------------------------------------------------------------------------
1237 // Create operator
1238 //------------------------------------------------------------------------------
1239 int CeedOperatorCreate_Cuda(CeedOperator op) {
1240   int ierr;
1241   Ceed ceed;
1242   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
1243   CeedOperator_Cuda *impl;
1244 
1245   ierr = CeedCalloc(1, &impl); CeedChkBackend(ierr);
1246   ierr = CeedOperatorSetData(op, impl); CeedChkBackend(ierr);
1247 
1248   ierr = CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction",
1249                                 CeedOperatorLinearAssembleQFunction_Cuda);
1250   CeedChkBackend(ierr);
1251   ierr = CeedSetBackendFunction(ceed, "Operator", op,
1252                                 "LinearAssembleQFunctionUpdate",
1253                                 CeedOperatorLinearAssembleQFunctionUpdate_Cuda);
1254   CeedChkBackend(ierr);
1255   ierr = CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal",
1256                                 CeedOperatorLinearAssembleAddDiagonal_Cuda);
1257   CeedChkBackend(ierr);
1258   ierr = CeedSetBackendFunction(ceed, "Operator", op,
1259                                 "LinearAssembleAddPointBlockDiagonal",
1260                                 CeedOperatorLinearAssembleAddPointBlockDiagonal_Cuda);
1261   CeedChkBackend(ierr);
1262   ierr = CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd",
1263                                 CeedOperatorApplyAdd_Cuda); CeedChkBackend(ierr);
1264   ierr = CeedSetBackendFunction(ceed, "Operator", op, "Destroy",
1265                                 CeedOperatorDestroy_Cuda); CeedChkBackend(ierr);
1266   return CEED_ERROR_SUCCESS;
1267 }
1268 
1269 //------------------------------------------------------------------------------
1270 // Composite Operator Create
1271 //------------------------------------------------------------------------------
1272 int CeedCompositeOperatorCreate_Cuda(CeedOperator op) {
1273   int ierr;
1274   Ceed ceed;
1275   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
1276 
1277   ierr = CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal",
1278                                 CeedOperatorLinearAssembleAddDiagonal_Cuda);
1279   CeedChkBackend(ierr);
1280   ierr = CeedSetBackendFunction(ceed, "Operator", op,
1281                                 "LinearAssembleAddPointBlockDiagonal",
1282                                 CeedOperatorLinearAssembleAddPointBlockDiagonal_Cuda);
1283   CeedChkBackend(ierr);
1284   return CEED_ERROR_SUCCESS;
1285 }
1286 //------------------------------------------------------------------------------
1287