xref: /libCEED/backends/cuda-ref/ceed-cuda-ref-operator.c (revision 1ce8139f46e7307779d69b9ddba9e4b0375a4d52)
1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #include <ceed.h>
9 #include <ceed/backend.h>
10 #include <ceed/jit-tools.h>
11 #include <assert.h>
12 #include <cuda.h>
13 #include <cuda_runtime.h>
14 #include <stdbool.h>
15 #include <string.h>
16 
17 #include "../cuda/ceed-cuda-common.h"
18 #include "../cuda/ceed-cuda-compile.h"
19 #include "ceed-cuda-ref.h"
20 
21 //------------------------------------------------------------------------------
22 // Destroy operator
23 //------------------------------------------------------------------------------
24 static int CeedOperatorDestroy_Cuda(CeedOperator op) {
25   CeedOperator_Cuda *impl;
26 
27   CeedCallBackend(CeedOperatorGetData(op, &impl));
28 
29   // Apply data
30   for (CeedInt i = 0; i < impl->num_inputs + impl->num_outputs; i++) {
31     CeedCallBackend(CeedVectorDestroy(&impl->e_vecs[i]));
32   }
33   CeedCallBackend(CeedFree(&impl->e_vecs));
34 
35   for (CeedInt i = 0; i < impl->num_inputs; i++) {
36     CeedCallBackend(CeedVectorDestroy(&impl->q_vecs_in[i]));
37   }
38   CeedCallBackend(CeedFree(&impl->q_vecs_in));
39 
40   for (CeedInt i = 0; i < impl->num_outputs; i++) {
41     CeedCallBackend(CeedVectorDestroy(&impl->q_vecs_out[i]));
42   }
43   CeedCallBackend(CeedFree(&impl->q_vecs_out));
44 
45   // QFunction assembly data
46   for (CeedInt i = 0; i < impl->num_active_in; i++) {
47     CeedCallBackend(CeedVectorDestroy(&impl->qf_active_in[i]));
48   }
49   CeedCallBackend(CeedFree(&impl->qf_active_in));
50 
51   // Diag data
52   if (impl->diag) {
53     Ceed ceed;
54 
55     CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
56     CeedCallCuda(ceed, cuModuleUnload(impl->diag->module));
57     CeedCallBackend(CeedFree(&impl->diag->h_e_mode_in));
58     CeedCallBackend(CeedFree(&impl->diag->h_e_mode_out));
59     CeedCallCuda(ceed, cudaFree(impl->diag->d_e_mode_in));
60     CeedCallCuda(ceed, cudaFree(impl->diag->d_e_mode_out));
61     CeedCallCuda(ceed, cudaFree(impl->diag->d_identity));
62     CeedCallCuda(ceed, cudaFree(impl->diag->d_interp_in));
63     CeedCallCuda(ceed, cudaFree(impl->diag->d_interp_out));
64     CeedCallCuda(ceed, cudaFree(impl->diag->d_grad_in));
65     CeedCallCuda(ceed, cudaFree(impl->diag->d_grad_out));
66     CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->point_block_diag_rstr));
67     CeedCallBackend(CeedVectorDestroy(&impl->diag->elem_diag));
68     CeedCallBackend(CeedVectorDestroy(&impl->diag->point_block_elem_diag));
69   }
70   CeedCallBackend(CeedFree(&impl->diag));
71 
72   if (impl->asmb) {
73     Ceed ceed;
74 
75     CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
76     CeedCallCuda(ceed, cuModuleUnload(impl->asmb->module));
77     CeedCallCuda(ceed, cudaFree(impl->asmb->d_B_in));
78     CeedCallCuda(ceed, cudaFree(impl->asmb->d_B_out));
79   }
80   CeedCallBackend(CeedFree(&impl->asmb));
81 
82   CeedCallBackend(CeedFree(&impl));
83   return CEED_ERROR_SUCCESS;
84 }
85 
86 //------------------------------------------------------------------------------
87 // Setup infields or outfields
88 //------------------------------------------------------------------------------
89 static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, bool is_input, CeedVector *e_vecs, CeedVector *q_vecs, CeedInt e_start,
90                                         CeedInt num_fields, CeedInt Q, CeedInt num_elem) {
91   Ceed                ceed;
92   bool                is_strided, skip_restriction;
93   CeedSize            q_size;
94   CeedInt             dim, size;
95   CeedQFunctionField *qf_fields;
96   CeedOperatorField  *op_fields;
97 
98   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
99 
100   if (is_input) {
101     CeedCallBackend(CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL));
102     CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL));
103   } else {
104     CeedCallBackend(CeedOperatorGetFields(op, NULL, NULL, NULL, &op_fields));
105     CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields));
106   }
107 
108   // Loop over fields
109   for (CeedInt i = 0; i < num_fields; i++) {
110     CeedEvalMode e_mode;
111     CeedBasis    basis;
112 
113     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &e_mode));
114 
115     is_strided       = false;
116     skip_restriction = false;
117     if (e_mode != CEED_EVAL_WEIGHT) {
118       CeedElemRestriction elem_rstr;
119 
120       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &elem_rstr));
121 
122       // Check whether this field can skip the element restriction:
123       // must be passive input, with e_mode NONE, and have a strided restriction with CEED_STRIDES_BACKEND.
124 
125       // First, check whether the field is input or output:
126       if (is_input) {
127         CeedVector vec;
128 
129         // Check for passive input:
130         CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec));
131         if (vec != CEED_VECTOR_ACTIVE) {
132           // Check e_mode
133           if (e_mode == CEED_EVAL_NONE) {
134             // Check for strided restriction
135             CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
136             if (is_strided) {
137               // Check if vector is already in preferred backend ordering
138               CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &skip_restriction));
139             }
140           }
141         }
142       }
143       if (skip_restriction) {
144         // We do not need an E-Vector, but will use the input field vector's data directly in the operator application.
145         e_vecs[i + e_start] = NULL;
146       } else {
147         CeedCallBackend(CeedElemRestrictionCreateVector(elem_rstr, NULL, &e_vecs[i + e_start]));
148       }
149     }
150 
151     switch (e_mode) {
152       case CEED_EVAL_NONE:
153         CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size));
154         q_size = (CeedSize)num_elem * Q * size;
155         CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i]));
156         break;
157       case CEED_EVAL_INTERP:
158         CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size));
159         q_size = (CeedSize)num_elem * Q * size;
160         CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i]));
161         break;
162       case CEED_EVAL_GRAD:
163         CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis));
164         CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size));
165         CeedCallBackend(CeedBasisGetDimension(basis, &dim));
166         q_size = (CeedSize)num_elem * Q * size;
167         CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i]));
168         break;
169       case CEED_EVAL_WEIGHT:  // Only on input fields
170         CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis));
171         q_size = (CeedSize)num_elem * Q;
172         CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i]));
173         CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, q_vecs[i]));
174         break;
175       case CEED_EVAL_DIV:
176         break;  // TODO: Not implemented
177       case CEED_EVAL_CURL:
178         break;  // TODO: Not implemented
179     }
180   }
181   return CEED_ERROR_SUCCESS;
182 }
183 
184 //------------------------------------------------------------------------------
185 // CeedOperator needs to connect all the named fields (be they active or passive) to the named inputs and outputs of its CeedQFunction.
186 //------------------------------------------------------------------------------
187 static int CeedOperatorSetup_Cuda(CeedOperator op) {
188   Ceed                ceed;
189   bool                is_setup_done;
190   CeedInt             Q, num_elem, num_input_fields, num_output_fields;
191   CeedQFunctionField *qf_input_fields, *qf_output_fields;
192   CeedQFunction       qf;
193   CeedOperatorField  *op_input_fields, *op_output_fields;
194   CeedOperator_Cuda  *impl;
195 
196   CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done));
197   if (is_setup_done) return CEED_ERROR_SUCCESS;
198 
199   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
200   CeedCallBackend(CeedOperatorGetData(op, &impl));
201   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
202   CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
203   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
204   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
205   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
206 
207   // Allocate
208   CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs));
209 
210   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_in));
211   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_out));
212 
213   impl->num_inputs  = num_input_fields;
214   impl->num_outputs = num_output_fields;
215 
216   // Set up infield and outfield e_vecs and q_vecs
217   // Infields
218   CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, true, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields, Q, num_elem));
219   // Outfields
220   CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, false, impl->e_vecs, impl->q_vecs_out, num_input_fields, num_output_fields, Q, num_elem));
221 
222   CeedCallBackend(CeedOperatorSetSetupDone(op));
223   return CEED_ERROR_SUCCESS;
224 }
225 
226 //------------------------------------------------------------------------------
227 // Setup Operator Inputs
228 //------------------------------------------------------------------------------
229 static inline int CeedOperatorSetupInputs_Cuda(CeedInt num_input_fields, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields,
230                                                CeedVector in_vec, const bool skip_active_in, CeedScalar *e_data[2 * CEED_FIELD_MAX],
231                                                CeedOperator_Cuda *impl, CeedRequest *request) {
232   for (CeedInt i = 0; i < num_input_fields; i++) {
233     CeedEvalMode        e_mode;
234     CeedVector          vec;
235     CeedElemRestriction elem_rstr;
236 
237     // Get input vector
238     CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
239     if (vec == CEED_VECTOR_ACTIVE) {
240       if (skip_active_in) continue;
241       else vec = in_vec;
242     }
243 
244     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &e_mode));
245     if (e_mode == CEED_EVAL_WEIGHT) {  // Skip
246     } else {
247       // Get input element restriction
248       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
249       if (vec == CEED_VECTOR_ACTIVE) vec = in_vec;
250       // Restrict, if necessary
251       if (!impl->e_vecs[i]) {
252         // No restriction for this field; read data directly from vec.
253         CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, (const CeedScalar **)&e_data[i]));
254       } else {
255         CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_NOTRANSPOSE, vec, impl->e_vecs[i], request));
256         // Get evec
257         CeedCallBackend(CeedVectorGetArrayRead(impl->e_vecs[i], CEED_MEM_DEVICE, (const CeedScalar **)&e_data[i]));
258       }
259     }
260   }
261   return CEED_ERROR_SUCCESS;
262 }
263 
264 //------------------------------------------------------------------------------
265 // Input Basis Action
266 //------------------------------------------------------------------------------
267 static inline int CeedOperatorInputBasis_Cuda(CeedInt num_elem, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields,
268                                               CeedInt num_input_fields, const bool skip_active_in, CeedScalar *e_data[2 * CEED_FIELD_MAX],
269                                               CeedOperator_Cuda *impl) {
270   for (CeedInt i = 0; i < num_input_fields; i++) {
271     CeedInt             elem_size, size;
272     CeedEvalMode        e_mode;
273     CeedElemRestriction elem_rstr;
274     CeedBasis           basis;
275 
276     // Skip active input
277     if (skip_active_in) {
278       CeedVector vec;
279 
280       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
281       if (vec == CEED_VECTOR_ACTIVE) continue;
282     }
283     // Get elem_size, e_mode, size
284     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
285     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
286     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &e_mode));
287     CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size));
288     // Basis action
289     switch (e_mode) {
290       case CEED_EVAL_NONE:
291         CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i]));
292         break;
293       case CEED_EVAL_INTERP:
294         CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
295         CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, impl->e_vecs[i], impl->q_vecs_in[i]));
296         break;
297       case CEED_EVAL_GRAD:
298         CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
299         CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, impl->e_vecs[i], impl->q_vecs_in[i]));
300         break;
301       case CEED_EVAL_WEIGHT:
302         break;  // No action
303       case CEED_EVAL_DIV:
304         break;  // TODO: Not implemented
305       case CEED_EVAL_CURL:
306         break;  // TODO: Not implemented
307     }
308   }
309   return CEED_ERROR_SUCCESS;
310 }
311 
312 //------------------------------------------------------------------------------
313 // Restore Input Vectors
314 //------------------------------------------------------------------------------
315 static inline int CeedOperatorRestoreInputs_Cuda(CeedInt num_input_fields, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields,
316                                                  const bool skip_active_in, CeedScalar *e_data[2 * CEED_FIELD_MAX], CeedOperator_Cuda *impl) {
317   for (CeedInt i = 0; i < num_input_fields; i++) {
318     CeedEvalMode e_mode;
319     CeedVector   vec;
320 
321     // Skip active input
322     if (skip_active_in) {
323       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
324       if (vec == CEED_VECTOR_ACTIVE) continue;
325     }
326     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &e_mode));
327     if (e_mode == CEED_EVAL_WEIGHT) {  // Skip
328     } else {
329       if (!impl->e_vecs[i]) {  // This was a skip_restriction case
330         CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
331         CeedCallBackend(CeedVectorRestoreArrayRead(vec, (const CeedScalar **)&e_data[i]));
332       } else {
333         CeedCallBackend(CeedVectorRestoreArrayRead(impl->e_vecs[i], (const CeedScalar **)&e_data[i]));
334       }
335     }
336   }
337   return CEED_ERROR_SUCCESS;
338 }
339 
340 //------------------------------------------------------------------------------
341 // Apply and add to output
342 //------------------------------------------------------------------------------
343 static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector in_vec, CeedVector out_vec, CeedRequest *request) {
344   CeedOperator_Cuda  *impl;
345   CeedInt             Q, num_elem, elem_size, num_input_fields, num_output_fields, size;
346   CeedEvalMode        e_mode;
347   CeedScalar         *e_data[2 * CEED_FIELD_MAX] = {NULL};
348   CeedOperatorField  *op_input_fields, *op_output_fields;
349   CeedQFunctionField *qf_input_fields, *qf_output_fields;
350   CeedQFunction       qf;
351 
352   CeedCallBackend(CeedOperatorGetData(op, &impl));
353   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
354   CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
355   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
356   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
357   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
358 
359   // Setup
360   CeedCallBackend(CeedOperatorSetup_Cuda(op));
361 
362   // Input e_vecs and Restriction
363   CeedCallBackend(CeedOperatorSetupInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, in_vec, false, e_data, impl, request));
364 
365   // Input basis apply if needed
366   CeedCallBackend(CeedOperatorInputBasis_Cuda(num_elem, qf_input_fields, op_input_fields, num_input_fields, false, e_data, impl));
367 
368   // Output pointers, as necessary
369   for (CeedInt i = 0; i < num_output_fields; i++) {
370     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &e_mode));
371     if (e_mode == CEED_EVAL_NONE) {
372       // Set the output Q-Vector to use the E-Vector data directly.
373       CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs[i + impl->num_inputs], CEED_MEM_DEVICE, &e_data[i + num_input_fields]));
374       CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i + num_input_fields]));
375     }
376   }
377 
378   // Q function
379   CeedCallBackend(CeedQFunctionApply(qf, num_elem * Q, impl->q_vecs_in, impl->q_vecs_out));
380 
381   // Output basis apply if needed
382   for (CeedInt i = 0; i < num_output_fields; i++) {
383     CeedElemRestriction elem_rstr;
384     CeedBasis           basis;
385 
386     // Get elem_size, e_mode, size
387     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
388     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
389     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &e_mode));
390     CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size));
391     // Basis action
392     switch (e_mode) {
393       case CEED_EVAL_NONE:
394         break;
395       case CEED_EVAL_INTERP:
396         CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
397         CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_TRANSPOSE, CEED_EVAL_INTERP, impl->q_vecs_out[i], impl->e_vecs[i + impl->num_inputs]));
398         break;
399       case CEED_EVAL_GRAD:
400         CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
401         CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_TRANSPOSE, CEED_EVAL_GRAD, impl->q_vecs_out[i], impl->e_vecs[i + impl->num_inputs]));
402         break;
403       // LCOV_EXCL_START
404       case CEED_EVAL_WEIGHT: {
405         Ceed ceed;
406         CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
407         return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
408         break;  // Should not occur
409       }
410       case CEED_EVAL_DIV:
411         break;  // TODO: Not implemented
412       case CEED_EVAL_CURL:
413         break;  // TODO: Not implemented
414                 // LCOV_EXCL_STOP
415     }
416   }
417 
418   // Output restriction
419   for (CeedInt i = 0; i < num_output_fields; i++) {
420     CeedVector          vec;
421     CeedElemRestriction elem_rstr;
422 
423     // Restore evec
424     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &e_mode));
425     if (e_mode == CEED_EVAL_NONE) {
426       CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs[i + impl->num_inputs], &e_data[i + num_input_fields]));
427     }
428     // Get output vector
429     CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
430     // Restrict
431     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
432     // Active
433     if (vec == CEED_VECTOR_ACTIVE) vec = out_vec;
434 
435     CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, impl->e_vecs[i + impl->num_inputs], vec, request));
436   }
437 
438   // Restore input arrays
439   CeedCallBackend(CeedOperatorRestoreInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, false, e_data, impl));
440   return CEED_ERROR_SUCCESS;
441 }
442 
443 //------------------------------------------------------------------------------
444 // Core code for assembling linear QFunction
445 //------------------------------------------------------------------------------
446 static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op, bool build_objects, CeedVector *assembled, CeedElemRestriction *rstr,
447                                                                CeedRequest *request) {
448   Ceed                ceed, ceed_parent;
449   CeedInt             num_active_in, num_active_out, Q, num_elem, num_input_fields, num_output_fields, size;
450   CeedSize            q_size;
451   CeedScalar         *assembled_array, *e_data[2 * CEED_FIELD_MAX] = {NULL};
452   CeedVector         *active_inputs;
453   CeedQFunctionField *qf_input_fields, *qf_output_fields;
454   CeedQFunction       qf;
455   CeedOperatorField  *op_input_fields, *op_output_fields;
456   CeedOperator_Cuda  *impl;
457 
458   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
459   CeedCallBackend(CeedOperatorGetFallbackParentCeed(op, &ceed_parent));
460   CeedCallBackend(CeedOperatorGetData(op, &impl));
461   CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
462   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
463   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
464   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
465   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
466   active_inputs = impl->qf_active_in;
467   num_active_in = impl->num_active_in, num_active_out = impl->num_active_out;
468 
469   // Setup
470   CeedCallBackend(CeedOperatorSetup_Cuda(op));
471 
472   // Input e_vecs and Restriction
473   CeedCallBackend(CeedOperatorSetupInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, NULL, true, e_data, impl, request));
474 
475   // Count number of active input fields
476   if (!num_active_in) {
477     for (CeedInt i = 0; i < num_input_fields; i++) {
478       CeedScalar *q_vec_array;
479       CeedVector  vec;
480 
481       // Get input vector
482       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
483       // Check if active input
484       if (vec == CEED_VECTOR_ACTIVE) {
485         CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size));
486         CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
487         CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, &q_vec_array));
488         CeedCallBackend(CeedRealloc(num_active_in + size, &active_inputs));
489         for (CeedInt field = 0; field < size; field++) {
490           q_size = (CeedSize)Q * num_elem;
491           CeedCallBackend(CeedVectorCreate(ceed, q_size, &active_inputs[num_active_in + field]));
492           CeedCallBackend(
493               CeedVectorSetArray(active_inputs[num_active_in + field], CEED_MEM_DEVICE, CEED_USE_POINTER, &q_vec_array[field * Q * num_elem]));
494         }
495         num_active_in += size;
496         CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &q_vec_array));
497       }
498     }
499     impl->num_active_in = num_active_in;
500     impl->qf_active_in  = active_inputs;
501   }
502 
503   // Count number of active output fields
504   if (!num_active_out) {
505     for (CeedInt i = 0; i < num_output_fields; i++) {
506       CeedVector vec;
507 
508       // Get output vector
509       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
510       // Check if active output
511       if (vec == CEED_VECTOR_ACTIVE) {
512         CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size));
513         num_active_out += size;
514       }
515     }
516     impl->num_active_out = num_active_out;
517   }
518 
519   // Check sizes
520   CeedCheck(num_active_in > 0 && num_active_out > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
521 
522   // Build objects if needed
523   if (build_objects) {
524     // Create output restriction
525     CeedInt strides[3] = {1, num_elem * Q, Q}; /* *NOPAD* */
526     CeedCallBackend(CeedElemRestrictionCreateStrided(ceed_parent, num_elem, Q, num_active_in * num_active_out,
527                                                      num_active_in * num_active_out * num_elem * Q, strides, rstr));
528     // Create assembled vector
529     CeedSize l_size = (CeedSize)num_elem * Q * num_active_in * num_active_out;
530     CeedCallBackend(CeedVectorCreate(ceed_parent, l_size, assembled));
531   }
532   CeedCallBackend(CeedVectorSetValue(*assembled, 0.0));
533   CeedCallBackend(CeedVectorGetArray(*assembled, CEED_MEM_DEVICE, &assembled_array));
534 
535   // Input basis apply
536   CeedCallBackend(CeedOperatorInputBasis_Cuda(num_elem, qf_input_fields, op_input_fields, num_input_fields, true, e_data, impl));
537 
538   // Assemble QFunction
539   for (CeedInt in = 0; in < num_active_in; in++) {
540     // Set Inputs
541     CeedCallBackend(CeedVectorSetValue(active_inputs[in], 1.0));
542     if (num_active_in > 1) {
543       CeedCallBackend(CeedVectorSetValue(active_inputs[(in + num_active_in - 1) % num_active_in], 0.0));
544     }
545     // Set Outputs
546     for (CeedInt out = 0; out < num_output_fields; out++) {
547       CeedVector vec;
548 
549       // Get output vector
550       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec));
551       // Check if active output
552       if (vec == CEED_VECTOR_ACTIVE) {
553         CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_DEVICE, CEED_USE_POINTER, assembled_array));
554         CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[out], &size));
555         assembled_array += size * Q * num_elem;  // Advance the pointer by the size of the output
556       }
557     }
558     // Apply QFunction
559     CeedCallBackend(CeedQFunctionApply(qf, Q * num_elem, impl->q_vecs_in, impl->q_vecs_out));
560   }
561 
562   // Un-set output q_vecs to prevent accidental overwrite of Assembled
563   for (CeedInt out = 0; out < num_output_fields; out++) {
564     CeedVector vec;
565 
566     // Get output vector
567     CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec));
568     // Check if active output
569     if (vec == CEED_VECTOR_ACTIVE) {
570       CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[out], CEED_MEM_DEVICE, NULL));
571     }
572   }
573 
574   // Restore input arrays
575   CeedCallBackend(CeedOperatorRestoreInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, true, e_data, impl));
576 
577   // Restore output
578   CeedCallBackend(CeedVectorRestoreArray(*assembled, &assembled_array));
579   return CEED_ERROR_SUCCESS;
580 }
581 
582 //------------------------------------------------------------------------------
583 // Assemble Linear QFunction
584 //------------------------------------------------------------------------------
585 static int CeedOperatorLinearAssembleQFunction_Cuda(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) {
586   return CeedOperatorLinearAssembleQFunctionCore_Cuda(op, true, assembled, rstr, request);
587 }
588 
589 //------------------------------------------------------------------------------
590 // Update Assembled Linear QFunction
591 //------------------------------------------------------------------------------
592 static int CeedOperatorLinearAssembleQFunctionUpdate_Cuda(CeedOperator op, CeedVector assembled, CeedElemRestriction rstr, CeedRequest *request) {
593   return CeedOperatorLinearAssembleQFunctionCore_Cuda(op, false, &assembled, &rstr, request);
594 }
595 
596 //------------------------------------------------------------------------------
597 // Assemble diagonal setup
598 //------------------------------------------------------------------------------
599 static inline int CeedOperatorAssembleDiagonalSetup_Cuda(CeedOperator op, CeedInt use_ceedsize_idx) {
600   Ceed                ceed;
601   char               *diagonal_kernel_path, *diagonal_kernel_source;
602   CeedInt             num_input_fields, num_output_fields, num_e_mode_in = 0, num_comp = 0, dim = 1, num_e_mode_out = 0, num_nodes, num_qpts;
603   CeedEvalMode       *e_mode_in = NULL, *e_mode_out = NULL;
604   CeedElemRestriction rstr_in = NULL, rstr_out = NULL;
605   CeedBasis           basis_in = NULL, basis_out = NULL;
606   CeedQFunctionField *qf_fields;
607   CeedQFunction       qf;
608   CeedOperatorField  *op_fields;
609   CeedOperator_Cuda  *impl;
610 
611   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
612   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
613   CeedCallBackend(CeedQFunctionGetNumArgs(qf, &num_input_fields, &num_output_fields));
614 
615   // Determine active input basis
616   CeedCallBackend(CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL));
617   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL));
618   for (CeedInt i = 0; i < num_input_fields; i++) {
619     CeedVector vec;
620 
621     CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec));
622     if (vec == CEED_VECTOR_ACTIVE) {
623       CeedEvalMode        e_mode;
624       CeedElemRestriction rstr;
625 
626       CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis_in));
627       CeedCallBackend(CeedBasisGetNumComponents(basis_in, &num_comp));
628       CeedCallBackend(CeedBasisGetDimension(basis_in, &dim));
629       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr));
630       CeedCheck(!rstr_in || rstr_in == rstr, ceed, CEED_ERROR_BACKEND,
631                 "Backend does not implement multi-field non-composite operator diagonal assembly");
632       rstr_in = rstr;
633       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &e_mode));
634       switch (e_mode) {
635         case CEED_EVAL_NONE:
636         case CEED_EVAL_INTERP:
637           CeedCallBackend(CeedRealloc(num_e_mode_in + 1, &e_mode_in));
638           e_mode_in[num_e_mode_in] = e_mode;
639           num_e_mode_in += 1;
640           break;
641         case CEED_EVAL_GRAD:
642           CeedCallBackend(CeedRealloc(num_e_mode_in + dim, &e_mode_in));
643           for (CeedInt d = 0; d < dim; d++) e_mode_in[num_e_mode_in + d] = e_mode;
644           num_e_mode_in += dim;
645           break;
646         case CEED_EVAL_WEIGHT:
647         case CEED_EVAL_DIV:
648         case CEED_EVAL_CURL:
649           break;  // Caught by QF Assembly
650       }
651     }
652   }
653 
654   // Determine active output basis
655   CeedCallBackend(CeedOperatorGetFields(op, NULL, NULL, NULL, &op_fields));
656   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields));
657   for (CeedInt i = 0; i < num_output_fields; i++) {
658     CeedVector vec;
659 
660     CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec));
661     if (vec == CEED_VECTOR_ACTIVE) {
662       CeedEvalMode        e_mode;
663       CeedElemRestriction rstr;
664 
665       CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis_out));
666       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr));
667       CeedCheck(!rstr_out || rstr_out == rstr, ceed, CEED_ERROR_BACKEND,
668                 "Backend does not implement multi-field non-composite operator diagonal assembly");
669       rstr_out = rstr;
670       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &e_mode));
671       switch (e_mode) {
672         case CEED_EVAL_NONE:
673         case CEED_EVAL_INTERP:
674           CeedCallBackend(CeedRealloc(num_e_mode_out + 1, &e_mode_out));
675           e_mode_out[num_e_mode_out] = e_mode;
676           num_e_mode_out += 1;
677           break;
678         case CEED_EVAL_GRAD:
679           CeedCallBackend(CeedRealloc(num_e_mode_out + dim, &e_mode_out));
680           for (CeedInt d = 0; d < dim; d++) e_mode_out[num_e_mode_out + d] = e_mode;
681           num_e_mode_out += dim;
682           break;
683         case CEED_EVAL_WEIGHT:
684         case CEED_EVAL_DIV:
685         case CEED_EVAL_CURL:
686           break;  // Caught by QF Assembly
687       }
688     }
689   }
690 
691   // Operator data struct
692   CeedCallBackend(CeedOperatorGetData(op, &impl));
693   CeedCallBackend(CeedCalloc(1, &impl->diag));
694   CeedOperatorDiag_Cuda *diag = impl->diag;
695 
696   diag->basis_in       = basis_in;
697   diag->basis_out      = basis_out;
698   diag->h_e_mode_in    = e_mode_in;
699   diag->h_e_mode_out   = e_mode_out;
700   diag->num_e_mode_in  = num_e_mode_in;
701   diag->num_e_mode_out = num_e_mode_out;
702 
703   // Assemble kernel
704   CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h", &diagonal_kernel_path));
705   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Diagonal Assembly Kernel Source -----\n");
706   CeedCallBackend(CeedLoadSourceToBuffer(ceed, diagonal_kernel_path, &diagonal_kernel_source));
707   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Diagonal Assembly Source Complete! -----\n");
708   CeedCallBackend(CeedBasisGetNumNodes(basis_in, &num_nodes));
709   CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts));
710   diag->num_nodes = num_nodes;
711   CeedCallCuda(ceed,
712                CeedCompile_Cuda(ceed, diagonal_kernel_source, &diag->module, 6, "NUM_E_MODE_IN", num_e_mode_in, "NUM_E_MODE_OUT", num_e_mode_out,
713                                 "NUM_NODES", num_nodes, "NUM_QPTS", num_qpts, "NUM_COMP", num_comp, "USE_CEEDSIZE", use_ceedsize_idx));
714   CeedCallCuda(ceed, CeedGetKernel_Cuda(ceed, diag->module, "linearDiagonal", &diag->linearDiagonal));
715   CeedCallCuda(ceed, CeedGetKernel_Cuda(ceed, diag->module, "linearPointBlockDiagonal", &diag->linearPointBlock));
716   CeedCallBackend(CeedFree(&diagonal_kernel_path));
717   CeedCallBackend(CeedFree(&diagonal_kernel_source));
718 
719   // Basis matrices
720   const CeedInt     q_bytes      = num_qpts * sizeof(CeedScalar);
721   const CeedInt     interp_bytes = q_bytes * num_nodes;
722   const CeedInt     grad_bytes   = q_bytes * num_nodes * dim;
723   const CeedInt     e_mode_bytes = sizeof(CeedEvalMode);
724   const CeedScalar *interp_in, *interp_out, *grad_in, *grad_out;
725 
726   // CEED_EVAL_NONE
727   CeedScalar *identity     = NULL;
728   bool        is_eval_none = false;
729 
730   for (CeedInt i = 0; i < num_e_mode_in; i++) is_eval_none = is_eval_none || (e_mode_in[i] == CEED_EVAL_NONE);
731   for (CeedInt i = 0; i < num_e_mode_out; i++) is_eval_none = is_eval_none || (e_mode_out[i] == CEED_EVAL_NONE);
732   if (is_eval_none) {
733     CeedCallBackend(CeedCalloc(num_qpts * num_nodes, &identity));
734     for (CeedInt i = 0; i < (num_nodes < num_qpts ? num_nodes : num_qpts); i++) identity[i * num_nodes + i] = 1.0;
735     CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_identity, interp_bytes));
736     CeedCallCuda(ceed, cudaMemcpy(diag->d_identity, identity, interp_bytes, cudaMemcpyHostToDevice));
737   }
738 
739   // CEED_EVAL_INTERP
740   CeedCallBackend(CeedBasisGetInterp(basis_in, &interp_in));
741   CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_interp_in, interp_bytes));
742   CeedCallCuda(ceed, cudaMemcpy(diag->d_interp_in, interp_in, interp_bytes, cudaMemcpyHostToDevice));
743   CeedCallBackend(CeedBasisGetInterp(basis_out, &interp_out));
744   CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_interp_out, interp_bytes));
745   CeedCallCuda(ceed, cudaMemcpy(diag->d_interp_out, interp_out, interp_bytes, cudaMemcpyHostToDevice));
746 
747   // CEED_EVAL_GRAD
748   CeedCallBackend(CeedBasisGetGrad(basis_in, &grad_in));
749   CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_grad_in, grad_bytes));
750   CeedCallCuda(ceed, cudaMemcpy(diag->d_grad_in, grad_in, grad_bytes, cudaMemcpyHostToDevice));
751   CeedCallBackend(CeedBasisGetGrad(basis_out, &grad_out));
752   CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_grad_out, grad_bytes));
753   CeedCallCuda(ceed, cudaMemcpy(diag->d_grad_out, grad_out, grad_bytes, cudaMemcpyHostToDevice));
754 
755   // Arrays of e_modes
756   CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_e_mode_in, num_e_mode_in * e_mode_bytes));
757   CeedCallCuda(ceed, cudaMemcpy(diag->d_e_mode_in, e_mode_in, num_e_mode_in * e_mode_bytes, cudaMemcpyHostToDevice));
758   CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_e_mode_out, num_e_mode_out * e_mode_bytes));
759   CeedCallCuda(ceed, cudaMemcpy(diag->d_e_mode_out, e_mode_out, num_e_mode_out * e_mode_bytes, cudaMemcpyHostToDevice));
760 
761   // Restriction
762   diag->diag_rstr = rstr_out;
763   return CEED_ERROR_SUCCESS;
764 }
765 
766 //------------------------------------------------------------------------------
767 // Assemble diagonal common code
768 //------------------------------------------------------------------------------
769 static inline int CeedOperatorAssembleDiagonalCore_Cuda(CeedOperator op, CeedVector assembled, CeedRequest *request, const bool is_point_block) {
770   Ceed                ceed;
771   CeedSize            assembled_length = 0, assembled_qf_length = 0;
772   CeedInt             use_ceedsize_idx = 0, num_elem;
773   CeedScalar         *elem_diag_array;
774   const CeedScalar   *assembled_qf_array;
775   CeedVector          assembled_qf = NULL;
776   CeedElemRestriction rstr         = NULL;
777   CeedOperator_Cuda  *impl;
778 
779   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
780   CeedCallBackend(CeedOperatorGetData(op, &impl));
781 
782   // Assemble QFunction
783   CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembled_qf, &rstr, request));
784   CeedCallBackend(CeedElemRestrictionDestroy(&rstr));
785 
786   CeedCallBackend(CeedVectorGetLength(assembled, &assembled_length));
787   CeedCallBackend(CeedVectorGetLength(assembled_qf, &assembled_qf_length));
788   if ((assembled_length > INT_MAX) || (assembled_qf_length > INT_MAX)) use_ceedsize_idx = 1;
789 
790   // Setup
791   if (!impl->diag) CeedCallBackend(CeedOperatorAssembleDiagonalSetup_Cuda(op, use_ceedsize_idx));
792   CeedOperatorDiag_Cuda *diag = impl->diag;
793 
794   assert(diag != NULL);
795 
796   // Restriction
797   if (is_point_block && !diag->point_block_diag_rstr) {
798     CeedCallBackend(CeedOperatorCreateActivePointBlockRestriction(diag->diag_rstr, &diag->point_block_diag_rstr));
799   }
800   CeedElemRestriction diag_rstr = is_point_block ? diag->point_block_diag_rstr : diag->diag_rstr;
801 
802   // Create diagonal vector
803   CeedVector elem_diag = is_point_block ? diag->point_block_elem_diag : diag->elem_diag;
804 
805   if (!elem_diag) {
806     CeedCallBackend(CeedElemRestrictionCreateVector(diag_rstr, NULL, &elem_diag));
807     if (is_point_block) diag->point_block_elem_diag = elem_diag;
808     else diag->elem_diag = elem_diag;
809   }
810   CeedCallBackend(CeedVectorSetValue(elem_diag, 0.0));
811 
812   // Assemble element operator diagonals
813   CeedCallBackend(CeedVectorGetArray(elem_diag, CEED_MEM_DEVICE, &elem_diag_array));
814   CeedCallBackend(CeedVectorGetArrayRead(assembled_qf, CEED_MEM_DEVICE, &assembled_qf_array));
815   CeedCallBackend(CeedElemRestrictionGetNumElements(diag_rstr, &num_elem));
816 
817   // Compute the diagonal of B^T D B
818   int   elem_per_block = 1;
819   int   grid           = num_elem / elem_per_block + ((num_elem / elem_per_block * elem_per_block < num_elem) ? 1 : 0);
820   void *args[]         = {(void *)&num_elem, &diag->d_identity,  &diag->d_interp_in,  &diag->d_grad_in,    &diag->d_interp_out,
821                           &diag->d_grad_out, &diag->d_e_mode_in, &diag->d_e_mode_out, &assembled_qf_array, &elem_diag_array};
822   if (is_point_block) {
823     CeedCallBackend(CeedRunKernelDim_Cuda(ceed, diag->linearPointBlock, grid, diag->num_nodes, 1, elem_per_block, args));
824   } else {
825     CeedCallBackend(CeedRunKernelDim_Cuda(ceed, diag->linearDiagonal, grid, diag->num_nodes, 1, elem_per_block, args));
826   }
827 
828   // Restore arrays
829   CeedCallBackend(CeedVectorRestoreArray(elem_diag, &elem_diag_array));
830   CeedCallBackend(CeedVectorRestoreArrayRead(assembled_qf, &assembled_qf_array));
831 
832   // Assemble local operator diagonal
833   CeedCallBackend(CeedElemRestrictionApply(diag_rstr, CEED_TRANSPOSE, elem_diag, assembled, request));
834 
835   // Cleanup
836   CeedCallBackend(CeedVectorDestroy(&assembled_qf));
837   return CEED_ERROR_SUCCESS;
838 }
839 
840 //------------------------------------------------------------------------------
841 // Assemble Linear Diagonal
842 //------------------------------------------------------------------------------
843 static int CeedOperatorLinearAssembleAddDiagonal_Cuda(CeedOperator op, CeedVector assembled, CeedRequest *request) {
844   CeedCallBackend(CeedOperatorAssembleDiagonalCore_Cuda(op, assembled, request, false));
845   return CEED_ERROR_SUCCESS;
846 }
847 
848 //------------------------------------------------------------------------------
849 // Assemble Linear Point Block Diagonal
850 //------------------------------------------------------------------------------
851 static int CeedOperatorLinearAssembleAddPointBlockDiagonal_Cuda(CeedOperator op, CeedVector assembled, CeedRequest *request) {
852   CeedCallBackend(CeedOperatorAssembleDiagonalCore_Cuda(op, assembled, request, true));
853   return CEED_ERROR_SUCCESS;
854 }
855 
856 //------------------------------------------------------------------------------
857 // Single operator assembly setup
858 //------------------------------------------------------------------------------
859 static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op, CeedInt use_ceedsize_idx) {
860   Ceed    ceed;
861   char   *assembly_kernel_path, *assembly_kernel_source;
862   CeedInt num_input_fields, num_output_fields, num_e_mode_in = 0, dim = 1, num_B_in_mats_to_load = 0, size_B_in = 0, num_qpts = 0, elem_size = 0,
863                                                num_e_mode_out = 0, num_B_out_mats_to_load = 0, size_B_out = 0, num_elem, num_comp;
864   CeedEvalMode       *eval_mode_in = NULL, *eval_mode_out = NULL;
865   CeedElemRestriction rstr_in = NULL, rstr_out = NULL;
866   CeedBasis           basis_in = NULL, basis_out = NULL;
867   CeedQFunctionField *qf_fields;
868   CeedQFunction       qf;
869   CeedOperatorField  *input_fields, *output_fields;
870   CeedOperator_Cuda  *impl;
871 
872   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
873   CeedCallBackend(CeedOperatorGetData(op, &impl));
874 
875   // Get intput and output fields
876   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &input_fields, &num_output_fields, &output_fields));
877 
878   // Determine active input basis eval mode
879   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
880   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL));
881   // Note that the kernel will treat each dimension of a gradient action separately;
882   // i.e., when an active input has a CEED_EVAL_GRAD mode, num_e_mode_in will increment by dim.
883   // However, for the purposes of loading the B matrices, it will be treated as one mode, and we will load/copy the entire gradient matrix at once, so
884   // num_B_in_mats_to_load will be incremented by 1.
885   for (CeedInt i = 0; i < num_input_fields; i++) {
886     CeedVector vec;
887 
888     CeedCallBackend(CeedOperatorFieldGetVector(input_fields[i], &vec));
889     if (vec == CEED_VECTOR_ACTIVE) {
890       CeedEvalMode eval_mode;
891 
892       CeedCallBackend(CeedOperatorFieldGetBasis(input_fields[i], &basis_in));
893       CeedCallBackend(CeedBasisGetDimension(basis_in, &dim));
894       CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts));
895       CeedCallBackend(CeedOperatorFieldGetElemRestriction(input_fields[i], &rstr_in));
896       CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_in, &elem_size));
897       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
898       if (eval_mode != CEED_EVAL_NONE) {
899         CeedCallBackend(CeedRealloc(num_B_in_mats_to_load + 1, &eval_mode_in));
900         eval_mode_in[num_B_in_mats_to_load] = eval_mode;
901         num_B_in_mats_to_load += 1;
902         if (eval_mode == CEED_EVAL_GRAD) {
903           num_e_mode_in += dim;
904           size_B_in += dim * elem_size * num_qpts;
905         } else {
906           num_e_mode_in += 1;
907           size_B_in += elem_size * num_qpts;
908         }
909       }
910     }
911   }
912 
913   // Determine active output basis; basis_out and rstr_out only used if same as input, TODO
914   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields));
915   for (CeedInt i = 0; i < num_output_fields; i++) {
916     CeedVector vec;
917 
918     CeedCallBackend(CeedOperatorFieldGetVector(output_fields[i], &vec));
919     if (vec == CEED_VECTOR_ACTIVE) {
920       CeedEvalMode eval_mode;
921 
922       CeedCallBackend(CeedOperatorFieldGetBasis(output_fields[i], &basis_out));
923       CeedCallBackend(CeedOperatorFieldGetElemRestriction(output_fields[i], &rstr_out));
924       CeedCheck(!rstr_out || rstr_out == rstr_in, ceed, CEED_ERROR_BACKEND, "Backend does not implement multi-field non-composite operator assembly");
925       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
926       if (eval_mode != CEED_EVAL_NONE) {
927         CeedCallBackend(CeedRealloc(num_B_out_mats_to_load + 1, &eval_mode_out));
928         eval_mode_out[num_B_out_mats_to_load] = eval_mode;
929         num_B_out_mats_to_load += 1;
930         if (eval_mode == CEED_EVAL_GRAD) {
931           num_e_mode_out += dim;
932           size_B_out += dim * elem_size * num_qpts;
933         } else {
934           num_e_mode_out += 1;
935           size_B_out += elem_size * num_qpts;
936         }
937       }
938     }
939   }
940   CeedCheck(num_e_mode_in > 0 && num_e_mode_out > 0, ceed, CEED_ERROR_UNSUPPORTED, "Cannot assemble operator without inputs/outputs");
941 
942   CeedCallBackend(CeedElemRestrictionGetNumElements(rstr_in, &num_elem));
943   CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_in, &num_comp));
944 
945   CeedCallBackend(CeedCalloc(1, &impl->asmb));
946   CeedOperatorAssemble_Cuda *asmb = impl->asmb;
947   asmb->num_elem                  = num_elem;
948 
949   // Compile kernels
950   int elem_per_block    = 1;
951   asmb->elem_per_block  = elem_per_block;
952   CeedInt    block_size = elem_size * elem_size * elem_per_block;
953   Ceed_Cuda *cuda_data;
954 
955   CeedCallBackend(CeedGetData(ceed, &cuda_data));
956   CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-operator-assemble.h", &assembly_kernel_path));
957   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Assembly Kernel Source -----\n");
958   CeedCallBackend(CeedLoadSourceToBuffer(ceed, assembly_kernel_path, &assembly_kernel_source));
959   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Assembly Source Complete! -----\n");
960   bool fallback = block_size > cuda_data->device_prop.maxThreadsPerBlock;
961 
962   if (fallback) {
963     // Use fallback kernel with 1D threadblock
964     block_size         = elem_size * elem_per_block;
965     asmb->block_size_x = elem_size;
966     asmb->block_size_y = 1;
967   } else {  // Use kernel with 2D threadblock
968     asmb->block_size_x = elem_size;
969     asmb->block_size_y = elem_size;
970   }
971   CeedCallBackend(CeedCompile_Cuda(ceed, assembly_kernel_source, &asmb->module, 8, "NUM_ELEM", num_elem, "NUM_E_MODE_IN", num_e_mode_in,
972                                    "NUM_E_MODE_OUT", num_e_mode_out, "NUM_QPTS", num_qpts, "NUM_NODES", elem_size, "BLOCK_SIZE", block_size,
973                                    "NUM_COMP", num_comp, "USE_CEEDSIZE", use_ceedsize_idx));
974   CeedCallBackend(CeedGetKernel_Cuda(ceed, asmb->module, fallback ? "linearAssembleFallback" : "linearAssemble", &asmb->linearAssemble));
975   CeedCallBackend(CeedFree(&assembly_kernel_path));
976   CeedCallBackend(CeedFree(&assembly_kernel_source));
977 
978   // Build 'full' B matrices (not 1D arrays used for tensor-product matrices)
979   const CeedScalar *interp_in, *grad_in;
980 
981   CeedCallBackend(CeedBasisGetInterp(basis_in, &interp_in));
982   CeedCallBackend(CeedBasisGetGrad(basis_in, &grad_in));
983 
984   // Load into B_in, in order that they will be used in eval_mode
985   const CeedInt inBytes   = size_B_in * sizeof(CeedScalar);
986   CeedInt       mat_start = 0;
987 
988   CeedCallCuda(ceed, cudaMalloc((void **)&asmb->d_B_in, inBytes));
989   for (int i = 0; i < num_B_in_mats_to_load; i++) {
990     CeedEvalMode eval_mode = eval_mode_in[i];
991 
992     if (eval_mode == CEED_EVAL_INTERP) {
993       CeedCallCuda(ceed, cudaMemcpy(&asmb->d_B_in[mat_start], interp_in, elem_size * num_qpts * sizeof(CeedScalar), cudaMemcpyHostToDevice));
994       mat_start += elem_size * num_qpts;
995     } else if (eval_mode == CEED_EVAL_GRAD) {
996       CeedCallCuda(ceed, cudaMemcpy(&asmb->d_B_in[mat_start], grad_in, dim * elem_size * num_qpts * sizeof(CeedScalar), cudaMemcpyHostToDevice));
997       mat_start += dim * elem_size * num_qpts;
998     }
999   }
1000 
1001   const CeedScalar *interp_out, *grad_out;
1002 
1003   // Note that this function currently assumes 1 basis, so this should always be true for now
1004   if (basis_out == basis_in) {
1005     interp_out = interp_in;
1006     grad_out   = grad_in;
1007   } else {
1008     CeedCallBackend(CeedBasisGetInterp(basis_out, &interp_out));
1009     CeedCallBackend(CeedBasisGetGrad(basis_out, &grad_out));
1010   }
1011 
1012   // Load into B_out, in order that they will be used in eval_mode
1013   const CeedInt outBytes = size_B_out * sizeof(CeedScalar);
1014   mat_start              = 0;
1015 
1016   CeedCallCuda(ceed, cudaMalloc((void **)&asmb->d_B_out, outBytes));
1017   for (int i = 0; i < num_B_out_mats_to_load; i++) {
1018     CeedEvalMode eval_mode = eval_mode_out[i];
1019 
1020     if (eval_mode == CEED_EVAL_INTERP) {
1021       CeedCallCuda(ceed, cudaMemcpy(&asmb->d_B_out[mat_start], interp_out, elem_size * num_qpts * sizeof(CeedScalar), cudaMemcpyHostToDevice));
1022       mat_start += elem_size * num_qpts;
1023     } else if (eval_mode == CEED_EVAL_GRAD) {
1024       CeedCallCuda(ceed, cudaMemcpy(&asmb->d_B_out[mat_start], grad_out, dim * elem_size * num_qpts * sizeof(CeedScalar), cudaMemcpyHostToDevice));
1025       mat_start += dim * elem_size * num_qpts;
1026     }
1027   }
1028   return CEED_ERROR_SUCCESS;
1029 }
1030 
1031 //------------------------------------------------------------------------------
1032 // Assemble matrix data for COO matrix of assembled operator.
1033 // The sparsity pattern is set by CeedOperatorLinearAssembleSymbolic.
1034 //
1035 // Note that this (and other assembly routines) currently assume only one active input restriction/basis per operator (could have multiple basis eval
1036 // modes).
1037 // TODO: allow multiple active input restrictions/basis objects
1038 //------------------------------------------------------------------------------
1039 static int CeedSingleOperatorAssemble_Cuda(CeedOperator op, CeedInt offset, CeedVector values) {
1040   Ceed                ceed;
1041   CeedSize            values_length = 0, assembled_qf_length = 0;
1042   CeedInt             use_ceedsize_idx = 0;
1043   CeedScalar         *values_array;
1044   const CeedScalar   *qf_array;
1045   CeedVector          assembled_qf = NULL;
1046   CeedElemRestriction rstr_q       = NULL;
1047   CeedOperator_Cuda  *impl;
1048 
1049   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
1050   CeedCallBackend(CeedOperatorGetData(op, &impl));
1051 
1052   // Assemble QFunction
1053   CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembled_qf, &rstr_q, CEED_REQUEST_IMMEDIATE));
1054   CeedCallBackend(CeedElemRestrictionDestroy(&rstr_q));
1055   CeedCallBackend(CeedVectorGetArray(values, CEED_MEM_DEVICE, &values_array));
1056   values_array += offset;
1057   CeedCallBackend(CeedVectorGetArrayRead(assembled_qf, CEED_MEM_DEVICE, &qf_array));
1058 
1059   CeedCallBackend(CeedVectorGetLength(values, &values_length));
1060   CeedCallBackend(CeedVectorGetLength(assembled_qf, &assembled_qf_length));
1061   if ((values_length > INT_MAX) || (assembled_qf_length > INT_MAX)) use_ceedsize_idx = 1;
1062   // Setup
1063   if (!impl->asmb) {
1064     CeedCallBackend(CeedSingleOperatorAssembleSetup_Cuda(op, use_ceedsize_idx));
1065     assert(impl->asmb != NULL);
1066   }
1067 
1068   // Compute B^T D B
1069   const CeedInt num_elem       = impl->asmb->num_elem;
1070   const CeedInt elem_per_block = impl->asmb->elem_per_block;
1071   const CeedInt grid           = num_elem / elem_per_block + ((num_elem / elem_per_block * elem_per_block < num_elem) ? 1 : 0);
1072   void         *args[]         = {&impl->asmb->d_B_in, &impl->asmb->d_B_out, &qf_array, &values_array};
1073 
1074   CeedCallBackend(
1075       CeedRunKernelDim_Cuda(ceed, impl->asmb->linearAssemble, grid, impl->asmb->block_size_x, impl->asmb->block_size_y, elem_per_block, args));
1076 
1077   // Restore arrays
1078   CeedCallBackend(CeedVectorRestoreArray(values, &values_array));
1079   CeedCallBackend(CeedVectorRestoreArrayRead(assembled_qf, &qf_array));
1080 
1081   // Cleanup
1082   CeedCallBackend(CeedVectorDestroy(&assembled_qf));
1083   return CEED_ERROR_SUCCESS;
1084 }
1085 
1086 //------------------------------------------------------------------------------
1087 // Create operator
1088 //------------------------------------------------------------------------------
1089 int CeedOperatorCreate_Cuda(CeedOperator op) {
1090   Ceed               ceed;
1091   CeedOperator_Cuda *impl;
1092 
1093   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
1094   CeedCallBackend(CeedCalloc(1, &impl));
1095   CeedCallBackend(CeedOperatorSetData(op, impl));
1096 
1097   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction", CeedOperatorLinearAssembleQFunction_Cuda));
1098   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunctionUpdate", CeedOperatorLinearAssembleQFunctionUpdate_Cuda));
1099   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonal_Cuda));
1100   CeedCallBackend(
1101       CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddPointBlockDiagonal", CeedOperatorLinearAssembleAddPointBlockDiagonal_Cuda));
1102   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleSingle", CeedSingleOperatorAssemble_Cuda));
1103   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Cuda));
1104   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Cuda));
1105   return CEED_ERROR_SUCCESS;
1106 }
1107 
1108 //------------------------------------------------------------------------------
1109