xref: /libCEED/backends/magma/ceed-magma-basis.c (revision dc8efd83546faf0200bf0bfcfb1678fae1874cc5)
1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
3 // All Rights reserved. See files LICENSE and NOTICE for details.
4 //
5 // This file is part of CEED, a collection of benchmarks, miniapps, software
6 // libraries and APIs for efficient high-order finite element and spectral
7 // element discretizations for exascale applications. For more information and
8 // source code availability see http://github.com/ceed.
9 //
10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11 // a collaborative effort of two U.S. Department of Energy organizations (Office
12 // of Science and the National Nuclear Security Administration) responsible for
13 // the planning and preparation of a capable exascale ecosystem, including
14 // software, applications, hardware, advanced system engineering and early
15 // testbed platforms, in support of the nation's exascale computing imperative.
16 
17 #include <ceed.h>
18 #include <ceed-backend.h>
19 #include "ceed-magma.h"
20 
21 #ifdef __cplusplus
22 CEED_INTERN "C"
23 #endif
24 int CeedBasisApply_Magma(CeedBasis basis, CeedInt nelem,
25                          CeedTransposeMode tmode, CeedEvalMode emode,
26                          CeedVector U, CeedVector V) {
27   int ierr;
28   Ceed ceed;
29   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
30   CeedInt dim, ncomp, ndof;
31   ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
32   ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChkBackend(ierr);
33   ierr = CeedBasisGetNumNodes(basis, &ndof); CeedChkBackend(ierr);
34 
35   Ceed_Magma *data;
36   ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr);
37 
38   const CeedScalar *u;
39   CeedScalar *v;
40   if (emode != CEED_EVAL_WEIGHT) {
41     ierr = CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &u); CeedChkBackend(ierr);
42   } else if (emode != CEED_EVAL_WEIGHT) {
43     // LCOV_EXCL_START
44     return CeedError(ceed, CEED_ERROR_BACKEND,
45                      "An input vector is required for this CeedEvalMode");
46     // LCOV_EXCL_STOP
47   }
48   ierr = CeedVectorGetArray(V, CEED_MEM_DEVICE, &v); CeedChkBackend(ierr);
49 
50   CeedBasis_Magma *impl;
51   ierr = CeedBasisGetData(basis, &impl); CeedChkBackend(ierr);
52 
53   CeedInt P1d, Q1d;
54   ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
55   ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr);
56 
57   CeedDebug("\033[01m[CeedBasisApply_Magma] vsize=%d, comp = %d",
58             ncomp*CeedIntPow(P1d, dim), ncomp);
59 
60   if (tmode == CEED_TRANSPOSE) {
61     CeedInt length;
62     ierr = CeedVectorGetLength(V, &length); CeedChkBackend(ierr);
63     magmablas_dlaset(MagmaFull, length, 1, 0., 0., v, length, data->queue);
64     ceed_magma_queue_sync( data->queue );
65   }
66   switch (emode) {
67   case CEED_EVAL_INTERP: {
68     CeedInt P = P1d, Q = Q1d;
69     if (tmode == CEED_TRANSPOSE) {
70       P = Q1d; Q = P1d;
71     }
72 
73     // Define element sizes for dofs/quad
74     CeedInt elquadsize = CeedIntPow(Q1d, dim);
75     CeedInt eldofssize = CeedIntPow(P1d, dim);
76 
77     // E-vector ordering -------------- Q-vector ordering
78     //  component                        component
79     //    elem                             elem
80     //       node                            node
81 
82     // ---  Define strides for NOTRANSPOSE mode: ---
83     // Input (u) is E-vector, output (v) is Q-vector
84 
85     // Element strides
86     CeedInt u_elstride = eldofssize;
87     CeedInt v_elstride = elquadsize;
88     // Component strides
89     CeedInt u_compstride = nelem * eldofssize;
90     CeedInt v_compstride = nelem * elquadsize;
91 
92     // ---  Swap strides for TRANSPOSE mode: ---
93     if (tmode == CEED_TRANSPOSE) {
94       // Input (u) is Q-vector, output (v) is E-vector
95       // Element strides
96       v_elstride = eldofssize;
97       u_elstride = elquadsize;
98       // Component strides
99       v_compstride = nelem * eldofssize;
100       u_compstride = nelem * elquadsize;
101     }
102 
103     ierr = magma_interp(P, Q, dim, ncomp,
104                         impl->dinterp1d, tmode,
105                         u, u_elstride, u_compstride,
106                         v, v_elstride, v_compstride,
107                         nelem, data->basis_kernel_mode, data->maxthreads,
108                         data->queue);
109     if (ierr != 0) CeedError(ceed, CEED_ERROR_BACKEND,
110                                "MAGMA: launch failure detected for magma_interp");
111   }
112   break;
113   case CEED_EVAL_GRAD: {
114     CeedInt P = P1d, Q = Q1d;
115     // In CEED_NOTRANSPOSE mode:
116     // u is (P^dim x nc), column-major layout (nc = ncomp)
117     // v is (Q^dim x nc x dim), column-major layout (nc = ncomp)
118     // In CEED_TRANSPOSE mode, the sizes of u and v are switched.
119     if (tmode == CEED_TRANSPOSE) {
120       P = Q1d, Q = P1d;
121     }
122 
123     // Define element sizes for dofs/quad
124     CeedInt elquadsize = CeedIntPow(Q1d, dim);
125     CeedInt eldofssize = CeedIntPow(P1d, dim);
126 
127     // E-vector ordering -------------- Q-vector ordering
128     //                                  dim
129     //  component                        component
130     //    elem                              elem
131     //       node                            node
132 
133 
134     // ---  Define strides for NOTRANSPOSE mode: ---
135     // Input (u) is E-vector, output (v) is Q-vector
136 
137     // Element strides
138     CeedInt u_elstride = eldofssize;
139     CeedInt v_elstride = elquadsize;
140     // Component strides
141     CeedInt u_compstride = nelem * eldofssize;
142     CeedInt v_compstride = nelem * elquadsize;
143     // Dimension strides
144     CeedInt u_dimstride = 0;
145     CeedInt v_dimstride = nelem * elquadsize * ncomp;
146 
147     // ---  Swap strides for TRANSPOSE mode: ---
148     if (tmode == CEED_TRANSPOSE) {
149       // Input (u) is Q-vector, output (v) is E-vector
150       // Element strides
151       v_elstride = eldofssize;
152       u_elstride = elquadsize;
153       // Component strides
154       v_compstride = nelem * eldofssize;
155       u_compstride = nelem * elquadsize;
156       // Dimension strides
157       v_dimstride = 0;
158       u_dimstride = nelem * elquadsize * ncomp;
159 
160     }
161 
162     ierr = magma_grad( P, Q, dim, ncomp,
163                        impl->dinterp1d, impl->dgrad1d, tmode,
164                        u, u_elstride, u_compstride, u_dimstride,
165                        v, v_elstride, v_compstride, v_dimstride,
166                        nelem, data->basis_kernel_mode, data->maxthreads,
167                        data->queue);
168     if (ierr != 0) CeedError(ceed, CEED_ERROR_BACKEND,
169                                "MAGMA: launch failure detected for magma_grad");
170   }
171   break;
172   case CEED_EVAL_WEIGHT: {
173     if (tmode == CEED_TRANSPOSE)
174       // LCOV_EXCL_START
175       return CeedError(ceed, CEED_ERROR_BACKEND,
176                        "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE");
177     // LCOV_EXCL_STOP
178     CeedInt Q = Q1d;
179     int eldofssize = CeedIntPow(Q, dim);
180     ierr = magma_weight(Q, dim, impl->dqweight1d, v, eldofssize, nelem,
181                         data->basis_kernel_mode, data->maxthreads, data->queue);
182     if (ierr != 0) CeedError(ceed, CEED_ERROR_BACKEND,
183                                "MAGMA: launch failure detected for magma_weight");
184   }
185   break;
186   // LCOV_EXCL_START
187   case CEED_EVAL_DIV:
188     return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported");
189   case CEED_EVAL_CURL:
190     return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported");
191   case CEED_EVAL_NONE:
192     return CeedError(ceed, CEED_ERROR_BACKEND,
193                      "CEED_EVAL_NONE does not make sense in this context");
194     // LCOV_EXCL_STOP
195   }
196 
197   // must sync to ensure completeness
198   ceed_magma_queue_sync( data->queue );
199 
200   if (emode!=CEED_EVAL_WEIGHT) {
201     ierr = CeedVectorRestoreArrayRead(U, &u); CeedChkBackend(ierr);
202   }
203   ierr = CeedVectorRestoreArray(V, &v); CeedChkBackend(ierr);
204   return CEED_ERROR_SUCCESS;
205 }
206 
207 #ifdef __cplusplus
208 CEED_INTERN "C"
209 #endif
210 int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt nelem,
211                                   CeedTransposeMode tmode, CeedEvalMode emode,
212                                   CeedVector U, CeedVector V) {
213   int ierr;
214   Ceed ceed;
215   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
216 
217   Ceed_Magma *data;
218   ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr);
219 
220   CeedInt dim, ncomp, ndof, nqpt;
221   ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
222   ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChkBackend(ierr);
223   ierr = CeedBasisGetNumNodes(basis, &ndof); CeedChkBackend(ierr);
224   ierr = CeedBasisGetNumQuadraturePoints(basis, &nqpt); CeedChkBackend(ierr);
225   const CeedScalar *du;
226   CeedScalar *dv;
227   if (emode != CEED_EVAL_WEIGHT) {
228     ierr = CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &du); CeedChkBackend(ierr);
229   } else if (emode != CEED_EVAL_WEIGHT) {
230     // LCOV_EXCL_START
231     return CeedError(ceed, CEED_ERROR_BACKEND,
232                      "An input vector is required for this CeedEvalMode");
233     // LCOV_EXCL_STOP
234   }
235   ierr = CeedVectorGetArray(V, CEED_MEM_DEVICE, &dv); CeedChkBackend(ierr);
236 
237   CeedBasisNonTensor_Magma *impl;
238   ierr = CeedBasisGetData(basis, &impl); CeedChkBackend(ierr);
239 
240   CeedDebug("\033[01m[CeedBasisApplyNonTensor_Magma] vsize=%d, comp = %d",
241             ncomp*ndof, ncomp);
242 
243   if (tmode == CEED_TRANSPOSE) {
244     CeedInt length;
245     ierr = CeedVectorGetLength(V, &length);
246     magmablas_dlaset(MagmaFull, length, 1, 0., 0., dv, length, data->queue);
247     ceed_magma_queue_sync( data->queue );
248   }
249   switch (emode) {
250   case CEED_EVAL_INTERP: {
251     CeedInt P = ndof, Q = nqpt;
252     if (tmode == CEED_TRANSPOSE)
253       magma_dgemm_nontensor(MagmaNoTrans, MagmaNoTrans,
254                             P, nelem*ncomp, Q,
255                             1.0, impl->dinterp, P,
256                             du, Q,
257                             0.0, dv, P, data->queue);
258     else
259       magma_dgemm_nontensor(MagmaTrans, MagmaNoTrans,
260                             Q, nelem*ncomp, P,
261                             1.0, impl->dinterp, P,
262                             du, P,
263                             0.0, dv, Q, data->queue);
264   }
265   break;
266 
267   case CEED_EVAL_GRAD: {
268     CeedInt P = ndof, Q = nqpt;
269     if (tmode == CEED_TRANSPOSE) {
270       double beta = 0.0;
271       for(int d=0; d<dim; d++) {
272         if (d>0)
273           beta = 1.0;
274         magma_dgemm_nontensor(MagmaNoTrans, MagmaNoTrans,
275                               P, nelem*ncomp, Q,
276                               1.0, impl->dgrad + d*P*Q, P,
277                               du + d*nelem*ncomp*Q, Q,
278                               beta, dv, P, data->queue);
279       }
280     } else {
281       for(int d=0; d< dim; d++)
282         magma_dgemm_nontensor(MagmaTrans, MagmaNoTrans,
283                               Q, nelem*ncomp, P,
284                               1.0, impl->dgrad + d*P*Q, P,
285                               du, P,
286                               0.0, dv + d*nelem*ncomp*Q, Q, data->queue);
287     }
288   }
289   break;
290 
291   case CEED_EVAL_WEIGHT: {
292     if (tmode == CEED_TRANSPOSE)
293       // LCOV_EXCL_START
294       return CeedError(ceed, CEED_ERROR_BACKEND,
295                        "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE");
296     // LCOV_EXCL_STOP
297 
298     int elemsPerBlock = 1;//basis->Q1d < 7 ? optElems[basis->Q1d] : 1;
299     int grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem)?
300                                        1 : 0 );
301     magma_weight_nontensor(grid, nqpt, nelem, nqpt, impl->dqweight, dv,
302                            data->queue);
303     CeedChkBackend(ierr);
304   }
305   break;
306 
307   // LCOV_EXCL_START
308   case CEED_EVAL_DIV:
309     return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported");
310   case CEED_EVAL_CURL:
311     return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported");
312   case CEED_EVAL_NONE:
313     return CeedError(ceed, CEED_ERROR_BACKEND,
314                      "CEED_EVAL_NONE does not make sense in this context");
315     // LCOV_EXCL_STOP
316   }
317 
318   // must sync to ensure completeness
319   ceed_magma_queue_sync( data->queue );
320 
321   if (emode!=CEED_EVAL_WEIGHT) {
322     ierr = CeedVectorRestoreArrayRead(U, &du); CeedChkBackend(ierr);
323   }
324   ierr = CeedVectorRestoreArray(V, &dv); CeedChkBackend(ierr);
325   return CEED_ERROR_SUCCESS;
326 }
327 
328 #ifdef __cplusplus
329 CEED_INTERN "C"
330 #endif
331 int CeedBasisDestroy_Magma(CeedBasis basis) {
332   int ierr;
333   CeedBasis_Magma *impl;
334   ierr = CeedBasisGetData(basis, &impl); CeedChkBackend(ierr);
335 
336   ierr = magma_free(impl->dqref1d); CeedChkBackend(ierr);
337   ierr = magma_free(impl->dinterp1d); CeedChkBackend(ierr);
338   ierr = magma_free(impl->dgrad1d); CeedChkBackend(ierr);
339   ierr = magma_free(impl->dqweight1d); CeedChkBackend(ierr);
340 
341   ierr = CeedFree(&impl); CeedChkBackend(ierr);
342 
343   return CEED_ERROR_SUCCESS;
344 }
345 
346 #ifdef __cplusplus
347 CEED_INTERN "C"
348 #endif
349 int CeedBasisDestroyNonTensor_Magma(CeedBasis basis) {
350   int ierr;
351   CeedBasisNonTensor_Magma *impl;
352   ierr = CeedBasisGetData(basis, &impl); CeedChkBackend(ierr);
353 
354   ierr = magma_free(impl->dqref); CeedChkBackend(ierr);
355   ierr = magma_free(impl->dinterp); CeedChkBackend(ierr);
356   ierr = magma_free(impl->dgrad); CeedChkBackend(ierr);
357   ierr = magma_free(impl->dqweight); CeedChkBackend(ierr);
358 
359   ierr = CeedFree(&impl); CeedChkBackend(ierr);
360 
361   return CEED_ERROR_SUCCESS;
362 }
363 
364 #ifdef __cplusplus
365 CEED_INTERN "C"
366 #endif
367 int CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P1d, CeedInt Q1d,
368                                   const CeedScalar *interp1d,
369                                   const CeedScalar *grad1d,
370                                   const CeedScalar *qref1d,
371                                   const CeedScalar *qweight1d, CeedBasis basis) {
372   int ierr;
373   CeedBasis_Magma *impl;
374   Ceed ceed;
375   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
376 
377   // Check for supported parameters
378   CeedInt ncomp = 0;
379   ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChkBackend(ierr);
380   if (ncomp > 3)
381     // LCOV_EXCL_START
382     return CeedError(ceed, CEED_ERROR_BACKEND,
383                      "Magma backend does not support tensor bases with more than 3 components");
384   // LCOV_EXCL_STOP
385   if (P1d > 10)
386     // LCOV_EXCL_START
387     return CeedError(ceed, CEED_ERROR_BACKEND,
388                      "Magma backend does not support tensor bases with more than 10 nodes in each dimension");
389   // LCOV_EXCL_STOP
390   if (Q1d > 10)
391     // LCOV_EXCL_START
392     return CeedError(ceed, CEED_ERROR_BACKEND,
393                      "Magma backend does not support tensor bases with more than 10 quadrature points in each dimension");
394   // LCOV_EXCL_STOP
395 
396   Ceed_Magma *data;
397   ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr);
398 
399   ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply",
400                                 CeedBasisApply_Magma); CeedChkBackend(ierr);
401   ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy",
402                                 CeedBasisDestroy_Magma); CeedChkBackend(ierr);
403 
404   ierr = CeedCalloc(1,&impl); CeedChkBackend(ierr);
405   ierr = CeedBasisSetData(basis, impl); CeedChkBackend(ierr);
406 
407   // Copy qref1d to the GPU
408   ierr = magma_malloc((void **)&impl->dqref1d, Q1d*sizeof(qref1d[0]));
409   CeedChkBackend(ierr);
410   magma_setvector(Q1d, sizeof(qref1d[0]), qref1d, 1, impl->dqref1d, 1,
411                   data->queue);
412 
413   // Copy interp1d to the GPU
414   ierr = magma_malloc((void **)&impl->dinterp1d, Q1d*P1d*sizeof(interp1d[0]));
415   CeedChkBackend(ierr);
416   magma_setvector(Q1d*P1d, sizeof(interp1d[0]), interp1d, 1, impl->dinterp1d, 1,
417                   data->queue);
418 
419   // Copy grad1d to the GPU
420   ierr = magma_malloc((void **)&impl->dgrad1d, Q1d*P1d*sizeof(grad1d[0]));
421   CeedChkBackend(ierr);
422   magma_setvector(Q1d*P1d, sizeof(grad1d[0]), grad1d, 1, impl->dgrad1d, 1,
423                   data->queue);
424 
425   // Copy qweight1d to the GPU
426   ierr = magma_malloc((void **)&impl->dqweight1d, Q1d*sizeof(qweight1d[0]));
427   CeedChkBackend(ierr);
428   magma_setvector(Q1d, sizeof(qweight1d[0]), qweight1d, 1, impl->dqweight1d, 1,
429                   data->queue);
430 
431   return CEED_ERROR_SUCCESS;
432 }
433 
434 #ifdef __cplusplus
435 CEED_INTERN "C"
436 #endif
437 int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt ndof,
438                             CeedInt nqpts, const CeedScalar *interp,
439                             const CeedScalar *grad, const CeedScalar *qref,
440                             const CeedScalar *qweight, CeedBasis basis) {
441   int ierr;
442   CeedBasisNonTensor_Magma *impl;
443   Ceed ceed;
444   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
445 
446   Ceed_Magma *data;
447   ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr);
448 
449   ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply",
450                                 CeedBasisApplyNonTensor_Magma); CeedChkBackend(ierr);
451   ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy",
452                                 CeedBasisDestroyNonTensor_Magma); CeedChkBackend(ierr);
453 
454   ierr = CeedCalloc(1,&impl); CeedChkBackend(ierr);
455   ierr = CeedBasisSetData(basis, impl); CeedChkBackend(ierr);
456 
457   // Copy qref to the GPU
458   ierr = magma_malloc((void **)&impl->dqref, nqpts*sizeof(qref[0]));
459   CeedChkBackend(ierr);
460   magma_setvector(nqpts, sizeof(qref[0]), qref, 1, impl->dqref, 1, data->queue);
461 
462   // Copy interp to the GPU
463   ierr = magma_malloc((void **)&impl->dinterp, nqpts*ndof*sizeof(interp[0]));
464   CeedChkBackend(ierr);
465   magma_setvector(nqpts*ndof, sizeof(interp[0]), interp, 1, impl->dinterp, 1,
466                   data->queue);
467 
468   // Copy grad to the GPU
469   ierr = magma_malloc((void **)&impl->dgrad, nqpts*ndof*dim*sizeof(grad[0]));
470   CeedChkBackend(ierr);
471   magma_setvector(nqpts*ndof*dim, sizeof(grad[0]), grad, 1, impl->dgrad, 1,
472                   data->queue);
473 
474   // Copy qweight to the GPU
475   ierr = magma_malloc((void **)&impl->dqweight, nqpts*sizeof(qweight[0]));
476   CeedChkBackend(ierr);
477   magma_setvector(nqpts, sizeof(qweight[0]), qweight, 1, impl->dqweight, 1,
478                   data->queue);
479 
480   return CEED_ERROR_SUCCESS;
481 }
482