xref: /libCEED/backends/magma/ceed-magma.h (revision b0d170e7bc2e5c930ee481a47eb73044935a48a4)
1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 // magma functions specific to ceed
9 #ifndef _ceed_magma_h
10 #define _ceed_magma_h
11 
12 #include <ceed/backend.h>
13 #include <ceed/ceed.h>
14 #include <magma_v2.h>
15 
// Per-dimension caps on threads per block for basis kernels
#define MAGMA_MAXTHREADS_1D 128
#define MAGMA_MAXTHREADS_2D 128
#define MAGMA_MAXTHREADS_3D 64
// Define macro for determining number of threads in y-direction
// for basis kernels: how many columns of size (x) fit under the
// thread cap (maxt); at least 1 column even when (x) > (maxt)
#define MAGMA_BASIS_NTCOL(x, maxt) (((maxt) < (x)) ? 1 : ((maxt) / (x)))
// Define macro for computing the total threads in a block
// for use with __launch_bounds__()
// Fix: (x) must be parenthesized so that expressions such as
// MAGMA_BASIS_BOUNDS(a + b, maxt) expand with the intended precedence
#define MAGMA_BASIS_BOUNDS(x, maxt) ((x) * MAGMA_BASIS_NTCOL(x, maxt))
25 
// Map the backend-generic module/kernel types and JIT helpers onto the
// GPU runtime actually being compiled for: HIP when CEED_MAGMA_USE_HIP
// is defined, otherwise the CUDA driver API.
#ifdef CEED_MAGMA_USE_HIP
typedef hipModule_t   CeedMagmaModule;
typedef hipFunction_t CeedMagmaFunction;
#define CeedCompileMagma CeedCompileHip
#define CeedGetKernelMagma CeedGetKernelHip
#define CeedRunKernelMagma CeedRunKernelHip
#define CeedRunKernelDimMagma CeedRunKernelDimHip
#define CeedRunKernelDimSharedMagma CeedRunKernelDimSharedHip
#else
typedef CUmodule   CeedMagmaModule;
typedef CUfunction CeedMagmaFunction;
#define CeedCompileMagma CeedCompileCuda
#define CeedGetKernelMagma CeedGetKernelCuda
#define CeedRunKernelMagma CeedRunKernelCuda
#define CeedRunKernelDimMagma CeedRunKernelDimCuda
#define CeedRunKernelDimSharedMagma CeedRunKernelDimSharedCuda
#endif  // CEED_MAGMA_USE_HIP
43 
44 typedef enum { MAGMA_KERNEL_DIM_GENERIC = 101, MAGMA_KERNEL_DIM_SPECIFIC = 102 } magma_kernel_mode_t;
45 
// Backend-wide context attached to the Ceed object.
typedef struct {
  magma_kernel_mode_t basis_kernel_mode;  // generic vs. dimension-specific basis kernels
  magma_device_t      device;             // MAGMA device handle
  magma_queue_t       queue;              // MAGMA queue used for this backend's kernel launches
} Ceed_Magma;
51 
// Backend data for a tensor-product H1 basis: compiled kernels plus the
// 1D basis arrays.  The `d` prefix on the CeedScalar arrays presumably
// marks device-resident copies -- TODO confirm against the allocation site.
typedef struct {
  CeedMagmaModule   module;           // compiled kernel module for this basis
  CeedMagmaFunction magma_interp;     // interpolation kernel
  CeedMagmaFunction magma_interp_tr;  // transpose (quadrature -> nodes) interpolation kernel
  CeedMagmaFunction magma_grad;       // gradient kernel
  CeedMagmaFunction magma_grad_tr;    // transpose gradient kernel
  CeedMagmaFunction magma_weight;     // quadrature-weight kernel
  CeedScalar       *dqref1d;          // 1D reference (quadrature) coordinates
  CeedScalar       *dinterp1d;        // 1D interpolation matrix
  CeedScalar       *dgrad1d;          // 1D gradient matrix
  CeedScalar       *dqweight1d;       // 1D quadrature weights
} CeedBasis_Magma;
64 
// Backend data for a non-tensor H1 basis: full (non-factored) basis arrays.
// As above, the `d` prefix presumably marks device memory -- TODO confirm.
typedef struct {
  CeedScalar *dqref;     // reference (quadrature) coordinates
  CeedScalar *dinterp;   // interpolation matrix
  CeedScalar *dgrad;     // gradient matrix
  CeedScalar *dqweight;  // quadrature weights
} CeedBasisNonTensor_Magma;
71 
// Ownership of backend-held memory (used for the element-restriction
// offsets array).  Values made explicit; they match the original
// implicit 0, 1, 2 sequence.
typedef enum {
  OWNED_NONE     = 0,  // not owned by the backend
  OWNED_UNPINNED = 1,  // owned; ordinary allocation
  OWNED_PINNED   = 2,  // owned; pinned allocation (presumably page-locked host memory)
} OwnershipMode;
77 
// Backend data for an element restriction: one kernel per combination of
// (strided vs. offset-indexed) x (transpose vs. no-transpose) application,
// plus host and device copies of the offsets array.
typedef struct {
  CeedMagmaModule   module;              // compiled kernel module
  CeedMagmaFunction StridedTranspose;    // strided restriction, transpose apply
  CeedMagmaFunction StridedNoTranspose;  // strided restriction, forward apply
  CeedMagmaFunction OffsetTranspose;     // offset-indexed restriction, transpose apply
  CeedMagmaFunction OffsetNoTranspose;   // offset-indexed restriction, forward apply
  CeedInt          *offsets;             // offsets array (presumably the host copy -- TODO confirm)
  CeedInt          *doffsets;            // device copy of the offsets array
  OwnershipMode     own_;                // ownership of the host offsets array
  int               down_;               // cover a case where we own Device memory
} CeedElemRestriction_Magma;
89 
// Backend data for a QFunction: arrays of per-field input/output pointers
// handed to the user QFunction, plus a one-time setup flag.
typedef struct {
  const CeedScalar **inputs;     // per-field input pointers
  CeedScalar       **outputs;    // per-field output pointers
  bool               setupdone;  // true once the pointer arrays have been set up
} CeedQFunction_Magma;
95 
// Compile-time toggles enabling the batched MAGMA code paths; presumably
// consumed via #ifdef in the backend implementation files (the macros carry
// no value) -- TODO confirm which paths each flag gates.
#define USE_MAGMA_BATCH
#define USE_MAGMA_BATCH2
#define USE_MAGMA_BATCH3
#define USE_MAGMA_BATCH4
100 
// Launch the non-tensor quadrature-weight kernel: writes weights from
// dqweight into dv for nelem elements with Q quadrature points each,
// on `queue`.  (Pointers are presumably device memory -- TODO confirm.)
CEED_INTERN void magma_weight_nontensor(magma_int_t grid, magma_int_t threads, magma_int_t nelem, magma_int_t Q, CeedScalar *dqweight, CeedScalar *dv,
                                        magma_queue_t queue);

// BLAS-style double-precision GEMM used by the non-tensor basis path:
// dC = alpha * op(dA) * op(dB) + beta * dC, with leading dimensions
// ldda/lddb/lddc, executed asynchronously on `queue`.  Returns an error
// code (0 on success -- TODO confirm convention at the definition).
CEED_INTERN int magma_dgemm_nontensor(magma_trans_t transA, magma_trans_t transB, magma_int_t m, magma_int_t n, magma_int_t k, double alpha,
                                      const double *dA, magma_int_t ldda, const double *dB, magma_int_t lddb, double beta, double *dC,
                                      magma_int_t lddc, magma_queue_t queue);

// Single-precision counterpart of magma_dgemm_nontensor.
CEED_INTERN int magma_sgemm_nontensor(magma_trans_t transA, magma_trans_t transB, magma_int_t m, magma_int_t n, magma_int_t k, float alpha,
                                      const float *dA, magma_int_t ldda, const float *dB, magma_int_t lddb, float beta, float *dC, magma_int_t lddc,
                                      magma_queue_t queue);

// Heuristic GEMM-path selector: given GPU architecture, precision tag,
// transpose flag, and problem shape (m, n, k), outputs a batch size
// (*nbatch) and whether to take the MAGMA path (*use_magma).
CEED_INTERN void gemm_selector(int gpu_arch, char precision, char transA, int m, int n, int k, int *nbatch, int *use_magma);

// Query whether A points to device memory; returns a magma_int_t flag
// (nonzero presumably means device pointer -- TODO confirm).
CEED_INTERN magma_int_t magma_isdevptr(const void *A);
115 
// Backend constructors registered with the Ceed object; each populates the
// corresponding Ceed* handle with MAGMA backend data and returns a CeedError
// code (0 on success).

// Create a tensor-product H1 basis from 1D interpolation/gradient matrices
// and 1D quadrature data.
CEED_INTERN int CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P1d, CeedInt Q1d, const CeedScalar *interp1d, const CeedScalar *grad1d,
                                              const CeedScalar *qref1d, const CeedScalar *qweight1d, CeedBasis basis);

// Create a non-tensor H1 basis for element topology `topo` from full
// interpolation/gradient matrices and quadrature data.
CEED_INTERN int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt ndof, CeedInt nqpts, const CeedScalar *interp,
                                        const CeedScalar *grad, const CeedScalar *qref, const CeedScalar *qweight, CeedBasis basis);

// Create an element restriction from an offsets array located in `mtype`
// memory, copied/borrowed per `cmode`.
CEED_INTERN int CeedElemRestrictionCreate_Magma(CeedMemType mtype, CeedCopyMode cmode, const CeedInt *offsets, CeedElemRestriction r);

// Blocked variant of the element-restriction constructor.
CEED_INTERN int CeedElemRestrictionCreateBlocked_Magma(const CeedMemType mtype, const CeedCopyMode cmode, const CeedInt *offsets,
                                                       const CeedElemRestriction res);

// Create the MAGMA operator implementation for `op`.
CEED_INTERN int CeedOperatorCreate_Magma(CeedOperator op);
128 
// Route magma_is_devptr to this backend's implementation; comment the line
// below to use the default magma_is_devptr function
#define magma_is_devptr magma_isdevptr

// if magma and cuda/ref are using the null stream, then ceed_magma_queue_sync
// should do nothing (the macro expands to nothing, discarding its arguments)
#define ceed_magma_queue_sync(...)

// batch stride, override using -DMAGMA_BATCH_STRIDE=<desired-value>
#ifndef MAGMA_BATCH_STRIDE
#define MAGMA_BATCH_STRIDE (1000)
#endif
140 
#endif  // end include guard for ceed-magma.h
142