xref: /libCEED/backends/magma/ceed-magma.h (revision 6574a04ff2135c3834f1b6ef9a4ec7566c4782db)
1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
// MAGMA backend functions and data structures specific to libCEED
9 #ifndef _ceed_magma_h
10 #define _ceed_magma_h
11 
12 #include <ceed.h>
13 #include <ceed/backend.h>
14 #include <magma_v2.h>
15 
// Maximum number of threads per block for the tensor basis kernels,
// by problem dimension
#define MAGMA_MAXTHREADS_1D 128
#define MAGMA_MAXTHREADS_2D 128
#define MAGMA_MAXTHREADS_3D 64
// Maximum number of threads per block for the non-tensor basis kernels
#define MAGMA_NONTENSOR_MAXTHREADS (128)

// Define macro for determining number of threads in y-direction
// for basis kernels
#define MAGMA_BASIS_NTCOL(x, maxt) (((maxt) < (x)) ? 1 : ((maxt) / (x)))
#define MAGMA_NONTENSOR_BASIS_NTCOL(N) (CeedIntMax(1, (MAGMA_NONTENSOR_MAXTHREADS / (N))))
// Integer ceiling division, ceil(A/B) for positive integer A, B
#define MAGMA_CEILDIV(A, B) (((A) + (B)-1) / (B))

// Upper bounds on P/Q for the custom non-tensor kernel path
// (presumably matches the instantiated kernel sizes -- confirm against the kernel sources)
#define MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P (40)
#define MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q (40)

// Define macro for computing the total threads in a block
// for use with __launch_bounds__()
// Fix: (x) is parenthesized so expression arguments (e.g. P*Q or a+b)
// expand with the intended precedence
#define MAGMA_BASIS_BOUNDS(x, maxt) ((x) * MAGMA_BASIS_NTCOL(x, maxt))

// Define macro for non-tensor kernel instances
#define MAGMA_NONTENSOR_KERNEL_INSTANCES (5)
#define MAGMA_NONTENSOR_N_VALUES 10240, 51200, 102400, 512000, 1024000
37 
// Map the generic "Magma" JIT compile/launch symbols onto either the HIP or
// the CUDA backend implementations, selected at build time.
#ifdef CEED_MAGMA_USE_HIP
typedef hipModule_t   CeedMagmaModule;
typedef hipFunction_t CeedMagmaFunction;
#define CeedCompileMagma CeedCompileHip
#define CeedGetKernelMagma CeedGetKernelHip
#define CeedRunKernelMagma CeedRunKernelHip
#define CeedRunKernelDimMagma CeedRunKernelDimHip
#define CeedRunKernelDimSharedMagma CeedRunKernelDimSharedHip
#else
typedef CUmodule   CeedMagmaModule;
typedef CUfunction CeedMagmaFunction;
#define CeedCompileMagma CeedCompileCuda
#define CeedGetKernelMagma CeedGetKernelCuda
#define CeedRunKernelMagma CeedRunKernelCuda
#define CeedRunKernelDimMagma CeedRunKernelDimCuda
#define CeedRunKernelDimSharedMagma CeedRunKernelDimSharedCuda
#endif
55 
// Selects whether basis kernels are built generically over the dimension or
// specialized for one specific dimension (the numeric values are opaque tags)
typedef enum { MAGMA_KERNEL_DIM_GENERIC = 101, MAGMA_KERNEL_DIM_SPECIFIC = 102 } magma_kernel_mode_t;
57 
// Backend data attached to a Ceed object
typedef struct {
  magma_kernel_mode_t basis_kernel_mode;  // generic vs dimension-specific basis kernels
  magma_device_t      device;             // MAGMA device handle
  magma_queue_t       queue;              // MAGMA queue used for backend operations
} Ceed_Magma;
63 
// Backend data for a tensor-product basis
typedef struct {
  CeedMagmaModule   module;           // compiled module that holds the kernels below
  CeedMagmaFunction magma_interp;     // interpolation kernel
  CeedMagmaFunction magma_interp_tr;  // transpose interpolation kernel
  CeedMagmaFunction magma_grad;       // gradient kernel
  CeedMagmaFunction magma_grad_tr;    // transpose gradient kernel
  CeedMagmaFunction magma_weight;     // quadrature-weight kernel
  // NOTE(review): the d- prefix suggests these are device-resident copies of
  // the 1D reference/interp/grad/qweight arrays -- confirm in the basis setup
  CeedScalar       *dqref1d;
  CeedScalar       *dinterp1d;
  CeedScalar       *dgrad1d;
  CeedScalar       *dqweight1d;
} CeedBasis_Magma;
76 
// Backend data for a non-tensor basis; one module/kernel set per
// pre-built instance (see MAGMA_NONTENSOR_KERNEL_INSTANCES / _N_VALUES)
typedef struct {
  CeedMagmaModule   module[MAGMA_NONTENSOR_KERNEL_INSTANCES];
  CeedMagmaFunction magma_interp_nontensor[MAGMA_NONTENSOR_KERNEL_INSTANCES];     // interpolation kernels
  CeedMagmaFunction magma_interp_tr_nontensor[MAGMA_NONTENSOR_KERNEL_INSTANCES];  // transpose interpolation kernels
  CeedMagmaFunction magma_grad_nontensor[MAGMA_NONTENSOR_KERNEL_INSTANCES];       // gradient kernels
  CeedMagmaFunction magma_grad_tr_nontensor[MAGMA_NONTENSOR_KERNEL_INSTANCES];    // transpose gradient kernels
  // NOTE(review): d- prefix suggests device-resident copies of the basis
  // arrays -- confirm in the basis setup code
  CeedScalar       *dqref;
  CeedScalar       *dinterp;
  CeedScalar       *dgrad;
  CeedScalar       *dqweight;
} CeedBasisNonTensor_Magma;
88 
// Ownership of the element-restriction offsets arrays
typedef enum {
  OWNED_NONE = 0,  // memory is not owned by this object
  OWNED_UNPINNED,  // owned; presumably pageable host memory -- confirm
  OWNED_PINNED,    // owned; presumably pinned host memory -- confirm
} OwnershipMode;
94 
// Backend data for an element restriction
typedef struct {
  CeedMagmaModule   module;              // compiled module holding the four kernels below
  CeedMagmaFunction StridedTranspose;    // strided restriction, transpose apply
  CeedMagmaFunction StridedNoTranspose;  // strided restriction, forward apply
  CeedMagmaFunction OffsetTranspose;     // offsets-based restriction, transpose apply
  CeedMagmaFunction OffsetNoTranspose;   // offsets-based restriction, forward apply
  CeedInt          *offsets;             // host offsets array
  CeedInt          *doffsets;            // d- prefix: presumably the device copy of offsets -- confirm
  OwnershipMode     own_;                // ownership of the offsets arrays (see OwnershipMode)
  int               down_;  // cover a case where we own Device memory
} CeedElemRestriction_Magma;
106 
// Backend data for a QFunction: arrays of input/output field pointers
typedef struct {
  const CeedScalar **inputs;     // per-field input data pointers
  CeedScalar       **outputs;    // per-field output data pointers
  bool               setupdone;  // true once setup has completed
} CeedQFunction_Magma;
112 
// Feature toggles enabling batched MAGMA code paths.
// NOTE(review): the consumers of these flags are not visible in this header --
// confirm where each USE_MAGMA_BATCH* toggle is honored before removing any.
#define USE_MAGMA_BATCH
#define USE_MAGMA_BATCH2
#define USE_MAGMA_BATCH3
#define USE_MAGMA_BATCH4
117 
// Launch the non-tensor quadrature-weight kernel: writes weights from dqweight
// into dv for nelem elements with Q quadrature points each, on `queue`.
CEED_INTERN void magma_weight_nontensor(magma_int_t grid, magma_int_t threads, magma_int_t nelem, magma_int_t Q, CeedScalar *dqweight, CeedScalar *dv,
                                        magma_queue_t queue);

// GEMM wrapper for the non-tensor basis paths:
// dC = alpha * op(dA) * op(dB) + beta * dC (standard BLAS-style arguments)
CEED_INTERN int magma_gemm_nontensor(magma_trans_t transA, magma_trans_t transB, magma_int_t m, magma_int_t n, magma_int_t k, CeedScalar alpha,
                                     const CeedScalar *dA, magma_int_t ldda, const CeedScalar *dB, magma_int_t lddb, CeedScalar beta, CeedScalar *dC,
                                     magma_int_t lddc, magma_queue_t queue);

// Heuristic GEMM selection: writes a batch size to *nbatch and a MAGMA-vs-other
// decision to *use_magma for the given architecture, precision, and problem size
CEED_INTERN void gemm_selector(int gpu_arch, char precision, char transA, int m, int n, int k, int *nbatch, int *use_magma);

// Returns a blocking size (nb) for the runtime-compiled non-tensor kernels
// -- presumably tuned per architecture/precision/mode; confirm in its definition
CEED_INTERN CeedInt nontensor_rtc_get_nb(int gpu_arch, char precision, CeedEvalMode emode, CeedTransposeMode tmode, int P_, int N, int Q_);

// Returns whether A is a device pointer (nonzero if device-resident -- confirm)
CEED_INTERN magma_int_t magma_isdevptr(const void *A);

// Backend implementation of tensor-product H1 basis creation
CEED_INTERN int CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P1d, CeedInt Q1d, const CeedScalar *interp1d, const CeedScalar *grad1d,
                                              const CeedScalar *qref1d, const CeedScalar *qweight1d, CeedBasis basis);

// Backend implementation of non-tensor H1 basis creation
CEED_INTERN int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt ndof, CeedInt nqpts, const CeedScalar *interp,
                                        const CeedScalar *grad, const CeedScalar *qref, const CeedScalar *qweight, CeedBasis basis);

// Backend implementation of element-restriction creation
CEED_INTERN int CeedElemRestrictionCreate_Magma(CeedMemType mtype, CeedCopyMode cmode, const CeedInt *offsets, CeedElemRestriction r);

// Backend implementation of blocked element-restriction creation
CEED_INTERN int CeedElemRestrictionCreateBlocked_Magma(const CeedMemType mtype, const CeedCopyMode cmode, const CeedInt *offsets,
                                                       const CeedElemRestriction res);
141 
// Route device-pointer checks to this backend's magma_isdevptr;
// comment the line below to use the default magma_is_devptr function
#define magma_is_devptr magma_isdevptr

// if magma and cuda/ref are using the null stream, then ceed_magma_queue_sync
// should do nothing, so it expands to nothing
#define ceed_magma_queue_sync(...)

// batch stride, override using -DMAGMA_BATCH_STRIDE=<desired-value>
#ifndef MAGMA_BATCH_STRIDE
#define MAGMA_BATCH_STRIDE (1000)
#endif

#endif  // ceed-magma.h
155