xref: /libCEED/rust/libceed-sys/c-src/backends/magma/ceed-magma.h (revision 58549094d8a305d0f4b066b44680cf34cff212e7)
1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 // magma functions specific to ceed
9 #ifndef _ceed_magma_h
10 #define _ceed_magma_h
11 
12 #include <ceed.h>
13 #include <ceed/backend.h>
14 #include <magma_v2.h>
15 
16 #define MAGMA_MAXTHREADS_1D 128
17 #define MAGMA_MAXTHREADS_2D 128
18 #define MAGMA_MAXTHREADS_3D 64
19 #define MAGMA_NONTENSOR_MAXTHREADS (128)
20 
21 // Define macro for determining number of threads in y-direction
22 // for basis kernels
23 #define MAGMA_BASIS_NTCOL(x, maxt) (((maxt) < (x)) ? 1 : ((maxt) / (x)))
24 #define MAGMA_NONTENSOR_BASIS_NTCOL(N) (CeedIntMax(1, (MAGMA_NONTENSOR_MAXTHREADS / (N))))
25 #define MAGMA_CEILDIV(A, B) (((A) + (B)-1) / (B))
26 
27 #define MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P (40)
28 #define MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q (40)
29 
30 // Define macro for computing the total threads in a block
31 // for use with __launch_bounds__()
32 #define MAGMA_BASIS_BOUNDS(x, maxt) (x * MAGMA_BASIS_NTCOL(x, maxt))
33 
34 // Define macro for non-tensor kernel instances
35 #define MAGMA_NONTENSOR_KERNEL_INSTANCES (5)
36 #define MAGMA_NONTENSOR_N_VALUES 10240, 51200, 102400, 512000, 1024000
37 
38 #ifdef CEED_MAGMA_USE_HIP
39 typedef hipModule_t   CeedMagmaModule;
40 typedef hipFunction_t CeedMagmaFunction;
41 #define CeedCompileMagma CeedCompile_Hip
42 #define CeedGetKernelMagma CeedGetKernel_Hip
43 #define CeedRunKernelMagma CeedRunKernel_Hip
44 #define CeedRunKernelDimMagma CeedRunKernelDim_Hip
45 #define CeedRunKernelDimSharedMagma CeedRunKernelDimShared_Hip
46 #else
47 typedef CUmodule   CeedMagmaModule;
48 typedef CUfunction CeedMagmaFunction;
49 #define CeedCompileMagma CeedCompile_Cuda
50 #define CeedGetKernelMagma CeedGetKernel_Cuda
51 #define CeedRunKernelMagma CeedRunKernel_Cuda
52 #define CeedRunKernelDimMagma CeedRunKernelDim_Cuda
53 #define CeedRunKernelDimSharedMagma CeedRunKernelDimShared_Cuda
54 #endif
55 
56 typedef struct {
57   magma_device_t device_id;
58   magma_queue_t  queue;
59 } Ceed_Magma;
60 
61 typedef struct {
62   CeedMagmaModule   module;
63   CeedMagmaFunction magma_interp;
64   CeedMagmaFunction magma_interp_tr;
65   CeedMagmaFunction magma_grad;
66   CeedMagmaFunction magma_grad_tr;
67   CeedMagmaFunction magma_weight;
68   CeedScalar       *dqref1d;
69   CeedScalar       *dinterp1d;
70   CeedScalar       *dgrad1d;
71   CeedScalar       *dqweight1d;
72 } CeedBasis_Magma;
73 
74 typedef struct {
75   CeedMagmaModule   module[MAGMA_NONTENSOR_KERNEL_INSTANCES];
76   CeedMagmaFunction magma_interp_nontensor[MAGMA_NONTENSOR_KERNEL_INSTANCES];
77   CeedMagmaFunction magma_interp_tr_nontensor[MAGMA_NONTENSOR_KERNEL_INSTANCES];
78   CeedMagmaFunction magma_grad_nontensor[MAGMA_NONTENSOR_KERNEL_INSTANCES];
79   CeedMagmaFunction magma_grad_tr_nontensor[MAGMA_NONTENSOR_KERNEL_INSTANCES];
80   CeedScalar       *dqref;
81   CeedScalar       *dinterp;
82   CeedScalar       *dgrad;
83   CeedScalar       *dqweight;
84 } CeedBasisNonTensor_Magma;
85 
86 CEED_INTERN void magma_weight_nontensor(magma_int_t grid, magma_int_t threads, magma_int_t nelem, magma_int_t Q, CeedScalar *dqweight, CeedScalar *dv,
87                                         magma_queue_t queue);
88 
89 CEED_INTERN int magma_gemm_nontensor(magma_trans_t transA, magma_trans_t transB, magma_int_t m, magma_int_t n, magma_int_t k, CeedScalar alpha,
90                                      const CeedScalar *dA, magma_int_t ldda, const CeedScalar *dB, magma_int_t lddb, CeedScalar beta, CeedScalar *dC,
91                                      magma_int_t lddc, magma_queue_t queue);
92 
93 CEED_INTERN void gemm_selector(int gpu_arch, char precision, char transA, int m, int n, int k, int *nbatch, int *use_magma);
94 
95 CEED_INTERN CeedInt nontensor_rtc_get_nb(int gpu_arch, char precision, CeedEvalMode emode, CeedTransposeMode tmode, int P_, int N, int Q_);
96 
97 CEED_INTERN magma_int_t magma_isdevptr(const void *A);
98 
99 CEED_INTERN int CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P1d, CeedInt Q1d, const CeedScalar *interp1d, const CeedScalar *grad1d,
100                                               const CeedScalar *qref1d, const CeedScalar *qweight1d, CeedBasis basis);
101 
102 CEED_INTERN int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt ndof, CeedInt nqpts, const CeedScalar *interp,
103                                         const CeedScalar *grad, const CeedScalar *qref, const CeedScalar *qweight, CeedBasis basis);
104 
105 // Comment the line below to use the default magma_is_devptr function
106 #define magma_is_devptr magma_isdevptr
107 
108 // If magma and cuda/ref are using the null stream, then ceed_magma_queue_sync should do nothing
109 #define ceed_magma_queue_sync(...)
110 
111 #endif  // _ceed_magma_h
112