1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 // magma functions specific to ceed 9 #ifndef _ceed_magma_h 10 #define _ceed_magma_h 11 12 #include <ceed/backend.h> 13 #include <ceed/ceed.h> 14 #include <magma_v2.h> 15 16 #define MAGMA_MAXTHREADS_1D 128 17 #define MAGMA_MAXTHREADS_2D 128 18 #define MAGMA_MAXTHREADS_3D 64 19 #define MAGMA_NONTENSOR_MAXTHREADS (128) 20 21 // Define macro for determining number of threads in y-direction 22 // for basis kernels 23 #define MAGMA_BASIS_NTCOL(x, maxt) (((maxt) < (x)) ? 1 : ((maxt) / (x))) 24 #define MAGMA_NONTENSOR_BASIS_NTCOL(N) (CeedIntMax(1, (MAGMA_NONTENSOR_MAXTHREADS / (N)))) 25 #define MAGMA_CEILDIV(A, B) (((A) + (B)-1) / (B)) 26 27 #define MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P (40) 28 #define MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q (40) 29 30 // Define macro for computing the total threads in a block 31 // for use with __launch_bounds__() 32 #define MAGMA_BASIS_BOUNDS(x, maxt) (x * MAGMA_BASIS_NTCOL(x, maxt)) 33 34 // Define macro for non-tensor kernel instances 35 #define MAGMA_NONTENSOR_KERNEL_INSTANCES (5) 36 #define MAGMA_NONTENSOR_N_VALUES 10240, 51200, 102400, 512000, 1024000 37 38 #ifdef CEED_MAGMA_USE_HIP 39 typedef hipModule_t CeedMagmaModule; 40 typedef hipFunction_t CeedMagmaFunction; 41 #define CeedCompileMagma CeedCompileHip 42 #define CeedGetKernelMagma CeedGetKernelHip 43 #define CeedRunKernelMagma CeedRunKernelHip 44 #define CeedRunKernelDimMagma CeedRunKernelDimHip 45 #define CeedRunKernelDimSharedMagma CeedRunKernelDimSharedHip 46 #else 47 typedef CUmodule CeedMagmaModule; 48 typedef CUfunction CeedMagmaFunction; 49 #define CeedCompileMagma CeedCompileCuda 50 #define CeedGetKernelMagma CeedGetKernelCuda 51 #define CeedRunKernelMagma CeedRunKernelCuda 52 #define CeedRunKernelDimMagma CeedRunKernelDimCuda 53 #define CeedRunKernelDimSharedMagma CeedRunKernelDimSharedCuda 54 #endif 55 56 typedef enum { MAGMA_KERNEL_DIM_GENERIC = 101, MAGMA_KERNEL_DIM_SPECIFIC = 102 } magma_kernel_mode_t; 57 58 typedef struct { 59 magma_kernel_mode_t basis_kernel_mode; 60 magma_device_t device; 61 magma_queue_t queue; 62 } Ceed_Magma; 63 64 typedef struct { 65 CeedMagmaModule module; 66 CeedMagmaFunction magma_interp; 67 CeedMagmaFunction magma_interp_tr; 68 CeedMagmaFunction magma_grad; 69 CeedMagmaFunction magma_grad_tr; 70 CeedMagmaFunction magma_weight; 71 CeedScalar *dqref1d; 72 CeedScalar *dinterp1d; 73 CeedScalar *dgrad1d; 74 CeedScalar *dqweight1d; 75 } CeedBasis_Magma; 76 77 typedef struct { 78 CeedMagmaModule module[MAGMA_NONTENSOR_KERNEL_INSTANCES]; 79 CeedMagmaFunction magma_interp_nontensor[MAGMA_NONTENSOR_KERNEL_INSTANCES]; 80 CeedMagmaFunction magma_interp_tr_nontensor[MAGMA_NONTENSOR_KERNEL_INSTANCES]; 81 CeedMagmaFunction magma_grad_nontensor[MAGMA_NONTENSOR_KERNEL_INSTANCES]; 82 CeedMagmaFunction magma_grad_tr_nontensor[MAGMA_NONTENSOR_KERNEL_INSTANCES]; 83 CeedScalar *dqref; 84 CeedScalar *dinterp; 85 CeedScalar *dgrad; 86 CeedScalar *dqweight; 87 } CeedBasisNonTensor_Magma; 88 89 typedef enum { 90 OWNED_NONE = 0, 91 OWNED_UNPINNED, 92 OWNED_PINNED, 93 } OwnershipMode; 94 95 typedef struct { 96 CeedMagmaModule module; 97 CeedMagmaFunction StridedTranspose; 98 CeedMagmaFunction StridedNoTranspose; 99 CeedMagmaFunction OffsetTranspose; 100 CeedMagmaFunction OffsetNoTranspose; 101 CeedInt *offsets; 102 CeedInt *doffsets; 103 OwnershipMode own_; 104 int down_; // cover a case where we own Device memory 105 } CeedElemRestriction_Magma; 106 107 typedef struct { 108 const CeedScalar **inputs; 109 CeedScalar **outputs; 110 bool setupdone; 111 } CeedQFunction_Magma; 112 113 #define USE_MAGMA_BATCH 114 #define USE_MAGMA_BATCH2 115 #define USE_MAGMA_BATCH3 116 #define USE_MAGMA_BATCH4 117 118 CEED_INTERN void magma_weight_nontensor(magma_int_t grid, magma_int_t threads, magma_int_t nelem, magma_int_t Q, CeedScalar *dqweight, CeedScalar *dv, 119 magma_queue_t queue); 120 121 CEED_INTERN int magma_gemm_nontensor(magma_trans_t transA, magma_trans_t transB, magma_int_t m, magma_int_t n, magma_int_t k, CeedScalar alpha, 122 const CeedScalar *dA, magma_int_t ldda, const CeedScalar *dB, magma_int_t lddb, CeedScalar beta, CeedScalar *dC, 123 magma_int_t lddc, magma_queue_t queue); 124 125 CEED_INTERN void gemm_selector(int gpu_arch, char precision, char transA, int m, int n, int k, int *nbatch, int *use_magma); 126 127 CEED_INTERN CeedInt nontensor_rtc_get_nb(int gpu_arch, char precision, CeedEvalMode emode, CeedTransposeMode tmode, int P_, int N, int Q_); 128 129 CEED_INTERN magma_int_t magma_isdevptr(const void *A); 130 131 CEED_INTERN int CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P1d, CeedInt Q1d, const CeedScalar *interp1d, const CeedScalar *grad1d, 132 const CeedScalar *qref1d, const CeedScalar *qweight1d, CeedBasis basis); 133 134 CEED_INTERN int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt ndof, CeedInt nqpts, const CeedScalar *interp, 135 const CeedScalar *grad, const CeedScalar *qref, const CeedScalar *qweight, CeedBasis basis); 136 137 CEED_INTERN int CeedElemRestrictionCreate_Magma(CeedMemType mtype, CeedCopyMode cmode, const CeedInt *offsets, CeedElemRestriction r); 138 139 CEED_INTERN int CeedElemRestrictionCreateBlocked_Magma(const CeedMemType mtype, const CeedCopyMode cmode, const CeedInt *offsets, 140 const CeedElemRestriction res); 141 142 CEED_INTERN int CeedOperatorCreate_Magma(CeedOperator op); 143 144 // comment the line below to use the default magma_is_devptr function 145 #define magma_is_devptr magma_isdevptr 146 147 // if magma and cuda/ref are using the null stream, then ceed_magma_queue_sync 148 // should do nothing 149 #define ceed_magma_queue_sync(...) 150 151 // batch stride, override using -DMAGMA_BATCH_STRIDE=<desired-value> 152 #ifndef MAGMA_BATCH_STRIDE 153 #define MAGMA_BATCH_STRIDE (1000) 154 #endif 155 156 #endif // _ceed_magma_h 157