1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 // magma functions specific to ceed 9 #ifndef _ceed_magma_h 10 #define _ceed_magma_h 11 12 #include <ceed/backend.h> 13 #include <ceed/ceed.h> 14 #include <magma_v2.h> 15 16 #define MAGMA_MAXTHREADS_1D 128 17 #define MAGMA_MAXTHREADS_2D 128 18 #define MAGMA_MAXTHREADS_3D 64 19 // Define macro for determining number of threads in y-direction 20 // for basis kernels 21 #define MAGMA_BASIS_NTCOL(x, maxt) (((maxt) < (x)) ? 1 : ((maxt) / (x))) 22 // Define macro for computing the total threads in a block 23 // for use with __launch_bounds__() 24 #define MAGMA_BASIS_BOUNDS(x, maxt) (x * MAGMA_BASIS_NTCOL(x, maxt)) 25 26 #ifdef CEED_MAGMA_USE_HIP 27 typedef hipModule_t CeedMagmaModule; 28 typedef hipFunction_t CeedMagmaFunction; 29 #define CeedCompileMagma CeedCompileHip 30 #define CeedGetKernelMagma CeedGetKernelHip 31 #define CeedRunKernelMagma CeedRunKernelHip 32 #define CeedRunKernelDimMagma CeedRunKernelDimHip 33 #define CeedRunKernelDimSharedMagma CeedRunKernelDimSharedHip 34 #else 35 typedef CUmodule CeedMagmaModule; 36 typedef CUfunction CeedMagmaFunction; 37 #define CeedCompileMagma CeedCompileCuda 38 #define CeedGetKernelMagma CeedGetKernelCuda 39 #define CeedRunKernelMagma CeedRunKernelCuda 40 #define CeedRunKernelDimMagma CeedRunKernelDimCuda 41 #define CeedRunKernelDimSharedMagma CeedRunKernelDimSharedCuda 42 #endif 43 44 typedef enum { MAGMA_KERNEL_DIM_GENERIC = 101, MAGMA_KERNEL_DIM_SPECIFIC = 102 } magma_kernel_mode_t; 45 46 typedef struct { 47 magma_kernel_mode_t basis_kernel_mode; 48 magma_device_t device; 49 magma_queue_t queue; 50 } Ceed_Magma; 51 52 typedef struct { 53 CeedMagmaModule module; 54 CeedMagmaFunction magma_interp; 55 CeedMagmaFunction magma_interp_tr; 56 CeedMagmaFunction magma_grad; 57 CeedMagmaFunction magma_grad_tr; 58 CeedMagmaFunction magma_weight; 59 CeedScalar *dqref1d; 60 CeedScalar *dinterp1d; 61 CeedScalar *dgrad1d; 62 CeedScalar *dqweight1d; 63 } CeedBasis_Magma; 64 65 typedef struct { 66 CeedScalar *dqref; 67 CeedScalar *dinterp; 68 CeedScalar *dgrad; 69 CeedScalar *dqweight; 70 } CeedBasisNonTensor_Magma; 71 72 typedef enum { 73 OWNED_NONE = 0, 74 OWNED_UNPINNED, 75 OWNED_PINNED, 76 } OwnershipMode; 77 78 typedef struct { 79 CeedMagmaModule module; 80 CeedMagmaFunction StridedTranspose; 81 CeedMagmaFunction StridedNoTranspose; 82 CeedMagmaFunction OffsetTranspose; 83 CeedMagmaFunction OffsetNoTranspose; 84 CeedInt *offsets; 85 CeedInt *doffsets; 86 OwnershipMode own_; 87 int down_; // cover a case where we own Device memory 88 } CeedElemRestriction_Magma; 89 90 typedef struct { 91 const CeedScalar **inputs; 92 CeedScalar **outputs; 93 bool setupdone; 94 } CeedQFunction_Magma; 95 96 #define USE_MAGMA_BATCH 97 #define USE_MAGMA_BATCH2 98 #define USE_MAGMA_BATCH3 99 #define USE_MAGMA_BATCH4 100 101 CEED_INTERN void magma_weight_nontensor(magma_int_t grid, magma_int_t threads, magma_int_t nelem, magma_int_t Q, CeedScalar *dqweight, CeedScalar *dv, 102 magma_queue_t queue); 103 104 CEED_INTERN int magma_dgemm_nontensor(magma_trans_t transA, magma_trans_t transB, magma_int_t m, magma_int_t n, magma_int_t k, double alpha, 105 const double *dA, magma_int_t ldda, const double *dB, magma_int_t lddb, double beta, double *dC, 106 magma_int_t lddc, magma_queue_t queue); 107 108 CEED_INTERN int magma_sgemm_nontensor(magma_trans_t transA, magma_trans_t transB, magma_int_t m, magma_int_t n, magma_int_t k, float alpha, 109 const float *dA, magma_int_t ldda, const float *dB, magma_int_t lddb, float beta, float *dC, magma_int_t lddc, 110 magma_queue_t queue); 111 112 CEED_INTERN void gemm_selector(int gpu_arch, char precision, char transA, int m, int n, int k, int *nbatch, int *use_magma); 113 114 CEED_INTERN magma_int_t magma_isdevptr(const void *A); 115 116 CEED_INTERN int CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P1d, CeedInt Q1d, const CeedScalar *interp1d, const CeedScalar *grad1d, 117 const CeedScalar *qref1d, const CeedScalar *qweight1d, CeedBasis basis); 118 119 CEED_INTERN int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt ndof, CeedInt nqpts, const CeedScalar *interp, 120 const CeedScalar *grad, const CeedScalar *qref, const CeedScalar *qweight, CeedBasis basis); 121 122 CEED_INTERN int CeedElemRestrictionCreate_Magma(CeedMemType mtype, CeedCopyMode cmode, const CeedInt *offsets, CeedElemRestriction r); 123 124 CEED_INTERN int CeedElemRestrictionCreateBlocked_Magma(const CeedMemType mtype, const CeedCopyMode cmode, const CeedInt *offsets, 125 const CeedElemRestriction res); 126 127 CEED_INTERN int CeedOperatorCreate_Magma(CeedOperator op); 128 129 // comment the line below to use the default magma_is_devptr function 130 #define magma_is_devptr magma_isdevptr 131 132 // if magma and cuda/ref are using the null stream, then ceed_magma_queue_sync 133 // should do nothing 134 #define ceed_magma_queue_sync(...) 135 136 // batch stride, override using -DMAGMA_BATCH_STRIDE=<desired-value> 137 #ifndef MAGMA_BATCH_STRIDE 138 #define MAGMA_BATCH_STRIDE (1000) 139 #endif 140 141 #endif // _ceed_magma_h 142