xref: /petsc/src/mat/impls/sell/seq/seqhip/sellhip.hip.cxx (revision daba9d70159ea2f6905738fcbec7404635487b2b)
1*d52a580bSJunchao Zhang #include "hip/hip_runtime.h"
2*d52a580bSJunchao Zhang #include <hip/hip_runtime.h>
3*d52a580bSJunchao Zhang 
4*d52a580bSJunchao Zhang #include <petscdevice_hip.h>
5*d52a580bSJunchao Zhang #include <petsc/private/cupmatomics.hpp>
6*d52a580bSJunchao Zhang #include <../src/mat/impls/sell/seq/sell.h> /*I   "petscmat.h"  I*/
7*d52a580bSJunchao Zhang 
8*d52a580bSJunchao Zhang #define WARP_SIZE 64
9*d52a580bSJunchao Zhang 
/* Device-side mirror of a Mat_SeqSELL matrix (HIP backend). All pointer members
   are device pointers; the scalar members cache host-side sizes so the device
   arrays can be reused when the nonzero pattern has not changed. */
typedef struct {
  PetscInt   maxallocmat;  /* capacity (in entries) of the colidx/val allocations */
  PetscInt   totalentries; /* number of stored entries, i.e. sliidx[totalslices] */
  PetscInt  *colidx;       /* column index array, device pointer */
  MatScalar *val;          /* value array, device pointer */
  PetscInt   totalslices;  /* number of slices */
  PetscInt  *sliidx;       /* slice index array (totalslices+1 offsets), device pointer */
  PetscInt   nonzerostate; /* compared against A->nonzerostate to detect pattern changes */
  PetscInt   kernelchoice; /* selects which SpMV kernel to launch -- consumed by dispatch outside this view */
  PetscInt   blocky;       /* BLOCKY value for the tiled kernels -- presumably set by options; verify at dispatch */
  PetscInt   chunksperblock; /* chunks assigned to each thread block in the load-balancing kernels */
  PetscInt   totalchunks;    /* total number of equal-sized chunks the entries are split into */
  PetscInt  *chunk_slice_map; /* starting slice for each chunk, device pointer */
} Mat_SeqSELLHIP;
24*d52a580bSJunchao Zhang 
MatSeqSELLHIP_Destroy(Mat_SeqSELLHIP ** hipstruct)25*d52a580bSJunchao Zhang static PetscErrorCode MatSeqSELLHIP_Destroy(Mat_SeqSELLHIP **hipstruct)
26*d52a580bSJunchao Zhang {
27*d52a580bSJunchao Zhang   PetscFunctionBegin;
28*d52a580bSJunchao Zhang   if (*hipstruct) {
29*d52a580bSJunchao Zhang     if ((*hipstruct)->colidx) PetscCallHIP(hipFree((*hipstruct)->colidx));
30*d52a580bSJunchao Zhang     if ((*hipstruct)->val) PetscCallHIP(hipFree((*hipstruct)->val));
31*d52a580bSJunchao Zhang     if ((*hipstruct)->sliidx) PetscCallHIP(hipFree((*hipstruct)->sliidx));
32*d52a580bSJunchao Zhang     if ((*hipstruct)->chunk_slice_map) PetscCallHIP(hipFree((*hipstruct)->chunk_slice_map));
33*d52a580bSJunchao Zhang     PetscCall(PetscFree(*hipstruct));
34*d52a580bSJunchao Zhang   }
35*d52a580bSJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
36*d52a580bSJunchao Zhang }
37*d52a580bSJunchao Zhang 
/* Ensure the device copy of the SELL matrix is up to date with the host copy.
   If the matrix is assembled and its nonzero pattern is unchanged (nonzerostate
   matches), only the values are transferred; otherwise the device arrays are
   (re)allocated and column indices, values, slice offsets and the chunk map are
   all copied over. On success the offload mask is set to PETSC_OFFLOAD_BOTH. */
static PetscErrorCode MatSeqSELLHIPCopyToGPU(Mat A)
{
  Mat_SeqSELLHIP *hipstruct = (Mat_SeqSELLHIP *)A->spptr;
  Mat_SeqSELL    *a         = (Mat_SeqSELL *)A->data;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    PetscCall(PetscLogEventBegin(MAT_HIPCopyToGPU, A, 0, 0, 0));
    if (A->assembled && A->nonzerostate == hipstruct->nonzerostate) {
      /* copy values only */
      PetscCallHIP(hipMemcpy(hipstruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), hipMemcpyHostToDevice));
      PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar))));
    } else {
      /* nonzero pattern changed (or first copy): release old device arrays before reallocating */
      if (hipstruct->colidx) PetscCallHIP(hipFree(hipstruct->colidx));
      if (hipstruct->val) PetscCallHIP(hipFree(hipstruct->val));
      if (hipstruct->sliidx) PetscCallHIP(hipFree(hipstruct->sliidx));
      if (hipstruct->chunk_slice_map) PetscCallHIP(hipFree(hipstruct->chunk_slice_map));
      /* mirror the host-side sizes so later calls can take the values-only path */
      hipstruct->maxallocmat  = a->maxallocmat;
      hipstruct->totalentries = a->sliidx[a->totalslices];
      hipstruct->totalslices  = a->totalslices;
      hipstruct->totalchunks  = a->totalchunks;
      /* NOTE(review): hipstruct->nonzerostate is not updated here; presumably the caller
         synchronizes it with A->nonzerostate -- verify */
      PetscCallHIP(hipMalloc((void **)&hipstruct->colidx, a->maxallocmat * sizeof(*hipstruct->colidx)));
      PetscCallHIP(hipMalloc((void **)&hipstruct->val, a->maxallocmat * sizeof(*hipstruct->val)));
      /* copy values, nz or maxallocmat? */
      PetscCallHIP(hipMemcpy(hipstruct->colidx, a->colidx, a->sliidx[a->totalslices] * sizeof(*a->colidx), hipMemcpyHostToDevice));
      PetscCallHIP(hipMemcpy(hipstruct->val, a->val, a->sliidx[a->totalslices] * sizeof(*a->val), hipMemcpyHostToDevice));

      PetscCallHIP(hipMalloc((void **)&hipstruct->sliidx, (a->totalslices + 1) * sizeof(*hipstruct->sliidx)));
      PetscCallHIP(hipMemcpy(hipstruct->sliidx, a->sliidx, (a->totalslices + 1) * sizeof(*a->sliidx), hipMemcpyHostToDevice));
      PetscCallHIP(hipMalloc((void **)&hipstruct->chunk_slice_map, a->totalchunks * sizeof(*hipstruct->chunk_slice_map)));
      PetscCallHIP(hipMemcpy(hipstruct->chunk_slice_map, a->chunk_slice_map, a->totalchunks * sizeof(*a->chunk_slice_map), hipMemcpyHostToDevice));
      PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar) + sizeof(PetscInt)) + (a->totalslices + 1 + a->totalchunks) * sizeof(PetscInt)));
    }
    PetscCallHIP(WaitForHIP());
    PetscCall(PetscLogEventEnd(MAT_HIPCopyToGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH; /* host and device now agree */
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
77*d52a580bSJunchao Zhang 
matmult_seqsell_basic_kernel(PetscInt nrows,PetscInt sliceheight,const PetscInt * acolidx,const MatScalar * aval,const PetscInt * sliidx,const PetscScalar * x,PetscScalar * y)78*d52a580bSJunchao Zhang static __global__ void matmult_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
79*d52a580bSJunchao Zhang {
80*d52a580bSJunchao Zhang   PetscInt  i, row, slice_id, row_in_slice;
81*d52a580bSJunchao Zhang   MatScalar sum;
82*d52a580bSJunchao Zhang   /* one thread per row. */
83*d52a580bSJunchao Zhang   row = blockIdx.x * blockDim.x + threadIdx.x;
84*d52a580bSJunchao Zhang   if (row < nrows) {
85*d52a580bSJunchao Zhang     slice_id     = row / sliceheight;
86*d52a580bSJunchao Zhang     row_in_slice = row % sliceheight;
87*d52a580bSJunchao Zhang     sum          = 0.0;
88*d52a580bSJunchao Zhang     for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]];
89*d52a580bSJunchao Zhang     y[row] = sum;
90*d52a580bSJunchao Zhang   }
91*d52a580bSJunchao Zhang }
92*d52a580bSJunchao Zhang 
matmultadd_seqsell_basic_kernel(PetscInt nrows,PetscInt sliceheight,const PetscInt * acolidx,const MatScalar * aval,const PetscInt * sliidx,const PetscScalar * x,const PetscScalar * y,PetscScalar * z)93*d52a580bSJunchao Zhang static __global__ void matmultadd_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
94*d52a580bSJunchao Zhang {
95*d52a580bSJunchao Zhang   PetscInt  i, row, slice_id, row_in_slice;
96*d52a580bSJunchao Zhang   MatScalar sum;
97*d52a580bSJunchao Zhang   /* one thread per row. */
98*d52a580bSJunchao Zhang   row = blockIdx.x * blockDim.x + threadIdx.x;
99*d52a580bSJunchao Zhang   if (row < nrows) {
100*d52a580bSJunchao Zhang     slice_id     = row / sliceheight;
101*d52a580bSJunchao Zhang     row_in_slice = row % sliceheight;
102*d52a580bSJunchao Zhang     sum          = 0.0;
103*d52a580bSJunchao Zhang     for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]];
104*d52a580bSJunchao Zhang     z[row] = y[row] + sum;
105*d52a580bSJunchao Zhang   }
106*d52a580bSJunchao Zhang }
107*d52a580bSJunchao Zhang 
108*d52a580bSJunchao Zhang #if !defined(PETSC_USE_COMPLEX)
109*d52a580bSJunchao Zhang PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wpass-failed")
110*d52a580bSJunchao Zhang /* use 1 block per slice, suitable for large slice width */
template <int BLOCKY>
__global__ void matmult_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  /* y = A*x: one thread block per slice; BLOCKY warps of WARP_SIZE threads
     cooperate on the slice's entries and combine partials via shuffles and
     a shared-memory transpose. */
  __shared__ MatScalar shared[WARP_SIZE][BLOCKY]; /* staging area for the cross-warp reduction */
  PetscInt             i, row, slice_id = blockIdx.x;
  int                  tid = threadIdx.x + threadIdx.y * WARP_SIZE; /* linearized thread id in the block */
  /* transposed index */
  int         tidx = tid % BLOCKY;
  int         tidy = tid / BLOCKY;
  PetscScalar t    = 0.0;

  row = slice_id * sliceheight + threadIdx.x % sliceheight;
  if (row < nrows) {
    /* each thread accumulates a WARP_SIZE*BLOCKY-strided portion of the slice */
    for (i = sliidx[slice_id] + threadIdx.x + WARP_SIZE * threadIdx.y; i < sliidx[slice_id + 1]; i += WARP_SIZE * BLOCKY) t += aval[i] * x[acolidx[i]];
  }
  /* intra-warp shuffle reduction leaves sliceheight partial sums per warp */
  #pragma unroll
  for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) t += __shfl_down(t, offset);
  /* transpose layout to reduce each row using warp shfl */
  if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t;
  __syncthreads();
  if (tidy < sliceheight) t = shared[tidy][tidx];
  /* reduce the BLOCKY per-warp partials of each row, now contiguous along tidx */
  #pragma unroll
  for (int offset = BLOCKY / 2; offset > 0; offset /= 2) t += __shfl_down(t, offset, BLOCKY);
  if (tidx == 0 && tidy < sliceheight) shared[0][tidy] = t;
  __syncthreads();
  /* the first warp writes out the final row sums */
  if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) y[row] = shared[0][threadIdx.x];
}
138*d52a580bSJunchao Zhang 
139*d52a580bSJunchao Zhang /* use 1 block per slice, suitable for large slice width */
template <int BLOCKY>
__global__ void matmultadd_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  /* z = y + A*x: same scheme as matmult_seqsell_tiled_kernel9 -- one block per
     slice, BLOCKY warps, shuffle + shared-memory-transpose reduction -- with the
     final result added to y. */
  __shared__ MatScalar shared[WARP_SIZE][BLOCKY]; /* staging area for the cross-warp reduction */
  PetscInt             i, row, slice_id = blockIdx.x;
  int                  tid = threadIdx.x + threadIdx.y * WARP_SIZE; /* linearized thread id in the block */
  /* transposed index */
  int         tidx = tid % BLOCKY;
  int         tidy = tid / BLOCKY;
  PetscScalar t    = 0.0;

  row = slice_id * sliceheight + threadIdx.x % sliceheight;
  if (row < nrows) {
    /* each thread accumulates a WARP_SIZE*BLOCKY-strided portion of the slice */
    for (i = sliidx[slice_id] + threadIdx.x + WARP_SIZE * threadIdx.y; i < sliidx[slice_id + 1]; i += WARP_SIZE * BLOCKY) t += aval[i] * x[acolidx[i]];
  }
  /* intra-warp shuffle reduction leaves sliceheight partial sums per warp */
  #pragma unroll
  for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) t += __shfl_down(t, offset);
  /* transpose layout to reduce each row using warp shfl */
  if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t;
  __syncthreads();
  if (tidy < sliceheight) t = shared[tidy][tidx];
  /* reduce the BLOCKY per-warp partials of each row, now contiguous along tidx */
  #pragma unroll
  for (int offset = BLOCKY / 2; offset > 0; offset /= 2) t += __shfl_down(t, offset, BLOCKY);
  if (tidx == 0 && tidy < sliceheight) shared[0][tidy] = t;
  __syncthreads();
  /* the first warp adds the final row sums onto y */
  if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) z[row] = y[row] + shared[0][threadIdx.x];
}
167*d52a580bSJunchao Zhang 
/* Segmented sum across the BLOCKY*2 half-warps of a block: flag[h] holds the
   segment (slice) id of half-warp h, and partial sums carried in *val are
   combined only between half-warps with equal flags. Returns true when this
   thread's half-warp is the head (first half-warp) of its segment, i.e. the
   half-warp whose accumulated value the caller must write out. */
template <int BLOCKY>
__device__ __forceinline__ static bool segment_scan(PetscInt flag[], MatScalar shared[], PetscScalar *val)
{
  bool head = true;
  #pragma unroll
  for (int i = 1; i < BLOCKY * 2; i <<= 1) {
    int halfwarpid                                = threadIdx.y * 2 + threadIdx.x / (WARP_SIZE / 2);
    shared[threadIdx.x + threadIdx.y * WARP_SIZE] = 0;
    /* publish our value only if the half-warp i positions below is in the same segment */
    if (halfwarpid >= i && flag[halfwarpid - i] == flag[halfwarpid]) {
      shared[threadIdx.x + threadIdx.y * WARP_SIZE] = *val;
      if (i == 1) head = false; /* immediate predecessor shares our flag -> not a segment head */
    }
    __syncthreads();
    /* pull in the published value from i half-warp-pairs above (doubling stride each round) */
    if (halfwarpid < BLOCKY * 2 - i) *val += shared[threadIdx.x + threadIdx.y * WARP_SIZE + i * WARP_SIZE];
    __syncthreads();
  }
  return head;
}
186*d52a580bSJunchao Zhang 
187*d52a580bSJunchao Zhang /* load-balancing version. Chunksize is equal to the number of threads per block */
template <int BLOCKY>
__global__ void matmult_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunksperblock, PetscInt totalchunks, const PetscInt *chunk_slice_map, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  /* y = A*x with load balancing: the entries are partitioned into chunks of
     BLOCKY*WARP_SIZE (one chunk per block-iteration), so every block does equal
     work regardless of how nonzeros are distributed across slices. Rows split
     across chunks are combined with atomic adds into y, which is zeroed first. */
  __shared__ MatScalar shared[BLOCKY * WARP_SIZE];
  PetscInt             gid, row, start_slice, cid;
  PetscScalar          t = 0.0;
  AtomicAdd<MatScalar> atomAdd;
  /* zero out y */
  for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * WARP_SIZE * BLOCKY); iter++) {
    gid = gridDim.x * WARP_SIZE * BLOCKY * iter + blockIdx.x * BLOCKY * WARP_SIZE + threadIdx.y * WARP_SIZE + threadIdx.x;
    if (gid < nrows) y[gid] = 0.0;
  }
  for (int iter = 0; iter < chunksperblock; iter++) {
    cid = blockIdx.x * chunksperblock + iter; /* chunk id */
    if (cid < totalchunks) {
      start_slice = chunk_slice_map[cid]; /* starting slice at each iteration */
      gid         = cid * BLOCKY * WARP_SIZE + threadIdx.y * WARP_SIZE + threadIdx.x; /* global entry index handled by this thread */
      if ((cid + 1) * BLOCKY * WARP_SIZE > sliidx[start_slice + 1]) { /* this iteration covers more than one slice */
        __shared__ PetscInt flag[BLOCKY * 2]; /* per-half-warp slice id fed to segment_scan */
        bool                write;
        PetscInt            slice_id = start_slice, totalslices = PetscCeilIntMacro(nrows, sliceheight), totalentries = sliidx[totalslices];
        /* find out the slice that this element belongs to */
        while (gid < totalentries && gid >= sliidx[slice_id + 1]) slice_id++;
        if (threadIdx.x % (WARP_SIZE / 2) == 0) flag[threadIdx.y * 2 + threadIdx.x / (WARP_SIZE / 2)] = slice_id;
        row = slice_id * sliceheight + threadIdx.x % sliceheight;
        if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]];
        __syncthreads();
        /* combine partials belonging to the same slice; only segment heads write out */
        write = segment_scan<BLOCKY>(flag, shared, &t);
        if (row < nrows && gid < totalentries && write) atomAdd(y[row], t);
        t = 0.0;
      } else { /* this iteration covers only one slice */
        row = start_slice * sliceheight + threadIdx.x % sliceheight;
        if (row < nrows) t += aval[gid] * x[acolidx[gid]];
        if (iter == chunksperblock - 1 || (cid + 2) * BLOCKY * WARP_SIZE > sliidx[start_slice + 1]) { /* last iteration or next iteration covers more than one slice */
          int tid = threadIdx.x + threadIdx.y * WARP_SIZE, tidx = tid % BLOCKY, tidy = tid / BLOCKY;
  /* reduction and write to output vector */
  #pragma unroll
          for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) t += __shfl_down(t, offset);
          /* transpose layout to reduce each row using warp shfl */
          if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx.x][threadIdx.y] = t */
          __syncthreads();
          if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */
  #pragma unroll
          for (int offset = BLOCKY / 2; offset > 0; offset /= 2) t += __shfl_down(t, offset, BLOCKY);
          if (tidx == 0 && tidy < sliceheight) shared[tidy] = t; /* shared[0][tidy] = t */
          __syncthreads();
          if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomAdd(y[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */
          t = 0.0;
        }
      }
    }
  }
}
241*d52a580bSJunchao Zhang 
242*d52a580bSJunchao Zhang /* load-balancing version. Chunksize is equal to the number of threads per block */
template <int BLOCKY>
__global__ void matmultadd_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunksperblock, PetscInt totalchunks, const PetscInt *chunk_slice_map, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  /* z = y + A*x with load balancing: identical chunking scheme to
     matmult_seqsell_tiled_kernel8 (chunks of BLOCKY*WARP_SIZE entries), except
     z is first seeded with y and the row sums are atomically added into z. */
  __shared__ MatScalar shared[BLOCKY * WARP_SIZE];
  PetscInt             gid, row, start_slice, cid;
  PetscScalar          t = 0.0;
  AtomicAdd<MatScalar> atomAdd;
  /* copy y to z */
  for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * WARP_SIZE * BLOCKY); iter++) {
    gid = gridDim.x * WARP_SIZE * BLOCKY * iter + blockIdx.x * BLOCKY * WARP_SIZE + threadIdx.y * WARP_SIZE + threadIdx.x;
    if (gid < nrows) z[gid] = y[gid];
  }
  for (int iter = 0; iter < chunksperblock; iter++) {
    cid = blockIdx.x * chunksperblock + iter; /* chunk id */
    if (cid < totalchunks) {
      start_slice = chunk_slice_map[cid]; /* starting slice at each iteration */
      gid         = cid * BLOCKY * WARP_SIZE + threadIdx.y * WARP_SIZE + threadIdx.x; /* global entry index handled by this thread */
      if ((cid + 1) * BLOCKY * WARP_SIZE > sliidx[start_slice + 1]) { /* this iteration covers more than one slice */
        __shared__ PetscInt flag[BLOCKY * 2]; /* per-half-warp slice id fed to segment_scan */
        bool                write;
        PetscInt            slice_id = start_slice, totalslices = PetscCeilIntMacro(nrows, sliceheight), totalentries = sliidx[totalslices];
        /* find out the slice that this element belongs to */
        while (gid < totalentries && gid >= sliidx[slice_id + 1]) slice_id++;
        if (threadIdx.x % (WARP_SIZE / 2) == 0) flag[threadIdx.y * 2 + threadIdx.x / (WARP_SIZE / 2)] = slice_id;
        row = slice_id * sliceheight + threadIdx.x % sliceheight;
        if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]];
        __syncthreads();
        /* combine partials belonging to the same slice; only segment heads write out */
        write = segment_scan<BLOCKY>(flag, shared, &t);
        if (row < nrows && gid < totalentries && write) atomAdd(z[row], t);
        t = 0.0;
      } else { /* this iteration covers only one slice */
        row = start_slice * sliceheight + threadIdx.x % sliceheight;
        if (row < nrows) t += aval[gid] * x[acolidx[gid]];
        if (iter == chunksperblock - 1 || (cid + 2) * BLOCKY * WARP_SIZE > sliidx[start_slice + 1]) { /* last iteration or next iteration covers more than one slice */
          int tid = threadIdx.x + threadIdx.y * WARP_SIZE, tidx = tid % BLOCKY, tidy = tid / BLOCKY;
  /* reduction and write to output vector */
  #pragma unroll
          for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) t += __shfl_down(t, offset);
          /* transpose layout to reduce each row using warp shfl */
          if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx.x][threadIdx.y] = t */
          __syncthreads();
          if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */
  #pragma unroll
          for (int offset = BLOCKY / 2; offset > 0; offset /= 2) t += __shfl_down(t, offset, BLOCKY);
          if (tidx == 0 && tidy < sliceheight) shared[tidy] = t; /* shared[0][tidy] = t */
          __syncthreads();
          if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomAdd(z[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */
          t = 0.0;
        }
      }
    }
  }
}
296*d52a580bSJunchao Zhang 
297*d52a580bSJunchao Zhang /* use 1 warp per slice, suitable for small slice width */
matmult_seqsell_tiled_kernel7(PetscInt nrows,PetscInt sliceheight,const PetscInt * acolidx,const MatScalar * aval,const PetscInt * sliidx,const PetscScalar * x,PetscScalar * y)298*d52a580bSJunchao Zhang static __global__ void matmult_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
299*d52a580bSJunchao Zhang {
300*d52a580bSJunchao Zhang   PetscInt i, row, slice_id;
301*d52a580bSJunchao Zhang   slice_id = blockIdx.x * blockDim.y + threadIdx.y;
302*d52a580bSJunchao Zhang   row      = slice_id * sliceheight + threadIdx.x % sliceheight;
303*d52a580bSJunchao Zhang   double t = 0.0;
304*d52a580bSJunchao Zhang   if (row < nrows) {
305*d52a580bSJunchao Zhang     for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += WARP_SIZE) t += aval[i] * x[acolidx[i]];
306*d52a580bSJunchao Zhang   }
307*d52a580bSJunchao Zhang   #pragma unroll
308*d52a580bSJunchao Zhang   for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) t += __shfl_down(t, offset);
309*d52a580bSJunchao Zhang   if (row < nrows && threadIdx.x < sliceheight) y[row] = t;
310*d52a580bSJunchao Zhang }
311*d52a580bSJunchao Zhang 
312*d52a580bSJunchao Zhang /* use 1 warp per slice, suitable for small slice width */
matmultadd_seqsell_tiled_kernel7(PetscInt nrows,PetscInt sliceheight,const PetscInt * acolidx,const MatScalar * aval,const PetscInt * sliidx,const PetscScalar * x,const PetscScalar * y,PetscScalar * z)313*d52a580bSJunchao Zhang static __global__ void matmultadd_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
314*d52a580bSJunchao Zhang {
315*d52a580bSJunchao Zhang   PetscInt i, row, slice_id;
316*d52a580bSJunchao Zhang   slice_id = blockIdx.x * blockDim.y + threadIdx.y;
317*d52a580bSJunchao Zhang   row      = slice_id * sliceheight + threadIdx.x % sliceheight;
318*d52a580bSJunchao Zhang   double t = 0.0;
319*d52a580bSJunchao Zhang   if (row < nrows) {
320*d52a580bSJunchao Zhang     for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += WARP_SIZE) t += aval[i] * x[acolidx[i]];
321*d52a580bSJunchao Zhang   }
322*d52a580bSJunchao Zhang   #pragma unroll
323*d52a580bSJunchao Zhang   for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) t += __shfl_down(t, offset);
324*d52a580bSJunchao Zhang   if (row < nrows && threadIdx.x < sliceheight) z[row] = y[row] + t;
325*d52a580bSJunchao Zhang }
PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()326*d52a580bSJunchao Zhang PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()
327*d52a580bSJunchao Zhang #endif
328*d52a580bSJunchao Zhang 
/***********  Kernels 2-6 require a slice height smaller than 512, 256, 128, 64, 32, respectively. They are kept only for performance comparison  **********/
330*d52a580bSJunchao Zhang 
/* y = A*x with a 2-D block: threadIdx.x selects the row and the blockDim.y
   threads sharing that row are combined by a shared-memory tree reduction.
   NOTE(review): the folding starts at threadIdx.y+16, so this assumes
   blockDim.y == 32 and blockDim.x*blockDim.y <= 512 -- the launch
   configuration is outside this view; confirm at the dispatch site.
   NOTE(review): __syncthreads() sits inside the row < nrows branch; threads
   with row >= nrows skip the barrier, which is unsafe if a block straddles
   nrows -- verify the launch guarantees full blocks. */
static __global__ void matmult_seqsell_tiled_kernel6(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[32 * 16];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / sliceheight;
    row_in_slice = row % sliceheight;

    /* each threadIdx.y accumulates a strided share of the row's entries */
    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    /* tree reduction over threadIdx.y: halve the active rows each step */
    if (threadIdx.y < 16) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 8) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}
358*d52a580bSJunchao Zhang 
/* y = A*x with a 2-D block: same scheme as matmult_seqsell_tiled_kernel6 but
   with one fewer folding step.
   NOTE(review): the folding starts at threadIdx.y+8, so this assumes
   blockDim.y == 16 -- launch configuration is outside this view; confirm at
   the dispatch site. The __syncthreads()-inside-divergent-branch caveat of
   kernel6 applies here as well. */
static __global__ void matmult_seqsell_tiled_kernel5(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[32 * 16];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / sliceheight;
    row_in_slice = row % sliceheight;

    /* each threadIdx.y accumulates a strided share of the row's entries */
    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    /* tree reduction over threadIdx.y: halve the active rows each step */
    if (threadIdx.y < 8) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}
384*d52a580bSJunchao Zhang 
/* Compute y = A*x for a SELL-format matrix with 8 threads cooperating per row.
 * Same scheme as kernel5 with a shallower reduction over threadIdx.y
 * (4 -> 2 -> 1). Launched with dim3(64, 8) blocks (MatMult_SeqSELLHIP, case 4).
 * NOTE(review): __syncthreads() is inside "if (row < nrows)" -- threads of a
 * partially filled last block skip the barrier; confirm this is safe. */
static __global__ void matmult_seqsell_tiled_kernel4(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[32 * 16]; /* one accumulator per thread; 512 entries = fixed block size */
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / sliceheight; /* slice that owns this row */
    row_in_slice = row % sliceheight; /* row offset within the slice */

    /* Strided accumulation of this row's nonzeros across the blockDim.y lanes. */
    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    /* Tree reduction over threadIdx.y: fold the upper half of lanes into the lower half. */
    if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 1) {
      /* Lane 0 folds in lane 1 and writes the finished dot product. */
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}
408*d52a580bSJunchao Zhang 
/* Compute y = A*x for a SELL-format matrix with 4 threads cooperating per row.
 * Same scheme as kernel5 with a 2 -> 1 reduction over threadIdx.y.
 * Launched with dim3(128, 4) blocks (MatMult_SeqSELLHIP, case 3).
 * NOTE(review): __syncthreads() is inside "if (row < nrows)" -- threads of a
 * partially filled last block skip the barrier; confirm this is safe. */
static __global__ void matmult_seqsell_tiled_kernel3(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[32 * 16]; /* one accumulator per thread; 512 entries = fixed block size */
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / sliceheight; /* slice that owns this row */
    row_in_slice = row % sliceheight; /* row offset within the slice */

    /* Strided accumulation of this row's nonzeros across the blockDim.y lanes. */
    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    /* Fold lanes 2..3 into lanes 0..1, then lane 1 into lane 0. */
    if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}
430*d52a580bSJunchao Zhang 
matmult_seqsell_tiled_kernel2(PetscInt nrows,PetscInt sliceheight,const PetscInt * acolidx,const MatScalar * aval,const PetscInt * sliidx,const PetscScalar * x,PetscScalar * y)431*d52a580bSJunchao Zhang static __global__ void matmult_seqsell_tiled_kernel2(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
432*d52a580bSJunchao Zhang {
433*d52a580bSJunchao Zhang   __shared__ MatScalar shared[32 * 16];
434*d52a580bSJunchao Zhang   PetscInt             i, row, slice_id, row_in_slice;
435*d52a580bSJunchao Zhang   /* multiple threads per row. */
436*d52a580bSJunchao Zhang   row = blockIdx.x * blockDim.x + threadIdx.x;
437*d52a580bSJunchao Zhang   if (row < nrows) {
438*d52a580bSJunchao Zhang     slice_id     = row / sliceheight;
439*d52a580bSJunchao Zhang     row_in_slice = row % sliceheight;
440*d52a580bSJunchao Zhang 
441*d52a580bSJunchao Zhang     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
442*d52a580bSJunchao Zhang     for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
443*d52a580bSJunchao Zhang     __syncthreads();
444*d52a580bSJunchao Zhang     if (threadIdx.y < 1) {
445*d52a580bSJunchao Zhang       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
446*d52a580bSJunchao Zhang       y[row] = shared[threadIdx.x];
447*d52a580bSJunchao Zhang     }
448*d52a580bSJunchao Zhang   }
449*d52a580bSJunchao Zhang }
450*d52a580bSJunchao Zhang 
/* Compute z = y + A*x for a SELL-format matrix with 32 threads cooperating per
 * row. Each threadIdx.y lane accumulates a strided subset of the row's
 * nonzeros into shared memory; the 32 partials are combined by a tree
 * reduction over threadIdx.y (16 -> 8 -> 4 -> 2 -> 1).
 * NOTE(review): __syncthreads() is inside "if (row < nrows)" -- threads of a
 * partially filled last block skip the barrier; confirm this is safe. */
static __global__ void matmultadd_seqsell_tiled_kernel6(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[32 * 16]; /* one accumulator per thread; 512 entries = fixed block size */
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / sliceheight; /* slice that owns this row */
    row_in_slice = row % sliceheight; /* row offset within the slice */

    /* Strided accumulation of this row's nonzeros across the blockDim.y lanes. */
    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    /* Tree reduction over threadIdx.y: fold the upper half of lanes into the lower half. */
    if (threadIdx.y < 16) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 8) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 1) {
      /* Lane 0 folds in lane 1, then adds the y contribution and stores z. */
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}
478*d52a580bSJunchao Zhang 
/* Compute z = y + A*x for a SELL-format matrix with 16 threads cooperating per
 * row; tree reduction over threadIdx.y (8 -> 4 -> 2 -> 1).
 * NOTE(review): __syncthreads() is inside "if (row < nrows)" -- threads of a
 * partially filled last block skip the barrier; confirm this is safe. */
static __global__ void matmultadd_seqsell_tiled_kernel5(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[32 * 16]; /* one accumulator per thread; 512 entries = fixed block size */
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / sliceheight; /* slice that owns this row */
    row_in_slice = row % sliceheight; /* row offset within the slice */

    /* Strided accumulation of this row's nonzeros across the blockDim.y lanes. */
    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    /* Tree reduction over threadIdx.y: fold the upper half of lanes into the lower half. */
    if (threadIdx.y < 8) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 1) {
      /* Lane 0 folds in lane 1, then adds the y contribution and stores z. */
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}
504*d52a580bSJunchao Zhang 
/* Compute z = y + A*x for a SELL-format matrix with 8 threads cooperating per
 * row; tree reduction over threadIdx.y (4 -> 2 -> 1).
 * NOTE(review): __syncthreads() is inside "if (row < nrows)" -- threads of a
 * partially filled last block skip the barrier; confirm this is safe. */
static __global__ void matmultadd_seqsell_tiled_kernel4(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[32 * 16]; /* one accumulator per thread; 512 entries = fixed block size */
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / sliceheight; /* slice that owns this row */
    row_in_slice = row % sliceheight; /* row offset within the slice */

    /* Strided accumulation of this row's nonzeros across the blockDim.y lanes. */
    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    /* Tree reduction over threadIdx.y: fold the upper half of lanes into the lower half. */
    if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 1) {
      /* Lane 0 folds in lane 1, then adds the y contribution and stores z. */
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}
528*d52a580bSJunchao Zhang 
/* Compute z = y + A*x for a SELL-format matrix with 4 threads cooperating per
 * row; 2 -> 1 reduction over threadIdx.y.
 * NOTE(review): __syncthreads() is inside "if (row < nrows)" -- threads of a
 * partially filled last block skip the barrier; confirm this is safe. */
static __global__ void matmultadd_seqsell_tiled_kernel3(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[32 * 16]; /* one accumulator per thread; 512 entries = fixed block size */
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / sliceheight; /* slice that owns this row */
    row_in_slice = row % sliceheight; /* row offset within the slice */

    /* Strided accumulation of this row's nonzeros across the blockDim.y lanes. */
    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    /* Fold lanes 2..3 into lanes 0..1, then lane 1 into lane 0. */
    if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}
550*d52a580bSJunchao Zhang 
matmultadd_seqsell_tiled_kernel2(PetscInt nrows,PetscInt sliceheight,const PetscInt * acolidx,const MatScalar * aval,const PetscInt * sliidx,const PetscScalar * x,const PetscScalar * y,PetscScalar * z)551*d52a580bSJunchao Zhang static __global__ void matmultadd_seqsell_tiled_kernel2(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
552*d52a580bSJunchao Zhang {
553*d52a580bSJunchao Zhang   __shared__ MatScalar shared[32 * 16];
554*d52a580bSJunchao Zhang   PetscInt             i, row, slice_id, row_in_slice;
555*d52a580bSJunchao Zhang   /* multiple threads per row. */
556*d52a580bSJunchao Zhang   row = blockIdx.x * blockDim.x + threadIdx.x;
557*d52a580bSJunchao Zhang   if (row < nrows) {
558*d52a580bSJunchao Zhang     slice_id     = row / sliceheight;
559*d52a580bSJunchao Zhang     row_in_slice = row % sliceheight;
560*d52a580bSJunchao Zhang 
561*d52a580bSJunchao Zhang     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
562*d52a580bSJunchao Zhang     for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
563*d52a580bSJunchao Zhang     __syncthreads();
564*d52a580bSJunchao Zhang     if (threadIdx.y < 1) {
565*d52a580bSJunchao Zhang       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
566*d52a580bSJunchao Zhang       z[row] = y[row] + shared[threadIdx.x];
567*d52a580bSJunchao Zhang     }
568*d52a580bSJunchao Zhang   }
569*d52a580bSJunchao Zhang }
570*d52a580bSJunchao Zhang 
/* Compute yy = A*xx on the GPU for a MATSEQSELL matrix, dispatching to one of
 * the HIP kernels above according to hipstruct->kernelchoice:
 *   9    - one slice per block; templated block height (blocky in {2,4,8,16})
 *   7    - blocky slices per block (real scalars only)
 *   2..6 - kernel k uses 2^(k-1) threads per row with shared-memory reduction
 *   1    - one thread per row (basic kernel)
 *   0    - heuristic choice driven by slice-width statistics (real scalars only)
 * The vector arrays are accessed through the Vec HIP interface; GPU time and
 * flops are logged. */
static PetscErrorCode MatMult_SeqSELLHIP(Mat A, Vec xx, Vec yy)
{
  Mat_SeqSELL       *a         = (Mat_SeqSELL *)A->data;
  Mat_SeqSELLHIP    *hipstruct = (Mat_SeqSELLHIP *)A->spptr;
  PetscScalar       *y;
  const PetscScalar *x;
  PetscInt           nrows = A->rmap->n, sliceheight = a->sliceheight;
  MatScalar         *aval;    /* device pointer: nonzero values */
  PetscInt          *acolidx; /* device pointer: column indices */
  PetscInt          *sliidx;  /* device pointer: slice offsets */
  PetscInt           nblocks, blocksize = 512; /* blocksize is fixed to be 512 */
  dim3               block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32);
#if !defined(PETSC_USE_COMPLEX)
  PetscInt  chunksperblock, nchunks, *chunk_slice_map;
  PetscReal maxoveravg;
#endif

  PetscFunctionBegin;
  PetscCheck(WARP_SIZE % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height be a divisor of WARP_SIZE, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight);
  /* NOTE(review): the message says "less than 32" but the condition actually permits sliceheight == 32. */
  PetscCheck(!(hipstruct->kernelchoice >= 2 && hipstruct->kernelchoice <= 6 && sliceheight > 32), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices {2-6} requires the slice height of the matrix be less than 32, but the current slice height is %" PetscInt_FMT, sliceheight);
  PetscCall(MatSeqSELLHIPCopyToGPU(A));
  /* hipstruct may not be available until MatSeqSELLHIPCopyToGPU() is called */
  aval    = hipstruct->val;
  acolidx = hipstruct->colidx;
  sliidx  = hipstruct->sliidx;

  PetscCall(VecHIPGetArrayRead(xx, &x));
  PetscCall(VecHIPGetArrayWrite(yy, &y));
  PetscCall(PetscLogGpuTimeBegin());

  switch (hipstruct->kernelchoice) {
#if !defined(PETSC_USE_COMPLEX)
  case 9: /* 1 slice per block */
    nblocks = 1 + (nrows - 1) / sliceheight;
    if (hipstruct->blocky == 2) {
      matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (hipstruct->blocky == 4) {
      matmult_seqsell_tiled_kernel9<4><<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (hipstruct->blocky == 8) {
      matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (hipstruct->blocky == 16) {
      matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else {
      /* unsupported blocky value: fall back to a block height of 2 */
      matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    }
    break;
  case 7: /* each block handles blocky slices */
    nblocks = 1 + (nrows - 1) / (hipstruct->blocky * sliceheight);
    if (hipstruct->blocky == 2) {
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (hipstruct->blocky == 4) {
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (hipstruct->blocky == 8) {
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (hipstruct->blocky == 16) {
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else {
      /* unsupported blocky value: recompute the grid for blocky = 2 and fall back */
      nblocks = 1 + (nrows - 1) / (2 * sliceheight);
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    }
    break;
#endif
  case 6:
    nblocks = 1 + (nrows - 1) / (blocksize / 32); /* 1 slice per block if sliceheight=32 */
    matmult_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    break;
  case 5:
    nblocks = 1 + (nrows - 1) / (blocksize / 16); /* 2 slices per block if sliceheight=32*/
    matmult_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    break;
  case 4:
    nblocks = 1 + (nrows - 1) / (blocksize / 8); /* 4 slices per block if sliceheight=32 */
    matmult_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    break;
  case 3:
    nblocks = 1 + (nrows - 1) / (blocksize / 4); /* 8 slices per block if sliceheight=32 */
    matmult_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    break;
  case 2: /* 16 slices per block if sliceheight=32 */
    nblocks = 1 + (nrows - 1) / (blocksize / 2);
    matmult_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    break;
  case 1: /* 32 slices per block if sliceheight=32 */
    nblocks = 1 + (nrows - 1) / blocksize;
    matmult_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    break;
#if !defined(PETSC_USE_COMPLEX)
  case 0:
    /* Heuristic dispatch: maxoveravg measures slice-width imbalance. */
    maxoveravg = a->maxslicewidth / a->avgslicewidth;
    if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* important threshold */
      /* each block handles approximately one slice */
      PetscInt blocky = a->chunksize / 32;
      nchunks         = hipstruct->totalchunks;
      chunksperblock  = hipstruct->chunksperblock ? hipstruct->chunksperblock : 1 + (hipstruct->totalentries / hipstruct->totalslices - 1) / a->chunksize;
      nblocks         = 1 + (nchunks - 1) / chunksperblock;
      chunk_slice_map = hipstruct->chunk_slice_map;
      if (blocky == 2) {
        matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
      } else if (blocky == 4) {
        matmult_seqsell_tiled_kernel8<4><<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
      } else if (blocky == 8) {
        matmult_seqsell_tiled_kernel8<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
      } else if (blocky == 16) {
        matmult_seqsell_tiled_kernel8<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
      } else {
        /* unsupported chunk-derived blocky: fall back to a block height of 2 */
        matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
      }
    } else {
      /* Balanced slices: pick a kernel by the average slice size. */
      PetscInt avgslicesize = sliceheight * a->avgslicewidth;
      if (avgslicesize <= 432) {
        if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) {
          nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */
          matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
        } else {
          nblocks = 1 + (nrows - 1) / sliceheight;
          matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
        }
      } else if (avgslicesize <= 2400) {
        nblocks = 1 + (nrows - 1) / sliceheight;
        matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
      } else {
        nblocks = 1 + (nrows - 1) / sliceheight;
        matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
      }
    }
    break;
#endif
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported kernel choice %" PetscInt_FMT " for MatMult_SeqSELLHIP.", hipstruct->kernelchoice);
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(VecHIPRestoreArrayRead(xx, &x));
  PetscCall(VecHIPRestoreArrayWrite(yy, &y));
  /* 2 flops per stored nonzero, minus one add per nonempty row. */
  PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
  PetscFunctionReturn(PETSC_SUCCESS);
}
707*d52a580bSJunchao Zhang 
MatMultAdd_SeqSELLHIP(Mat A,Vec xx,Vec yy,Vec zz)708*d52a580bSJunchao Zhang static PetscErrorCode MatMultAdd_SeqSELLHIP(Mat A, Vec xx, Vec yy, Vec zz)
709*d52a580bSJunchao Zhang {
710*d52a580bSJunchao Zhang   Mat_SeqSELL       *a         = (Mat_SeqSELL *)A->data;
711*d52a580bSJunchao Zhang   Mat_SeqSELLHIP    *hipstruct = (Mat_SeqSELLHIP *)A->spptr;
712*d52a580bSJunchao Zhang   PetscScalar       *z;
713*d52a580bSJunchao Zhang   const PetscScalar *y, *x;
714*d52a580bSJunchao Zhang   PetscInt           nrows = A->rmap->n, sliceheight = a->sliceheight;
715*d52a580bSJunchao Zhang   MatScalar         *aval    = hipstruct->val;
716*d52a580bSJunchao Zhang   PetscInt          *acolidx = hipstruct->colidx;
717*d52a580bSJunchao Zhang   PetscInt          *sliidx  = hipstruct->sliidx;
718*d52a580bSJunchao Zhang #if !defined(PETSC_USE_COMPLEX)
719*d52a580bSJunchao Zhang   PetscReal maxoveravg;
720*d52a580bSJunchao Zhang   PetscInt  chunksperblock, nchunks, *chunk_slice_map;
721*d52a580bSJunchao Zhang   PetscInt  blocky = hipstruct->blocky;
722*d52a580bSJunchao Zhang #endif
723*d52a580bSJunchao Zhang 
724*d52a580bSJunchao Zhang   PetscFunctionBegin;
725*d52a580bSJunchao Zhang   PetscCheck(WARP_SIZE % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height be a divisor of WARP_SIZE, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight);
726*d52a580bSJunchao Zhang   PetscCheck(!(hipstruct->kernelchoice >= 2 && hipstruct->kernelchoice <= 6 && sliceheight != sliceheight), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices {2-6} requires the slice height of the matrix be 16, but the current slice height is %" PetscInt_FMT, sliceheight);
727*d52a580bSJunchao Zhang   PetscCall(MatSeqSELLHIPCopyToGPU(A));
728*d52a580bSJunchao Zhang   if (a->nz) {
729*d52a580bSJunchao Zhang     PetscInt nblocks, blocksize = 512;
730*d52a580bSJunchao Zhang     dim3     block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32);
731*d52a580bSJunchao Zhang     PetscCall(VecHIPGetArrayRead(xx, &x));
732*d52a580bSJunchao Zhang     PetscCall(VecHIPGetArrayRead(yy, &y));
733*d52a580bSJunchao Zhang     PetscCall(VecHIPGetArrayWrite(zz, &z));
734*d52a580bSJunchao Zhang     PetscCall(PetscLogGpuTimeBegin());
735*d52a580bSJunchao Zhang 
736*d52a580bSJunchao Zhang     switch (hipstruct->kernelchoice) {
737*d52a580bSJunchao Zhang #if !defined(PETSC_USE_COMPLEX)
738*d52a580bSJunchao Zhang     case 9:
739*d52a580bSJunchao Zhang       nblocks = 1 + (nrows - 1) / sliceheight;
740*d52a580bSJunchao Zhang       if (blocky == 2) {
741*d52a580bSJunchao Zhang         matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
742*d52a580bSJunchao Zhang       } else if (blocky == 4) {
743*d52a580bSJunchao Zhang         matmultadd_seqsell_tiled_kernel9<4><<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
744*d52a580bSJunchao Zhang       } else if (blocky == 8) {
745*d52a580bSJunchao Zhang         matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
746*d52a580bSJunchao Zhang       } else if (blocky == 16) {
747*d52a580bSJunchao Zhang         matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
748*d52a580bSJunchao Zhang       } else {
749*d52a580bSJunchao Zhang         matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
750*d52a580bSJunchao Zhang       }
751*d52a580bSJunchao Zhang       break;
752*d52a580bSJunchao Zhang     case 8:
753*d52a580bSJunchao Zhang       /* each block handles approximately one slice */
754*d52a580bSJunchao Zhang       nchunks         = hipstruct->totalchunks;
755*d52a580bSJunchao Zhang       blocky          = a->chunksize / 32;
756*d52a580bSJunchao Zhang       chunksperblock  = hipstruct->chunksperblock ? hipstruct->chunksperblock : 1 + (hipstruct->totalentries / hipstruct->totalslices - 1) / a->chunksize;
757*d52a580bSJunchao Zhang       nblocks         = 1 + (nchunks - 1) / chunksperblock;
758*d52a580bSJunchao Zhang       chunk_slice_map = hipstruct->chunk_slice_map;
759*d52a580bSJunchao Zhang       if (blocky == 2) {
760*d52a580bSJunchao Zhang         matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
761*d52a580bSJunchao Zhang       } else if (blocky == 4) {
762*d52a580bSJunchao Zhang         matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
763*d52a580bSJunchao Zhang       } else if (blocky == 8) {
764*d52a580bSJunchao Zhang         matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
765*d52a580bSJunchao Zhang       } else if (blocky == 16) {
766*d52a580bSJunchao Zhang         matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
767*d52a580bSJunchao Zhang       } else {
768*d52a580bSJunchao Zhang         matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
769*d52a580bSJunchao Zhang       }
770*d52a580bSJunchao Zhang       break;
771*d52a580bSJunchao Zhang     case 7:
772*d52a580bSJunchao Zhang       nblocks = 1 + (nrows - 1) / (blocky * sliceheight);
773*d52a580bSJunchao Zhang       if (blocky == 2) {
774*d52a580bSJunchao Zhang         matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
775*d52a580bSJunchao Zhang       } else if (blocky == 4) {
776*d52a580bSJunchao Zhang         matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
777*d52a580bSJunchao Zhang       } else if (blocky == 8) {
778*d52a580bSJunchao Zhang         matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
779*d52a580bSJunchao Zhang       } else if (blocky == 16) {
780*d52a580bSJunchao Zhang         matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
781*d52a580bSJunchao Zhang       } else {
782*d52a580bSJunchao Zhang         nblocks = 1 + (nrows - 1) / (2 * sliceheight);
783*d52a580bSJunchao Zhang         matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
784*d52a580bSJunchao Zhang       }
785*d52a580bSJunchao Zhang       break;
786*d52a580bSJunchao Zhang #endif
787*d52a580bSJunchao Zhang     case 6:
788*d52a580bSJunchao Zhang       nblocks = 1 + (nrows - 1) / (blocksize / 32);
789*d52a580bSJunchao Zhang       matmultadd_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
790*d52a580bSJunchao Zhang       break;
791*d52a580bSJunchao Zhang     case 5:
792*d52a580bSJunchao Zhang       nblocks = 1 + (nrows - 1) / (blocksize / 16);
793*d52a580bSJunchao Zhang       matmultadd_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
794*d52a580bSJunchao Zhang       break;
795*d52a580bSJunchao Zhang     case 4:
796*d52a580bSJunchao Zhang       nblocks = 1 + (nrows - 1) / (blocksize / 8);
797*d52a580bSJunchao Zhang       matmultadd_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
798*d52a580bSJunchao Zhang       break;
799*d52a580bSJunchao Zhang     case 3:
800*d52a580bSJunchao Zhang       nblocks = 1 + (nrows - 1) / (blocksize / 4);
801*d52a580bSJunchao Zhang       matmultadd_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
802*d52a580bSJunchao Zhang       break;
803*d52a580bSJunchao Zhang     case 2:
804*d52a580bSJunchao Zhang       nblocks = 1 + (nrows - 1) / (blocksize / 2);
805*d52a580bSJunchao Zhang       matmultadd_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
806*d52a580bSJunchao Zhang       break;
807*d52a580bSJunchao Zhang     case 1:
808*d52a580bSJunchao Zhang       nblocks = 1 + (nrows - 1) / blocksize;
809*d52a580bSJunchao Zhang       matmultadd_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
810*d52a580bSJunchao Zhang       break;
811*d52a580bSJunchao Zhang #if !defined(PETSC_USE_COMPLEX)
812*d52a580bSJunchao Zhang     case 0:
813*d52a580bSJunchao Zhang       maxoveravg = a->maxslicewidth / a->avgslicewidth;
814*d52a580bSJunchao Zhang       if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* important threshold */
815*d52a580bSJunchao Zhang         /* each block handles approximately one slice */
816*d52a580bSJunchao Zhang         nchunks         = hipstruct->totalchunks;
817*d52a580bSJunchao Zhang         blocky          = a->chunksize / 32;
818*d52a580bSJunchao Zhang         chunksperblock  = hipstruct->chunksperblock ? hipstruct->chunksperblock : 1 + (hipstruct->totalentries / hipstruct->totalslices - 1) / a->chunksize;
819*d52a580bSJunchao Zhang         nblocks         = 1 + (nchunks - 1) / chunksperblock;
820*d52a580bSJunchao Zhang         chunk_slice_map = hipstruct->chunk_slice_map;
821*d52a580bSJunchao Zhang         if (blocky == 2) {
822*d52a580bSJunchao Zhang           matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
823*d52a580bSJunchao Zhang         } else if (blocky == 4) {
824*d52a580bSJunchao Zhang           matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
825*d52a580bSJunchao Zhang         } else if (blocky == 8) {
826*d52a580bSJunchao Zhang           matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
827*d52a580bSJunchao Zhang         } else if (blocky == 16) {
828*d52a580bSJunchao Zhang           matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
829*d52a580bSJunchao Zhang         } else {
830*d52a580bSJunchao Zhang           matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
831*d52a580bSJunchao Zhang         }
832*d52a580bSJunchao Zhang       } else {
833*d52a580bSJunchao Zhang         PetscInt avgslicesize = sliceheight * a->avgslicewidth;
834*d52a580bSJunchao Zhang         if (avgslicesize <= 432) {
835*d52a580bSJunchao Zhang           if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) {
836*d52a580bSJunchao Zhang             nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */
837*d52a580bSJunchao Zhang             matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
838*d52a580bSJunchao Zhang           } else {
839*d52a580bSJunchao Zhang             nblocks = 1 + (nrows - 1) / sliceheight;
840*d52a580bSJunchao Zhang             matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
841*d52a580bSJunchao Zhang           }
842*d52a580bSJunchao Zhang         } else if (avgslicesize <= 2400) {
843*d52a580bSJunchao Zhang           nblocks = 1 + (nrows - 1) / sliceheight;
844*d52a580bSJunchao Zhang           matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
845*d52a580bSJunchao Zhang         } else {
846*d52a580bSJunchao Zhang           nblocks = 1 + (nrows - 1) / sliceheight;
847*d52a580bSJunchao Zhang           matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
848*d52a580bSJunchao Zhang         }
849*d52a580bSJunchao Zhang       }
850*d52a580bSJunchao Zhang       break;
851*d52a580bSJunchao Zhang #endif
852*d52a580bSJunchao Zhang     default:
853*d52a580bSJunchao Zhang       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported kernel choice %" PetscInt_FMT " for MatMult_SeqSELLHIP.", hipstruct->kernelchoice);
854*d52a580bSJunchao Zhang     }
855*d52a580bSJunchao Zhang     PetscCall(PetscLogGpuTimeEnd());
856*d52a580bSJunchao Zhang     PetscCall(VecHIPRestoreArrayRead(xx, &x));
857*d52a580bSJunchao Zhang     PetscCall(VecHIPRestoreArrayRead(yy, &y));
858*d52a580bSJunchao Zhang     PetscCall(VecHIPRestoreArrayWrite(zz, &z));
859*d52a580bSJunchao Zhang     PetscCall(PetscLogGpuFlops(2.0 * a->nz));
860*d52a580bSJunchao Zhang   } else {
861*d52a580bSJunchao Zhang     PetscCall(VecCopy(yy, zz));
862*d52a580bSJunchao Zhang   }
863*d52a580bSJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
864*d52a580bSJunchao Zhang }
865*d52a580bSJunchao Zhang 
MatSetFromOptions_SeqSELLHIP(Mat A,PetscOptionItems PetscOptionsObject)866*d52a580bSJunchao Zhang static PetscErrorCode MatSetFromOptions_SeqSELLHIP(Mat A, PetscOptionItems PetscOptionsObject)
867*d52a580bSJunchao Zhang {
868*d52a580bSJunchao Zhang   Mat_SeqSELLHIP *hipstruct = (Mat_SeqSELLHIP *)A->spptr;
869*d52a580bSJunchao Zhang   PetscInt        kernel, blocky;
870*d52a580bSJunchao Zhang   PetscBool       flg;
871*d52a580bSJunchao Zhang 
872*d52a580bSJunchao Zhang   PetscFunctionBegin;
873*d52a580bSJunchao Zhang   PetscOptionsHeadBegin(PetscOptionsObject, "SeqSELLHIP options");
874*d52a580bSJunchao Zhang   PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_hip_blocky", &blocky, &flg));
875*d52a580bSJunchao Zhang   if (flg) {
876*d52a580bSJunchao Zhang     PetscCheck(blocky == 2 || blocky == 4 || blocky == 8 || blocky == 16 || blocky == 32, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Unsupported blocky: %" PetscInt_FMT " it should be in {2,4,8,16,32}", blocky);
877*d52a580bSJunchao Zhang     hipstruct->blocky = blocky;
878*d52a580bSJunchao Zhang   }
879*d52a580bSJunchao Zhang   PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_hip_kernel", &kernel, &flg));
880*d52a580bSJunchao Zhang   if (flg) {
881*d52a580bSJunchao Zhang     PetscCheck(kernel >= 0 && kernel <= 9, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Wrong kernel choice: %" PetscInt_FMT " it should be in [0,9]", kernel);
882*d52a580bSJunchao Zhang     hipstruct->kernelchoice = kernel;
883*d52a580bSJunchao Zhang     if (kernel == 8) PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_hip_chunksperblock", &hipstruct->chunksperblock, &flg));
884*d52a580bSJunchao Zhang   }
885*d52a580bSJunchao Zhang   PetscOptionsHeadEnd();
886*d52a580bSJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
887*d52a580bSJunchao Zhang }
888*d52a580bSJunchao Zhang 
MatAssemblyEnd_SpMV_Preprocessing_Private(Mat A)889*d52a580bSJunchao Zhang PETSC_INTERN PetscErrorCode MatAssemblyEnd_SpMV_Preprocessing_Private(Mat A)
890*d52a580bSJunchao Zhang {
891*d52a580bSJunchao Zhang   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
892*d52a580bSJunchao Zhang 
893*d52a580bSJunchao Zhang   PetscFunctionBegin;
894*d52a580bSJunchao Zhang   PetscCall(MatSeqSELLGetAvgSliceWidth(A, &a->avgslicewidth));
895*d52a580bSJunchao Zhang   PetscCall(MatSeqSELLGetMaxSliceWidth(A, &a->maxslicewidth));
896*d52a580bSJunchao Zhang   PetscCall(MatSeqSELLGetFillRatio(A, &a->fillratio));
897*d52a580bSJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
898*d52a580bSJunchao Zhang }
899*d52a580bSJunchao Zhang 
MatAssemblyEnd_SeqSELLHIP(Mat A,MatAssemblyType mode)900*d52a580bSJunchao Zhang static PetscErrorCode MatAssemblyEnd_SeqSELLHIP(Mat A, MatAssemblyType mode)
901*d52a580bSJunchao Zhang {
902*d52a580bSJunchao Zhang   PetscFunctionBegin;
903*d52a580bSJunchao Zhang   PetscCall(MatAssemblyEnd_SeqSELL(A, mode));
904*d52a580bSJunchao Zhang   PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(A));
905*d52a580bSJunchao Zhang   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
906*d52a580bSJunchao Zhang   if (A->factortype == MAT_FACTOR_NONE) PetscCall(MatSeqSELLHIPCopyToGPU(A));
907*d52a580bSJunchao Zhang   A->ops->mult    = MatMult_SeqSELLHIP;
908*d52a580bSJunchao Zhang   A->ops->multadd = MatMultAdd_SeqSELLHIP;
909*d52a580bSJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
910*d52a580bSJunchao Zhang }
911*d52a580bSJunchao Zhang 
MatZeroEntries_SeqSELLHIP(Mat A)912*d52a580bSJunchao Zhang static PetscErrorCode MatZeroEntries_SeqSELLHIP(Mat A)
913*d52a580bSJunchao Zhang {
914*d52a580bSJunchao Zhang   PetscBool    both = PETSC_FALSE;
915*d52a580bSJunchao Zhang   Mat_SeqSELL *a    = (Mat_SeqSELL *)A->data;
916*d52a580bSJunchao Zhang 
917*d52a580bSJunchao Zhang   PetscFunctionBegin;
918*d52a580bSJunchao Zhang   if (A->factortype == MAT_FACTOR_NONE) {
919*d52a580bSJunchao Zhang     Mat_SeqSELLHIP *hipstruct = (Mat_SeqSELLHIP *)A->spptr;
920*d52a580bSJunchao Zhang     if (hipstruct->val) {
921*d52a580bSJunchao Zhang       both = PETSC_TRUE;
922*d52a580bSJunchao Zhang       PetscCallHIP(hipMemset(hipstruct->val, 0, a->sliidx[a->totalslices] * sizeof(*hipstruct->val)));
923*d52a580bSJunchao Zhang     }
924*d52a580bSJunchao Zhang   }
925*d52a580bSJunchao Zhang   PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices]));
926*d52a580bSJunchao Zhang   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
927*d52a580bSJunchao Zhang   else A->offloadmask = PETSC_OFFLOAD_CPU;
928*d52a580bSJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
929*d52a580bSJunchao Zhang }
930*d52a580bSJunchao Zhang 
MatDestroy_SeqSELLHIP(Mat A)931*d52a580bSJunchao Zhang static PetscErrorCode MatDestroy_SeqSELLHIP(Mat A)
932*d52a580bSJunchao Zhang {
933*d52a580bSJunchao Zhang   PetscFunctionBegin;
934*d52a580bSJunchao Zhang   if (A->factortype == MAT_FACTOR_NONE && A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) PetscCall(MatSeqSELLHIP_Destroy((Mat_SeqSELLHIP **)&A->spptr));
935*d52a580bSJunchao Zhang   PetscCall(MatDestroy_SeqSELL(A));
936*d52a580bSJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
937*d52a580bSJunchao Zhang }
938*d52a580bSJunchao Zhang 
939*d52a580bSJunchao Zhang PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLHIP(Mat);
MatDuplicate_SeqSELLHIP(Mat A,MatDuplicateOption cpvalues,Mat * B)940*d52a580bSJunchao Zhang static PetscErrorCode       MatDuplicate_SeqSELLHIP(Mat A, MatDuplicateOption cpvalues, Mat *B)
941*d52a580bSJunchao Zhang {
942*d52a580bSJunchao Zhang   PetscFunctionBegin;
943*d52a580bSJunchao Zhang   PetscCall(MatDuplicate_SeqSELL(A, cpvalues, B));
944*d52a580bSJunchao Zhang   PetscCall(MatConvert_SeqSELL_SeqSELLHIP(*B));
945*d52a580bSJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
946*d52a580bSJunchao Zhang }
947*d52a580bSJunchao Zhang 
MatConvert_SeqSELL_SeqSELLHIP(Mat B)948*d52a580bSJunchao Zhang PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLHIP(Mat B)
949*d52a580bSJunchao Zhang {
950*d52a580bSJunchao Zhang   Mat_SeqSELLHIP *hipstruct;
951*d52a580bSJunchao Zhang 
952*d52a580bSJunchao Zhang   PetscFunctionBegin;
953*d52a580bSJunchao Zhang   PetscCall(PetscFree(B->defaultvectype));
954*d52a580bSJunchao Zhang   PetscCall(PetscStrallocpy(VECHIP, &B->defaultvectype));
955*d52a580bSJunchao Zhang 
956*d52a580bSJunchao Zhang   if (!B->spptr) {
957*d52a580bSJunchao Zhang     if (B->factortype == MAT_FACTOR_NONE) {
958*d52a580bSJunchao Zhang       PetscCall(PetscNew(&hipstruct));
959*d52a580bSJunchao Zhang       B->spptr = hipstruct;
960*d52a580bSJunchao Zhang     }
961*d52a580bSJunchao Zhang   }
962*d52a580bSJunchao Zhang 
963*d52a580bSJunchao Zhang   B->ops->assemblyend    = MatAssemblyEnd_SeqSELLHIP;
964*d52a580bSJunchao Zhang   B->ops->destroy        = MatDestroy_SeqSELLHIP;
965*d52a580bSJunchao Zhang   B->ops->setfromoptions = MatSetFromOptions_SeqSELLHIP;
966*d52a580bSJunchao Zhang   B->ops->mult           = MatMult_SeqSELLHIP;
967*d52a580bSJunchao Zhang   B->ops->multadd        = MatMultAdd_SeqSELLHIP;
968*d52a580bSJunchao Zhang   B->ops->duplicate      = MatDuplicate_SeqSELLHIP;
969*d52a580bSJunchao Zhang   B->ops->zeroentries    = MatZeroEntries_SeqSELLHIP;
970*d52a580bSJunchao Zhang 
971*d52a580bSJunchao Zhang   /* No need to assemble SeqSELL, but need to do the preprocessing for SpMV */
972*d52a580bSJunchao Zhang   PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(B));
973*d52a580bSJunchao Zhang 
974*d52a580bSJunchao Zhang   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELLHIP));
975*d52a580bSJunchao Zhang   B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
976*d52a580bSJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
977*d52a580bSJunchao Zhang }
978*d52a580bSJunchao Zhang 
979*d52a580bSJunchao Zhang /*MC
980*d52a580bSJunchao Zhang   MATSEQSELLHIP - MATSELLHIP = "(seq)sellhip" - A matrix type to be used for sparse matrices on AMD GPUs.
981*d52a580bSJunchao Zhang 
982*d52a580bSJunchao Zhang   Options Database Keys:
983*d52a580bSJunchao Zhang +  -mat_type seqsellhip - sets the matrix type to "seqsellhip" during a call to `MatSetFromOptions()`
984*d52a580bSJunchao Zhang .  -mat_sell_spmv_hip_kernel - selects a spmv kernel for MatSELLHIP
985*d52a580bSJunchao Zhang -  -mat_sell_spmv_hip_blocky - sets the y dimension of the block size of the spmv kernels. These kernels use a 2D block with the x dimension equal to the warp size (normally 64 for AMD GPUs)
986*d52a580bSJunchao Zhang 
987*d52a580bSJunchao Zhang   Level: beginner
988*d52a580bSJunchao Zhang 
989*d52a580bSJunchao Zhang .seealso: [](ch_matrices), `Mat`, `MATSELLHIP`
990*d52a580bSJunchao Zhang M*/
991*d52a580bSJunchao Zhang 
MatCreate_SeqSELLHIP(Mat B)992*d52a580bSJunchao Zhang PETSC_EXTERN PetscErrorCode MatCreate_SeqSELLHIP(Mat B)
993*d52a580bSJunchao Zhang {
994*d52a580bSJunchao Zhang   PetscFunctionBegin;
995*d52a580bSJunchao Zhang   PetscCall(MatCreate_SeqSELL(B));
996*d52a580bSJunchao Zhang   PetscCall(MatConvert_SeqSELL_SeqSELLHIP(B));
997*d52a580bSJunchao Zhang   PetscCall(MatSetFromOptions(B));
998*d52a580bSJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
999*d52a580bSJunchao Zhang }
1000