xref: /petsc/src/mat/impls/sell/seq/seqhip/sellhip.hip.cxx (revision daba9d70159ea2f6905738fcbec7404635487b2b)
1 #include "hip/hip_runtime.h"
2 #include <hip/hip_runtime.h>
3 
4 #include <petscdevice_hip.h>
5 #include <petsc/private/cupmatomics.hpp>
6 #include <../src/mat/impls/sell/seq/sell.h> /*I   "petscmat.h"  I*/
7 
8 #define WARP_SIZE 64
9 
10 typedef struct {
11   PetscInt   maxallocmat;
12   PetscInt   totalentries;
13   PetscInt  *colidx; /* column index array, device pointer */
14   MatScalar *val;    /* value array, device pointer */
15   PetscInt   totalslices;
16   PetscInt  *sliidx; /* slice index array, device pointer */
17   PetscInt   nonzerostate;
18   PetscInt   kernelchoice;
19   PetscInt   blocky;
20   PetscInt   chunksperblock;
21   PetscInt   totalchunks;
22   PetscInt  *chunk_slice_map; /* starting slice for each chunk, device pointer */
23 } Mat_SeqSELLHIP;
24 
MatSeqSELLHIP_Destroy(Mat_SeqSELLHIP ** hipstruct)25 static PetscErrorCode MatSeqSELLHIP_Destroy(Mat_SeqSELLHIP **hipstruct)
26 {
27   PetscFunctionBegin;
28   if (*hipstruct) {
29     if ((*hipstruct)->colidx) PetscCallHIP(hipFree((*hipstruct)->colidx));
30     if ((*hipstruct)->val) PetscCallHIP(hipFree((*hipstruct)->val));
31     if ((*hipstruct)->sliidx) PetscCallHIP(hipFree((*hipstruct)->sliidx));
32     if ((*hipstruct)->chunk_slice_map) PetscCallHIP(hipFree((*hipstruct)->chunk_slice_map));
33     PetscCall(PetscFree(*hipstruct));
34   }
35   PetscFunctionReturn(PETSC_SUCCESS);
36 }
37 
MatSeqSELLHIPCopyToGPU(Mat A)38 static PetscErrorCode MatSeqSELLHIPCopyToGPU(Mat A)
39 {
40   Mat_SeqSELLHIP *hipstruct = (Mat_SeqSELLHIP *)A->spptr;
41   Mat_SeqSELL    *a         = (Mat_SeqSELL *)A->data;
42 
43   PetscFunctionBegin;
44   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
45     PetscCall(PetscLogEventBegin(MAT_HIPCopyToGPU, A, 0, 0, 0));
46     if (A->assembled && A->nonzerostate == hipstruct->nonzerostate) {
47       /* copy values only */
48       PetscCallHIP(hipMemcpy(hipstruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), hipMemcpyHostToDevice));
49       PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar))));
50     } else {
51       if (hipstruct->colidx) PetscCallHIP(hipFree(hipstruct->colidx));
52       if (hipstruct->val) PetscCallHIP(hipFree(hipstruct->val));
53       if (hipstruct->sliidx) PetscCallHIP(hipFree(hipstruct->sliidx));
54       if (hipstruct->chunk_slice_map) PetscCallHIP(hipFree(hipstruct->chunk_slice_map));
55       hipstruct->maxallocmat  = a->maxallocmat;
56       hipstruct->totalentries = a->sliidx[a->totalslices];
57       hipstruct->totalslices  = a->totalslices;
58       hipstruct->totalchunks  = a->totalchunks;
59       PetscCallHIP(hipMalloc((void **)&hipstruct->colidx, a->maxallocmat * sizeof(*hipstruct->colidx)));
60       PetscCallHIP(hipMalloc((void **)&hipstruct->val, a->maxallocmat * sizeof(*hipstruct->val)));
61       /* copy values, nz or maxallocmat? */
62       PetscCallHIP(hipMemcpy(hipstruct->colidx, a->colidx, a->sliidx[a->totalslices] * sizeof(*a->colidx), hipMemcpyHostToDevice));
63       PetscCallHIP(hipMemcpy(hipstruct->val, a->val, a->sliidx[a->totalslices] * sizeof(*a->val), hipMemcpyHostToDevice));
64 
65       PetscCallHIP(hipMalloc((void **)&hipstruct->sliidx, (a->totalslices + 1) * sizeof(*hipstruct->sliidx)));
66       PetscCallHIP(hipMemcpy(hipstruct->sliidx, a->sliidx, (a->totalslices + 1) * sizeof(*a->sliidx), hipMemcpyHostToDevice));
67       PetscCallHIP(hipMalloc((void **)&hipstruct->chunk_slice_map, a->totalchunks * sizeof(*hipstruct->chunk_slice_map)));
68       PetscCallHIP(hipMemcpy(hipstruct->chunk_slice_map, a->chunk_slice_map, a->totalchunks * sizeof(*a->chunk_slice_map), hipMemcpyHostToDevice));
69       PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar) + sizeof(PetscInt)) + (a->totalslices + 1 + a->totalchunks) * sizeof(PetscInt)));
70     }
71     PetscCallHIP(WaitForHIP());
72     PetscCall(PetscLogEventEnd(MAT_HIPCopyToGPU, A, 0, 0, 0));
73     A->offloadmask = PETSC_OFFLOAD_BOTH;
74   }
75   PetscFunctionReturn(PETSC_SUCCESS);
76 }
77 
matmult_seqsell_basic_kernel(PetscInt nrows,PetscInt sliceheight,const PetscInt * acolidx,const MatScalar * aval,const PetscInt * sliidx,const PetscScalar * x,PetscScalar * y)78 static __global__ void matmult_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
79 {
80   PetscInt  i, row, slice_id, row_in_slice;
81   MatScalar sum;
82   /* one thread per row. */
83   row = blockIdx.x * blockDim.x + threadIdx.x;
84   if (row < nrows) {
85     slice_id     = row / sliceheight;
86     row_in_slice = row % sliceheight;
87     sum          = 0.0;
88     for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]];
89     y[row] = sum;
90   }
91 }
92 
matmultadd_seqsell_basic_kernel(PetscInt nrows,PetscInt sliceheight,const PetscInt * acolidx,const MatScalar * aval,const PetscInt * sliidx,const PetscScalar * x,const PetscScalar * y,PetscScalar * z)93 static __global__ void matmultadd_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
94 {
95   PetscInt  i, row, slice_id, row_in_slice;
96   MatScalar sum;
97   /* one thread per row. */
98   row = blockIdx.x * blockDim.x + threadIdx.x;
99   if (row < nrows) {
100     slice_id     = row / sliceheight;
101     row_in_slice = row % sliceheight;
102     sum          = 0.0;
103     for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]];
104     z[row] = y[row] + sum;
105   }
106 }
107 
108 #if !defined(PETSC_USE_COMPLEX)
109 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wpass-failed")
110 /* use 1 block per slice, suitable for large slice width */
111 template <int BLOCKY>
matmult_seqsell_tiled_kernel9(PetscInt nrows,PetscInt sliceheight,const PetscInt * acolidx,const MatScalar * aval,const PetscInt * sliidx,const PetscScalar * x,PetscScalar * y)112 __global__ void matmult_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
113 {
114   __shared__ MatScalar shared[WARP_SIZE][BLOCKY];
115   PetscInt             i, row, slice_id = blockIdx.x;
116   int                  tid = threadIdx.x + threadIdx.y * WARP_SIZE;
117   /* transposed index */
118   int         tidx = tid % BLOCKY;
119   int         tidy = tid / BLOCKY;
120   PetscScalar t    = 0.0;
121 
122   row = slice_id * sliceheight + threadIdx.x % sliceheight;
123   if (row < nrows) {
124     for (i = sliidx[slice_id] + threadIdx.x + WARP_SIZE * threadIdx.y; i < sliidx[slice_id + 1]; i += WARP_SIZE * BLOCKY) t += aval[i] * x[acolidx[i]];
125   }
126   #pragma unroll
127   for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) t += __shfl_down(t, offset);
128   /* transpose layout to reduce each row using warp shfl */
129   if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t;
130   __syncthreads();
131   if (tidy < sliceheight) t = shared[tidy][tidx];
132   #pragma unroll
133   for (int offset = BLOCKY / 2; offset > 0; offset /= 2) t += __shfl_down(t, offset, BLOCKY);
134   if (tidx == 0 && tidy < sliceheight) shared[0][tidy] = t;
135   __syncthreads();
136   if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) y[row] = shared[0][threadIdx.x];
137 }
138 
139 /* use 1 block per slice, suitable for large slice width */
140 template <int BLOCKY>
matmultadd_seqsell_tiled_kernel9(PetscInt nrows,PetscInt sliceheight,const PetscInt * acolidx,const MatScalar * aval,const PetscInt * sliidx,const PetscScalar * x,const PetscScalar * y,PetscScalar * z)141 __global__ void matmultadd_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
142 {
143   __shared__ MatScalar shared[WARP_SIZE][BLOCKY];
144   PetscInt             i, row, slice_id = blockIdx.x;
145   int                  tid = threadIdx.x + threadIdx.y * WARP_SIZE;
146   /* transposed index */
147   int         tidx = tid % BLOCKY;
148   int         tidy = tid / BLOCKY;
149   PetscScalar t    = 0.0;
150 
151   row = slice_id * sliceheight + threadIdx.x % sliceheight;
152   if (row < nrows) {
153     for (i = sliidx[slice_id] + threadIdx.x + WARP_SIZE * threadIdx.y; i < sliidx[slice_id + 1]; i += WARP_SIZE * BLOCKY) t += aval[i] * x[acolidx[i]];
154   }
155   #pragma unroll
156   for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) t += __shfl_down(t, offset);
157   /* transpose layout to reduce each row using warp shfl */
158   if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t;
159   __syncthreads();
160   if (tidy < sliceheight) t = shared[tidy][tidx];
161   #pragma unroll
162   for (int offset = BLOCKY / 2; offset > 0; offset /= 2) t += __shfl_down(t, offset, BLOCKY);
163   if (tidx == 0 && tidy < sliceheight) shared[0][tidy] = t;
164   __syncthreads();
165   if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) z[row] = y[row] + shared[0][threadIdx.x];
166 }
167 
168 template <int BLOCKY>
segment_scan(PetscInt flag[],MatScalar shared[],PetscScalar * val)169 __device__ __forceinline__ static bool segment_scan(PetscInt flag[], MatScalar shared[], PetscScalar *val)
170 {
171   bool head = true;
172   #pragma unroll
173   for (int i = 1; i < BLOCKY * 2; i <<= 1) {
174     int halfwarpid                                = threadIdx.y * 2 + threadIdx.x / (WARP_SIZE / 2);
175     shared[threadIdx.x + threadIdx.y * WARP_SIZE] = 0;
176     if (halfwarpid >= i && flag[halfwarpid - i] == flag[halfwarpid]) {
177       shared[threadIdx.x + threadIdx.y * WARP_SIZE] = *val;
178       if (i == 1) head = false;
179     }
180     __syncthreads();
181     if (halfwarpid < BLOCKY * 2 - i) *val += shared[threadIdx.x + threadIdx.y * WARP_SIZE + i * WARP_SIZE];
182     __syncthreads();
183   }
184   return head;
185 }
186 
187 /* load-balancing version. Chunksize is equal to the number of threads per block */
188 template <int BLOCKY>
matmult_seqsell_tiled_kernel8(PetscInt nrows,PetscInt sliceheight,PetscInt chunksperblock,PetscInt totalchunks,const PetscInt * chunk_slice_map,const PetscInt * acolidx,const MatScalar * aval,const PetscInt * sliidx,const PetscScalar * x,PetscScalar * y)189 __global__ void matmult_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunksperblock, PetscInt totalchunks, const PetscInt *chunk_slice_map, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
190 {
191   __shared__ MatScalar shared[BLOCKY * WARP_SIZE];
192   PetscInt             gid, row, start_slice, cid;
193   PetscScalar          t = 0.0;
194   AtomicAdd<MatScalar> atomAdd;
195   /* zero out y */
196   for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * WARP_SIZE * BLOCKY); iter++) {
197     gid = gridDim.x * WARP_SIZE * BLOCKY * iter + blockIdx.x * BLOCKY * WARP_SIZE + threadIdx.y * WARP_SIZE + threadIdx.x;
198     if (gid < nrows) y[gid] = 0.0;
199   }
200   for (int iter = 0; iter < chunksperblock; iter++) {
201     cid = blockIdx.x * chunksperblock + iter; /* chunk id */
202     if (cid < totalchunks) {
203       start_slice = chunk_slice_map[cid]; /* starting slice at each iteration */
204       gid         = cid * BLOCKY * WARP_SIZE + threadIdx.y * WARP_SIZE + threadIdx.x;
205       if ((cid + 1) * BLOCKY * WARP_SIZE > sliidx[start_slice + 1]) { /* this iteration covers more than one slice */
206         __shared__ PetscInt flag[BLOCKY * 2];
207         bool                write;
208         PetscInt            slice_id = start_slice, totalslices = PetscCeilIntMacro(nrows, sliceheight), totalentries = sliidx[totalslices];
209         /* find out the slice that this element belongs to */
210         while (gid < totalentries && gid >= sliidx[slice_id + 1]) slice_id++;
211         if (threadIdx.x % (WARP_SIZE / 2) == 0) flag[threadIdx.y * 2 + threadIdx.x / (WARP_SIZE / 2)] = slice_id;
212         row = slice_id * sliceheight + threadIdx.x % sliceheight;
213         if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]];
214         __syncthreads();
215         write = segment_scan<BLOCKY>(flag, shared, &t);
216         if (row < nrows && gid < totalentries && write) atomAdd(y[row], t);
217         t = 0.0;
218       } else { /* this iteration covers only one slice */
219         row = start_slice * sliceheight + threadIdx.x % sliceheight;
220         if (row < nrows) t += aval[gid] * x[acolidx[gid]];
221         if (iter == chunksperblock - 1 || (cid + 2) * BLOCKY * WARP_SIZE > sliidx[start_slice + 1]) { /* last iteration or next iteration covers more than one slice */
222           int tid = threadIdx.x + threadIdx.y * WARP_SIZE, tidx = tid % BLOCKY, tidy = tid / BLOCKY;
223   /* reduction and write to output vector */
224   #pragma unroll
225           for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) t += __shfl_down(t, offset);
226           /* transpose layout to reduce each row using warp shfl */
227           if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx.x][threadIdx.y] = t */
228           __syncthreads();
229           if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */
230   #pragma unroll
231           for (int offset = BLOCKY / 2; offset > 0; offset /= 2) t += __shfl_down(t, offset, BLOCKY);
232           if (tidx == 0 && tidy < sliceheight) shared[tidy] = t; /* shared[0][tidy] = t */
233           __syncthreads();
234           if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomAdd(y[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */
235           t = 0.0;
236         }
237       }
238     }
239   }
240 }
241 
242 /* load-balancing version. Chunksize is equal to the number of threads per block */
243 template <int BLOCKY>
matmultadd_seqsell_tiled_kernel8(PetscInt nrows,PetscInt sliceheight,PetscInt chunksperblock,PetscInt totalchunks,const PetscInt * chunk_slice_map,const PetscInt * acolidx,const MatScalar * aval,const PetscInt * sliidx,const PetscScalar * x,const PetscScalar * y,PetscScalar * z)244 __global__ void matmultadd_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunksperblock, PetscInt totalchunks, const PetscInt *chunk_slice_map, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
245 {
246   __shared__ MatScalar shared[BLOCKY * WARP_SIZE];
247   PetscInt             gid, row, start_slice, cid;
248   PetscScalar          t = 0.0;
249   AtomicAdd<MatScalar> atomAdd;
250   /* copy y to z */
251   for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * WARP_SIZE * BLOCKY); iter++) {
252     gid = gridDim.x * WARP_SIZE * BLOCKY * iter + blockIdx.x * BLOCKY * WARP_SIZE + threadIdx.y * WARP_SIZE + threadIdx.x;
253     if (gid < nrows) z[gid] = y[gid];
254   }
255   for (int iter = 0; iter < chunksperblock; iter++) {
256     cid = blockIdx.x * chunksperblock + iter; /* chunk id */
257     if (cid < totalchunks) {
258       start_slice = chunk_slice_map[cid]; /* starting slice at each iteration */
259       gid         = cid * BLOCKY * WARP_SIZE + threadIdx.y * WARP_SIZE + threadIdx.x;
260       if ((cid + 1) * BLOCKY * WARP_SIZE > sliidx[start_slice + 1]) { /* this iteration covers more than one slice */
261         __shared__ PetscInt flag[BLOCKY * 2];
262         bool                write;
263         PetscInt            slice_id = start_slice, totalslices = PetscCeilIntMacro(nrows, sliceheight), totalentries = sliidx[totalslices];
264         /* find out the slice that this element belongs to */
265         while (gid < totalentries && gid >= sliidx[slice_id + 1]) slice_id++;
266         if (threadIdx.x % (WARP_SIZE / 2) == 0) flag[threadIdx.y * 2 + threadIdx.x / (WARP_SIZE / 2)] = slice_id;
267         row = slice_id * sliceheight + threadIdx.x % sliceheight;
268         if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]];
269         __syncthreads();
270         write = segment_scan<BLOCKY>(flag, shared, &t);
271         if (row < nrows && gid < totalentries && write) atomAdd(z[row], t);
272         t = 0.0;
273       } else { /* this iteration covers only one slice */
274         row = start_slice * sliceheight + threadIdx.x % sliceheight;
275         if (row < nrows) t += aval[gid] * x[acolidx[gid]];
276         if (iter == chunksperblock - 1 || (cid + 2) * BLOCKY * WARP_SIZE > sliidx[start_slice + 1]) { /* last iteration or next iteration covers more than one slice */
277           int tid = threadIdx.x + threadIdx.y * WARP_SIZE, tidx = tid % BLOCKY, tidy = tid / BLOCKY;
278   /* reduction and write to output vector */
279   #pragma unroll
280           for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) t += __shfl_down(t, offset);
281           /* transpose layout to reduce each row using warp shfl */
282           if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx.x][threadIdx.y] = t */
283           __syncthreads();
284           if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */
285   #pragma unroll
286           for (int offset = BLOCKY / 2; offset > 0; offset /= 2) t += __shfl_down(t, offset, BLOCKY);
287           if (tidx == 0 && tidy < sliceheight) shared[tidy] = t; /* shared[0][tidy] = t */
288           __syncthreads();
289           if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomAdd(z[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */
290           t = 0.0;
291         }
292       }
293     }
294   }
295 }
296 
297 /* use 1 warp per slice, suitable for small slice width */
matmult_seqsell_tiled_kernel7(PetscInt nrows,PetscInt sliceheight,const PetscInt * acolidx,const MatScalar * aval,const PetscInt * sliidx,const PetscScalar * x,PetscScalar * y)298 static __global__ void matmult_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
299 {
300   PetscInt i, row, slice_id;
301   slice_id = blockIdx.x * blockDim.y + threadIdx.y;
302   row      = slice_id * sliceheight + threadIdx.x % sliceheight;
303   double t = 0.0;
304   if (row < nrows) {
305     for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += WARP_SIZE) t += aval[i] * x[acolidx[i]];
306   }
307   #pragma unroll
308   for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) t += __shfl_down(t, offset);
309   if (row < nrows && threadIdx.x < sliceheight) y[row] = t;
310 }
311 
312 /* use 1 warp per slice, suitable for small slice width */
matmultadd_seqsell_tiled_kernel7(PetscInt nrows,PetscInt sliceheight,const PetscInt * acolidx,const MatScalar * aval,const PetscInt * sliidx,const PetscScalar * x,const PetscScalar * y,PetscScalar * z)313 static __global__ void matmultadd_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
314 {
315   PetscInt i, row, slice_id;
316   slice_id = blockIdx.x * blockDim.y + threadIdx.y;
317   row      = slice_id * sliceheight + threadIdx.x % sliceheight;
318   double t = 0.0;
319   if (row < nrows) {
320     for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += WARP_SIZE) t += aval[i] * x[acolidx[i]];
321   }
322   #pragma unroll
323   for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) t += __shfl_down(t, offset);
324   if (row < nrows && threadIdx.x < sliceheight) z[row] = y[row] + t;
325 }
PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()326 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()
327 #endif
328 
329 /***********  Kernel 2-6 require a slice height smaller than 512, 256, 128, 64, 32, espectively. They are kept only for performance comparison  **********/
330 
331 static __global__ void matmult_seqsell_tiled_kernel6(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
332 {
333   __shared__ MatScalar shared[32 * 16];
334   PetscInt             i, row, slice_id, row_in_slice;
335   /* multiple threads per row. */
336   row = blockIdx.x * blockDim.x + threadIdx.x;
337   if (row < nrows) {
338     slice_id     = row / sliceheight;
339     row_in_slice = row % sliceheight;
340 
341     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
342     for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
343     __syncthreads();
344     if (threadIdx.y < 16) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x];
345     __syncthreads();
346     if (threadIdx.y < 8) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x];
347     __syncthreads();
348     if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x];
349     __syncthreads();
350     if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
351     __syncthreads();
352     if (threadIdx.y < 1) {
353       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
354       y[row] = shared[threadIdx.x];
355     }
356   }
357 }
358 
matmult_seqsell_tiled_kernel5(PetscInt nrows,PetscInt sliceheight,const PetscInt * acolidx,const MatScalar * aval,const PetscInt * sliidx,const PetscScalar * x,PetscScalar * y)359 static __global__ void matmult_seqsell_tiled_kernel5(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
360 {
361   __shared__ MatScalar shared[32 * 16];
362   PetscInt             i, row, slice_id, row_in_slice;
363   /* multiple threads per row. */
364   row = blockIdx.x * blockDim.x + threadIdx.x;
365   if (row < nrows) {
366     slice_id     = row / sliceheight;
367     row_in_slice = row % sliceheight;
368 
369     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
370     for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
371     __syncthreads();
372     if (threadIdx.y < 8) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x];
373     __syncthreads();
374     if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x];
375     __syncthreads();
376     if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
377     __syncthreads();
378     if (threadIdx.y < 1) {
379       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
380       y[row] = shared[threadIdx.x];
381     }
382   }
383 }
384 
matmult_seqsell_tiled_kernel4(PetscInt nrows,PetscInt sliceheight,const PetscInt * acolidx,const MatScalar * aval,const PetscInt * sliidx,const PetscScalar * x,PetscScalar * y)385 static __global__ void matmult_seqsell_tiled_kernel4(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
386 {
387   __shared__ MatScalar shared[32 * 16];
388   PetscInt             i, row, slice_id, row_in_slice;
389   /* multiple threads per row. */
390   row = blockIdx.x * blockDim.x + threadIdx.x;
391   if (row < nrows) {
392     slice_id     = row / sliceheight;
393     row_in_slice = row % sliceheight;
394 
395     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
396     for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
397     __syncthreads();
398     if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x];
399     __syncthreads();
400     if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
401     __syncthreads();
402     if (threadIdx.y < 1) {
403       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
404       y[row] = shared[threadIdx.x];
405     }
406   }
407 }
408 
matmult_seqsell_tiled_kernel3(PetscInt nrows,PetscInt sliceheight,const PetscInt * acolidx,const MatScalar * aval,const PetscInt * sliidx,const PetscScalar * x,PetscScalar * y)409 static __global__ void matmult_seqsell_tiled_kernel3(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
410 {
411   __shared__ MatScalar shared[32 * 16];
412   PetscInt             i, row, slice_id, row_in_slice;
413   /* multiple threads per row. */
414   row = blockIdx.x * blockDim.x + threadIdx.x;
415   if (row < nrows) {
416     slice_id     = row / sliceheight;
417     row_in_slice = row % sliceheight;
418 
419     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
420     for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
421     __syncthreads();
422     if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
423     __syncthreads();
424     if (threadIdx.y < 1) {
425       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
426       y[row] = shared[threadIdx.x];
427     }
428   }
429 }
430 
matmult_seqsell_tiled_kernel2(PetscInt nrows,PetscInt sliceheight,const PetscInt * acolidx,const MatScalar * aval,const PetscInt * sliidx,const PetscScalar * x,PetscScalar * y)431 static __global__ void matmult_seqsell_tiled_kernel2(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
432 {
433   __shared__ MatScalar shared[32 * 16];
434   PetscInt             i, row, slice_id, row_in_slice;
435   /* multiple threads per row. */
436   row = blockIdx.x * blockDim.x + threadIdx.x;
437   if (row < nrows) {
438     slice_id     = row / sliceheight;
439     row_in_slice = row % sliceheight;
440 
441     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
442     for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
443     __syncthreads();
444     if (threadIdx.y < 1) {
445       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
446       y[row] = shared[threadIdx.x];
447     }
448   }
449 }
450 
matmultadd_seqsell_tiled_kernel6(PetscInt nrows,PetscInt sliceheight,const PetscInt * acolidx,const MatScalar * aval,const PetscInt * sliidx,const PetscScalar * x,const PetscScalar * y,PetscScalar * z)451 static __global__ void matmultadd_seqsell_tiled_kernel6(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
452 {
453   __shared__ MatScalar shared[32 * 16];
454   PetscInt             i, row, slice_id, row_in_slice;
455   /* multiple threads per row. */
456   row = blockIdx.x * blockDim.x + threadIdx.x;
457   if (row < nrows) {
458     slice_id     = row / sliceheight;
459     row_in_slice = row % sliceheight;
460 
461     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
462     for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
463     __syncthreads();
464     if (threadIdx.y < 16) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x];
465     __syncthreads();
466     if (threadIdx.y < 8) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x];
467     __syncthreads();
468     if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x];
469     __syncthreads();
470     if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
471     __syncthreads();
472     if (threadIdx.y < 1) {
473       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
474       z[row] = y[row] + shared[threadIdx.x];
475     }
476   }
477 }
478 
matmultadd_seqsell_tiled_kernel5(PetscInt nrows,PetscInt sliceheight,const PetscInt * acolidx,const MatScalar * aval,const PetscInt * sliidx,const PetscScalar * x,const PetscScalar * y,PetscScalar * z)479 static __global__ void matmultadd_seqsell_tiled_kernel5(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
480 {
481   __shared__ MatScalar shared[32 * 16];
482   PetscInt             i, row, slice_id, row_in_slice;
483   /* multiple threads per row. */
484   row = blockIdx.x * blockDim.x + threadIdx.x;
485   if (row < nrows) {
486     slice_id     = row / sliceheight;
487     row_in_slice = row % sliceheight;
488 
489     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
490     for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
491     __syncthreads();
492     if (threadIdx.y < 8) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x];
493     __syncthreads();
494     if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x];
495     __syncthreads();
496     if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
497     __syncthreads();
498     if (threadIdx.y < 1) {
499       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
500       z[row] = y[row] + shared[threadIdx.x];
501     }
502   }
503 }
504 
matmultadd_seqsell_tiled_kernel4(PetscInt nrows,PetscInt sliceheight,const PetscInt * acolidx,const MatScalar * aval,const PetscInt * sliidx,const PetscScalar * x,const PetscScalar * y,PetscScalar * z)505 static __global__ void matmultadd_seqsell_tiled_kernel4(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
506 {
507   __shared__ MatScalar shared[32 * 16];
508   PetscInt             i, row, slice_id, row_in_slice;
509   /* multiple threads per row. */
510   row = blockIdx.x * blockDim.x + threadIdx.x;
511   if (row < nrows) {
512     slice_id     = row / sliceheight;
513     row_in_slice = row % sliceheight;
514 
515     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
516     for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
517     __syncthreads();
518     if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x];
519     __syncthreads();
520     if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
521     __syncthreads();
522     if (threadIdx.y < 1) {
523       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
524       z[row] = y[row] + shared[threadIdx.x];
525     }
526   }
527 }
528 
matmultadd_seqsell_tiled_kernel3(PetscInt nrows,PetscInt sliceheight,const PetscInt * acolidx,const MatScalar * aval,const PetscInt * sliidx,const PetscScalar * x,const PetscScalar * y,PetscScalar * z)529 static __global__ void matmultadd_seqsell_tiled_kernel3(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
530 {
531   __shared__ MatScalar shared[32 * 16];
532   PetscInt             i, row, slice_id, row_in_slice;
533   /* multiple threads per row. */
534   row = blockIdx.x * blockDim.x + threadIdx.x;
535   if (row < nrows) {
536     slice_id     = row / sliceheight;
537     row_in_slice = row % sliceheight;
538 
539     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
540     for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
541     __syncthreads();
542     if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
543     __syncthreads();
544     if (threadIdx.y < 1) {
545       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
546       z[row] = y[row] + shared[threadIdx.x];
547     }
548   }
549 }
550 
matmultadd_seqsell_tiled_kernel2(PetscInt nrows,PetscInt sliceheight,const PetscInt * acolidx,const MatScalar * aval,const PetscInt * sliidx,const PetscScalar * x,const PetscScalar * y,PetscScalar * z)551 static __global__ void matmultadd_seqsell_tiled_kernel2(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
552 {
553   __shared__ MatScalar shared[32 * 16];
554   PetscInt             i, row, slice_id, row_in_slice;
555   /* multiple threads per row. */
556   row = blockIdx.x * blockDim.x + threadIdx.x;
557   if (row < nrows) {
558     slice_id     = row / sliceheight;
559     row_in_slice = row % sliceheight;
560 
561     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
562     for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
563     __syncthreads();
564     if (threadIdx.y < 1) {
565       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
566       z[row] = y[row] + shared[threadIdx.x];
567     }
568   }
569 }
570 
MatMult_SeqSELLHIP(Mat A,Vec xx,Vec yy)571 static PetscErrorCode MatMult_SeqSELLHIP(Mat A, Vec xx, Vec yy)
572 {
573   Mat_SeqSELL       *a         = (Mat_SeqSELL *)A->data;
574   Mat_SeqSELLHIP    *hipstruct = (Mat_SeqSELLHIP *)A->spptr;
575   PetscScalar       *y;
576   const PetscScalar *x;
577   PetscInt           nrows = A->rmap->n, sliceheight = a->sliceheight;
578   MatScalar         *aval;
579   PetscInt          *acolidx;
580   PetscInt          *sliidx;
581   PetscInt           nblocks, blocksize = 512; /* blocksize is fixed to be 512 */
582   dim3               block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32);
583 #if !defined(PETSC_USE_COMPLEX)
584   PetscInt  chunksperblock, nchunks, *chunk_slice_map;
585   PetscReal maxoveravg;
586 #endif
587 
588   PetscFunctionBegin;
589   PetscCheck(WARP_SIZE % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height be a divisor of WARP_SIZE, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight);
590   PetscCheck(!(hipstruct->kernelchoice >= 2 && hipstruct->kernelchoice <= 6 && sliceheight > 32), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices {2-6} requires the slice height of the matrix be less than 32, but the current slice height is %" PetscInt_FMT, sliceheight);
591   PetscCall(MatSeqSELLHIPCopyToGPU(A));
592   /* hipstruct may not be available until MatSeqSELLHIPCopyToGPU() is called */
593   aval    = hipstruct->val;
594   acolidx = hipstruct->colidx;
595   sliidx  = hipstruct->sliidx;
596 
597   PetscCall(VecHIPGetArrayRead(xx, &x));
598   PetscCall(VecHIPGetArrayWrite(yy, &y));
599   PetscCall(PetscLogGpuTimeBegin());
600 
601   switch (hipstruct->kernelchoice) {
602 #if !defined(PETSC_USE_COMPLEX)
603   case 9: /* 1 slice per block */
604     nblocks = 1 + (nrows - 1) / sliceheight;
605     if (hipstruct->blocky == 2) {
606       matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
607     } else if (hipstruct->blocky == 4) {
608       matmult_seqsell_tiled_kernel9<4><<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
609     } else if (hipstruct->blocky == 8) {
610       matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
611     } else if (hipstruct->blocky == 16) {
612       matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
613     } else {
614       matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
615     }
616     break;
617   case 7: /* each block handles blocky slices */
618     nblocks = 1 + (nrows - 1) / (hipstruct->blocky * sliceheight);
619     if (hipstruct->blocky == 2) {
620       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
621     } else if (hipstruct->blocky == 4) {
622       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
623     } else if (hipstruct->blocky == 8) {
624       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
625     } else if (hipstruct->blocky == 16) {
626       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
627     } else {
628       nblocks = 1 + (nrows - 1) / (2 * sliceheight);
629       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
630     }
631     break;
632 #endif
633   case 6:
634     nblocks = 1 + (nrows - 1) / (blocksize / 32); /* 1 slice per block if sliceheight=32 */
635     matmult_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
636     break;
637   case 5:
638     nblocks = 1 + (nrows - 1) / (blocksize / 16); /* 2 slices per block if sliceheight=32*/
639     matmult_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
640     break;
641   case 4:
642     nblocks = 1 + (nrows - 1) / (blocksize / 8); /* 4 slices per block if sliceheight=32 */
643     matmult_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
644     break;
645   case 3:
646     nblocks = 1 + (nrows - 1) / (blocksize / 4); /* 8 slices per block if sliceheight=32 */
647     matmult_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
648     break;
649   case 2: /* 16 slices per block if sliceheight=32 */
650     nblocks = 1 + (nrows - 1) / (blocksize / 2);
651     matmult_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
652     break;
653   case 1: /* 32 slices per block if sliceheight=32 */
654     nblocks = 1 + (nrows - 1) / blocksize;
655     matmult_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
656     break;
657 #if !defined(PETSC_USE_COMPLEX)
658   case 0:
659     maxoveravg = a->maxslicewidth / a->avgslicewidth;
660     if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* important threshold */
661       /* each block handles approximately one slice */
662       PetscInt blocky = a->chunksize / 32;
663       nchunks         = hipstruct->totalchunks;
664       chunksperblock  = hipstruct->chunksperblock ? hipstruct->chunksperblock : 1 + (hipstruct->totalentries / hipstruct->totalslices - 1) / a->chunksize;
665       nblocks         = 1 + (nchunks - 1) / chunksperblock;
666       chunk_slice_map = hipstruct->chunk_slice_map;
667       if (blocky == 2) {
668         matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
669       } else if (blocky == 4) {
670         matmult_seqsell_tiled_kernel8<4><<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
671       } else if (blocky == 8) {
672         matmult_seqsell_tiled_kernel8<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
673       } else if (blocky == 16) {
674         matmult_seqsell_tiled_kernel8<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
675       } else {
676         matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
677       }
678     } else {
679       PetscInt avgslicesize = sliceheight * a->avgslicewidth;
680       if (avgslicesize <= 432) {
681         if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) {
682           nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */
683           matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
684         } else {
685           nblocks = 1 + (nrows - 1) / sliceheight;
686           matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
687         }
688       } else if (avgslicesize <= 2400) {
689         nblocks = 1 + (nrows - 1) / sliceheight;
690         matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
691       } else {
692         nblocks = 1 + (nrows - 1) / sliceheight;
693         matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
694       }
695     }
696     break;
697 #endif
698   default:
699     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported kernel choice %" PetscInt_FMT " for MatMult_SeqSELLHIP.", hipstruct->kernelchoice);
700   }
701   PetscCall(PetscLogGpuTimeEnd());
702   PetscCall(VecHIPRestoreArrayRead(xx, &x));
703   PetscCall(VecHIPRestoreArrayWrite(yy, &y));
704   PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
705   PetscFunctionReturn(PETSC_SUCCESS);
706 }
707 
MatMultAdd_SeqSELLHIP(Mat A,Vec xx,Vec yy,Vec zz)708 static PetscErrorCode MatMultAdd_SeqSELLHIP(Mat A, Vec xx, Vec yy, Vec zz)
709 {
710   Mat_SeqSELL       *a         = (Mat_SeqSELL *)A->data;
711   Mat_SeqSELLHIP    *hipstruct = (Mat_SeqSELLHIP *)A->spptr;
712   PetscScalar       *z;
713   const PetscScalar *y, *x;
714   PetscInt           nrows = A->rmap->n, sliceheight = a->sliceheight;
715   MatScalar         *aval    = hipstruct->val;
716   PetscInt          *acolidx = hipstruct->colidx;
717   PetscInt          *sliidx  = hipstruct->sliidx;
718 #if !defined(PETSC_USE_COMPLEX)
719   PetscReal maxoveravg;
720   PetscInt  chunksperblock, nchunks, *chunk_slice_map;
721   PetscInt  blocky = hipstruct->blocky;
722 #endif
723 
724   PetscFunctionBegin;
725   PetscCheck(WARP_SIZE % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height be a divisor of WARP_SIZE, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight);
726   PetscCheck(!(hipstruct->kernelchoice >= 2 && hipstruct->kernelchoice <= 6 && sliceheight != sliceheight), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices {2-6} requires the slice height of the matrix be 16, but the current slice height is %" PetscInt_FMT, sliceheight);
727   PetscCall(MatSeqSELLHIPCopyToGPU(A));
728   if (a->nz) {
729     PetscInt nblocks, blocksize = 512;
730     dim3     block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32);
731     PetscCall(VecHIPGetArrayRead(xx, &x));
732     PetscCall(VecHIPGetArrayRead(yy, &y));
733     PetscCall(VecHIPGetArrayWrite(zz, &z));
734     PetscCall(PetscLogGpuTimeBegin());
735 
736     switch (hipstruct->kernelchoice) {
737 #if !defined(PETSC_USE_COMPLEX)
738     case 9:
739       nblocks = 1 + (nrows - 1) / sliceheight;
740       if (blocky == 2) {
741         matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
742       } else if (blocky == 4) {
743         matmultadd_seqsell_tiled_kernel9<4><<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
744       } else if (blocky == 8) {
745         matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
746       } else if (blocky == 16) {
747         matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
748       } else {
749         matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
750       }
751       break;
752     case 8:
753       /* each block handles approximately one slice */
754       nchunks         = hipstruct->totalchunks;
755       blocky          = a->chunksize / 32;
756       chunksperblock  = hipstruct->chunksperblock ? hipstruct->chunksperblock : 1 + (hipstruct->totalentries / hipstruct->totalslices - 1) / a->chunksize;
757       nblocks         = 1 + (nchunks - 1) / chunksperblock;
758       chunk_slice_map = hipstruct->chunk_slice_map;
759       if (blocky == 2) {
760         matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
761       } else if (blocky == 4) {
762         matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
763       } else if (blocky == 8) {
764         matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
765       } else if (blocky == 16) {
766         matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
767       } else {
768         matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
769       }
770       break;
771     case 7:
772       nblocks = 1 + (nrows - 1) / (blocky * sliceheight);
773       if (blocky == 2) {
774         matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
775       } else if (blocky == 4) {
776         matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
777       } else if (blocky == 8) {
778         matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
779       } else if (blocky == 16) {
780         matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
781       } else {
782         nblocks = 1 + (nrows - 1) / (2 * sliceheight);
783         matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
784       }
785       break;
786 #endif
787     case 6:
788       nblocks = 1 + (nrows - 1) / (blocksize / 32);
789       matmultadd_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
790       break;
791     case 5:
792       nblocks = 1 + (nrows - 1) / (blocksize / 16);
793       matmultadd_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
794       break;
795     case 4:
796       nblocks = 1 + (nrows - 1) / (blocksize / 8);
797       matmultadd_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
798       break;
799     case 3:
800       nblocks = 1 + (nrows - 1) / (blocksize / 4);
801       matmultadd_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
802       break;
803     case 2:
804       nblocks = 1 + (nrows - 1) / (blocksize / 2);
805       matmultadd_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
806       break;
807     case 1:
808       nblocks = 1 + (nrows - 1) / blocksize;
809       matmultadd_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
810       break;
811 #if !defined(PETSC_USE_COMPLEX)
812     case 0:
813       maxoveravg = a->maxslicewidth / a->avgslicewidth;
814       if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* important threshold */
815         /* each block handles approximately one slice */
816         nchunks         = hipstruct->totalchunks;
817         blocky          = a->chunksize / 32;
818         chunksperblock  = hipstruct->chunksperblock ? hipstruct->chunksperblock : 1 + (hipstruct->totalentries / hipstruct->totalslices - 1) / a->chunksize;
819         nblocks         = 1 + (nchunks - 1) / chunksperblock;
820         chunk_slice_map = hipstruct->chunk_slice_map;
821         if (blocky == 2) {
822           matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
823         } else if (blocky == 4) {
824           matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
825         } else if (blocky == 8) {
826           matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
827         } else if (blocky == 16) {
828           matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
829         } else {
830           matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
831         }
832       } else {
833         PetscInt avgslicesize = sliceheight * a->avgslicewidth;
834         if (avgslicesize <= 432) {
835           if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) {
836             nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */
837             matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
838           } else {
839             nblocks = 1 + (nrows - 1) / sliceheight;
840             matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
841           }
842         } else if (avgslicesize <= 2400) {
843           nblocks = 1 + (nrows - 1) / sliceheight;
844           matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
845         } else {
846           nblocks = 1 + (nrows - 1) / sliceheight;
847           matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
848         }
849       }
850       break;
851 #endif
852     default:
853       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported kernel choice %" PetscInt_FMT " for MatMult_SeqSELLHIP.", hipstruct->kernelchoice);
854     }
855     PetscCall(PetscLogGpuTimeEnd());
856     PetscCall(VecHIPRestoreArrayRead(xx, &x));
857     PetscCall(VecHIPRestoreArrayRead(yy, &y));
858     PetscCall(VecHIPRestoreArrayWrite(zz, &z));
859     PetscCall(PetscLogGpuFlops(2.0 * a->nz));
860   } else {
861     PetscCall(VecCopy(yy, zz));
862   }
863   PetscFunctionReturn(PETSC_SUCCESS);
864 }
865 
MatSetFromOptions_SeqSELLHIP(Mat A,PetscOptionItems PetscOptionsObject)866 static PetscErrorCode MatSetFromOptions_SeqSELLHIP(Mat A, PetscOptionItems PetscOptionsObject)
867 {
868   Mat_SeqSELLHIP *hipstruct = (Mat_SeqSELLHIP *)A->spptr;
869   PetscInt        kernel, blocky;
870   PetscBool       flg;
871 
872   PetscFunctionBegin;
873   PetscOptionsHeadBegin(PetscOptionsObject, "SeqSELLHIP options");
874   PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_hip_blocky", &blocky, &flg));
875   if (flg) {
876     PetscCheck(blocky == 2 || blocky == 4 || blocky == 8 || blocky == 16 || blocky == 32, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Unsupported blocky: %" PetscInt_FMT " it should be in {2,4,8,16,32}", blocky);
877     hipstruct->blocky = blocky;
878   }
879   PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_hip_kernel", &kernel, &flg));
880   if (flg) {
881     PetscCheck(kernel >= 0 && kernel <= 9, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Wrong kernel choice: %" PetscInt_FMT " it should be in [0,9]", kernel);
882     hipstruct->kernelchoice = kernel;
883     if (kernel == 8) PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_hip_chunksperblock", &hipstruct->chunksperblock, &flg));
884   }
885   PetscOptionsHeadEnd();
886   PetscFunctionReturn(PETSC_SUCCESS);
887 }
888 
MatAssemblyEnd_SpMV_Preprocessing_Private(Mat A)889 PETSC_INTERN PetscErrorCode MatAssemblyEnd_SpMV_Preprocessing_Private(Mat A)
890 {
891   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
892 
893   PetscFunctionBegin;
894   PetscCall(MatSeqSELLGetAvgSliceWidth(A, &a->avgslicewidth));
895   PetscCall(MatSeqSELLGetMaxSliceWidth(A, &a->maxslicewidth));
896   PetscCall(MatSeqSELLGetFillRatio(A, &a->fillratio));
897   PetscFunctionReturn(PETSC_SUCCESS);
898 }
899 
MatAssemblyEnd_SeqSELLHIP(Mat A,MatAssemblyType mode)900 static PetscErrorCode MatAssemblyEnd_SeqSELLHIP(Mat A, MatAssemblyType mode)
901 {
902   PetscFunctionBegin;
903   PetscCall(MatAssemblyEnd_SeqSELL(A, mode));
904   PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(A));
905   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
906   if (A->factortype == MAT_FACTOR_NONE) PetscCall(MatSeqSELLHIPCopyToGPU(A));
907   A->ops->mult    = MatMult_SeqSELLHIP;
908   A->ops->multadd = MatMultAdd_SeqSELLHIP;
909   PetscFunctionReturn(PETSC_SUCCESS);
910 }
911 
MatZeroEntries_SeqSELLHIP(Mat A)912 static PetscErrorCode MatZeroEntries_SeqSELLHIP(Mat A)
913 {
914   PetscBool    both = PETSC_FALSE;
915   Mat_SeqSELL *a    = (Mat_SeqSELL *)A->data;
916 
917   PetscFunctionBegin;
918   if (A->factortype == MAT_FACTOR_NONE) {
919     Mat_SeqSELLHIP *hipstruct = (Mat_SeqSELLHIP *)A->spptr;
920     if (hipstruct->val) {
921       both = PETSC_TRUE;
922       PetscCallHIP(hipMemset(hipstruct->val, 0, a->sliidx[a->totalslices] * sizeof(*hipstruct->val)));
923     }
924   }
925   PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices]));
926   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
927   else A->offloadmask = PETSC_OFFLOAD_CPU;
928   PetscFunctionReturn(PETSC_SUCCESS);
929 }
930 
MatDestroy_SeqSELLHIP(Mat A)931 static PetscErrorCode MatDestroy_SeqSELLHIP(Mat A)
932 {
933   PetscFunctionBegin;
934   if (A->factortype == MAT_FACTOR_NONE && A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) PetscCall(MatSeqSELLHIP_Destroy((Mat_SeqSELLHIP **)&A->spptr));
935   PetscCall(MatDestroy_SeqSELL(A));
936   PetscFunctionReturn(PETSC_SUCCESS);
937 }
938 
939 PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLHIP(Mat);
MatDuplicate_SeqSELLHIP(Mat A,MatDuplicateOption cpvalues,Mat * B)940 static PetscErrorCode       MatDuplicate_SeqSELLHIP(Mat A, MatDuplicateOption cpvalues, Mat *B)
941 {
942   PetscFunctionBegin;
943   PetscCall(MatDuplicate_SeqSELL(A, cpvalues, B));
944   PetscCall(MatConvert_SeqSELL_SeqSELLHIP(*B));
945   PetscFunctionReturn(PETSC_SUCCESS);
946 }
947 
MatConvert_SeqSELL_SeqSELLHIP(Mat B)948 PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLHIP(Mat B)
949 {
950   Mat_SeqSELLHIP *hipstruct;
951 
952   PetscFunctionBegin;
953   PetscCall(PetscFree(B->defaultvectype));
954   PetscCall(PetscStrallocpy(VECHIP, &B->defaultvectype));
955 
956   if (!B->spptr) {
957     if (B->factortype == MAT_FACTOR_NONE) {
958       PetscCall(PetscNew(&hipstruct));
959       B->spptr = hipstruct;
960     }
961   }
962 
963   B->ops->assemblyend    = MatAssemblyEnd_SeqSELLHIP;
964   B->ops->destroy        = MatDestroy_SeqSELLHIP;
965   B->ops->setfromoptions = MatSetFromOptions_SeqSELLHIP;
966   B->ops->mult           = MatMult_SeqSELLHIP;
967   B->ops->multadd        = MatMultAdd_SeqSELLHIP;
968   B->ops->duplicate      = MatDuplicate_SeqSELLHIP;
969   B->ops->zeroentries    = MatZeroEntries_SeqSELLHIP;
970 
971   /* No need to assemble SeqSELL, but need to do the preprocessing for SpMV */
972   PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(B));
973 
974   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELLHIP));
975   B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
976   PetscFunctionReturn(PETSC_SUCCESS);
977 }
978 
979 /*MC
980   MATSEQSELLHIP - MATSELLHIP = "(seq)sellhip" - A matrix type to be used for sparse matrices on AMD GPUs.
981 
982   Options Database Keys:
983 +  -mat_type seqsellhip - sets the matrix type to "seqsellhip" during a call to `MatSetFromOptions()`
984 .  -mat_sell_spmv_hip_kernel - selects a spmv kernel for MatSELLHIP
985 -  -mat_sell_spmv_hip_blocky - sets the y dimension of the block size of the spmv kernels. These kernels use a 2D block with the x dimension equal to the wrap size (normally 64 for AMD GPUs)
986 
987   Level: beginner
988 
989 .seealso: [](ch_matrices), `Mat`, `MATSELLHIP`
990 M*/
991 
MatCreate_SeqSELLHIP(Mat B)992 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELLHIP(Mat B)
993 {
994   PetscFunctionBegin;
995   PetscCall(MatCreate_SeqSELL(B));
996   PetscCall(MatConvert_SeqSELL_SeqSELLHIP(B));
997   PetscCall(MatSetFromOptions(B));
998   PetscFunctionReturn(PETSC_SUCCESS);
999 }
1000