#include <cuda_runtime.h>

#include <petscdevice_cuda.h>
#include <petsc/private/cupmatomics.hpp>
#include <../src/mat/impls/sell/seq/sell.h> /*I   "petscmat.h"  I*/

#define SLICE_HEIGHT 16

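/*
  Device-side copy of a sliced ELLPACK (SELL) matrix. Rows are grouped into slices of
  `sliceheight` rows; within a slice the entries are stored column by column, so consecutive
  threads working on consecutive rows read consecutive memory locations. sliidx[s] is the
  offset of slice s in the colidx/val arrays, hence entry k of row r in slice s lives at
  sliidx[s] + k*sliceheight + (r % sliceheight). For example, with sliceheight = 2 the rows
  {r0, r1} of slice 0 are laid out as [r0e0, r1e0, r0e1, r1e1, ...], padded so that both rows
  have the width of the longest row in the slice.
*/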
typedef struct {
  PetscInt   maxallocmat;
  PetscInt   totalentries;
  PetscInt  *colidx; /* column index array, device pointer */
  MatScalar *val;    /* value array, device pointer */
  PetscInt   totalslices;
  PetscInt  *sliidx; /* slice index array, device pointer */
  PetscInt   nonzerostate;
  PetscInt   kernelchoice;
  PetscInt   blocky;
  PetscInt   chunksperblock;
  PetscInt   totalchunks;
  PetscInt  *chunk_slice_map; /* starting slice for each chunk, device pointer */
} Mat_SeqSELLCUDA;

static PetscErrorCode MatSeqSELLCUDA_Destroy(Mat_SeqSELLCUDA **cudastruct)
{
  PetscFunctionBegin;
  if (*cudastruct) {
    if ((*cudastruct)->colidx) { PetscCallCUDA(cudaFree((*cudastruct)->colidx)); }
    if ((*cudastruct)->val) { PetscCallCUDA(cudaFree((*cudastruct)->val)); }
    if ((*cudastruct)->sliidx) { PetscCallCUDA(cudaFree((*cudastruct)->sliidx)); }
    if ((*cudastruct)->chunk_slice_map) { PetscCallCUDA(cudaFree((*cudastruct)->chunk_slice_map)); }
    PetscCall(PetscFree(*cudastruct));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

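/*
  Mirror the host SELL data on the GPU. If only the numerical values changed (same nonzero
  pattern, tracked via nonzerostate), just the val array is copied; otherwise the device
  arrays are reallocated and colidx, val, sliidx, and chunk_slice_map are all transferred.
  On success the offload mask is set to PETSC_OFFLOAD_BOTH, i.e. host and device agree.
*/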
static PetscErrorCode MatSeqSELLCUDACopyToGPU(Mat A)
{
  Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
  Mat_SeqSELL     *a          = (Mat_SeqSELL *)A->data;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    PetscCall(PetscLogEventBegin(MAT_CUDACopyToGPU, A, 0, 0, 0));
    if (A->assembled && A->nonzerostate == cudastruct->nonzerostate) {
      /* copy values only */
      PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), cudaMemcpyHostToDevice));
      PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar))));
    } else {
      if (cudastruct->colidx) { PetscCallCUDA(cudaFree(cudastruct->colidx)); }
      if (cudastruct->val) { PetscCallCUDA(cudaFree(cudastruct->val)); }
      if (cudastruct->sliidx) { PetscCallCUDA(cudaFree(cudastruct->sliidx)); }
      if (cudastruct->chunk_slice_map) { PetscCallCUDA(cudaFree(cudastruct->chunk_slice_map)); }
      cudastruct->maxallocmat  = a->maxallocmat;
      cudastruct->totalentries = a->sliidx[a->totalslices];
      cudastruct->totalslices  = a->totalslices;
      cudastruct->totalchunks  = a->totalchunks;
      PetscCallCUDA(cudaMalloc((void **)&cudastruct->colidx, a->maxallocmat * sizeof(*cudastruct->colidx)));
      PetscCallCUDA(cudaMalloc((void **)&cudastruct->val, a->maxallocmat * sizeof(*cudastruct->val)));
      /* copy only the sliidx[totalslices] entries in use; the arrays are allocated with length maxallocmat */
      PetscCallCUDA(cudaMemcpy(cudastruct->colidx, a->colidx, a->sliidx[a->totalslices] * sizeof(*a->colidx), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(*a->val), cudaMemcpyHostToDevice));

      PetscCallCUDA(cudaMalloc((void **)&cudastruct->sliidx, (a->totalslices + 1) * sizeof(*cudastruct->sliidx)));
      PetscCallCUDA(cudaMemcpy(cudastruct->sliidx, a->sliidx, (a->totalslices + 1) * sizeof(*a->sliidx), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMalloc((void **)&cudastruct->chunk_slice_map, a->totalchunks * sizeof(*cudastruct->chunk_slice_map)));
      PetscCallCUDA(cudaMemcpy(cudastruct->chunk_slice_map, a->chunk_slice_map, a->totalchunks * sizeof(*a->chunk_slice_map), cudaMemcpyHostToDevice));
      PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar) + sizeof(PetscInt)) + (a->totalslices + 1 + a->totalchunks) * sizeof(PetscInt)));
    }
    PetscCallCUDA(WaitForCUDA());
    PetscCall(PetscLogEventEnd(MAT_CUDACopyToGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

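/*
  Basic kernel: one thread per row. With sliceheight = 16 and row 37, for instance,
  slice_id = 2 and row_in_slice = 5, so the thread walks entries sliidx[2] + 5,
  sliidx[2] + 5 + 16, ... up to sliidx[3]. Adjacent threads in a warp handle adjacent
  rows of a slice and therefore make coalesced memory accesses.
*/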
static __global__ void matmult_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  PetscInt  i, row, slice_id, row_in_slice;
  MatScalar sum;
  /* one thread per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / sliceheight;
    row_in_slice = row % sliceheight;
    sum          = 0.0;
    for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]];
    y[row] = sum;
  }
}

static __global__ void matmultadd_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  PetscInt  i, row, slice_id, row_in_slice;
  MatScalar sum;
  /* one thread per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / sliceheight;
    row_in_slice = row % sliceheight;
    sum          = 0.0;
    for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]];
    z[row] = y[row] + sum;
  }
}

#if !defined(PETSC_USE_COMPLEX)
/* use 1 block per slice, suitable for large slice width */
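/*
  Each block is dim3(32, BLOCKY). A thread first accumulates a partial sum t over a strided
  range of the slice, then the partials are reduced in two stages: a warp shuffle folds the
  32-wide x dimension down to sliceheight partials per warp, the partials are transposed
  through shared memory so that each row's BLOCKY partials become contiguous, and a second
  shuffle of width BLOCKY finishes the reduction. t is a real scalar here; the surrounding
  !PETSC_USE_COMPLEX guard keeps these shuffle-based kernels out of complex builds.
*/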
template <int BLOCKY>
__global__ void matmult_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[32][BLOCKY];
  PetscInt             i, row, slice_id = blockIdx.x;
  int                  tid = threadIdx.x + threadIdx.y * 32;
  /* transposed index */
  int         tidx = tid % BLOCKY;
  int         tidy = tid / BLOCKY;
  PetscScalar t    = 0.0;

  row = slice_id * sliceheight + threadIdx.x % sliceheight;
  if (row < nrows) {
    for (i = sliidx[slice_id] + threadIdx.x + 32 * threadIdx.y; i < sliidx[slice_id + 1]; i += 32 * BLOCKY) t += aval[i] * x[acolidx[i]];
  }
  #pragma unroll
  for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
  /* transpose layout to reduce each row using warp shfl */
  if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t;
  __syncthreads();
  if (tidy < sliceheight) t = shared[tidy][tidx];
  #pragma unroll
  for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); }
  if (tidx == 0 && tidy < sliceheight) { shared[0][tidy] = t; }
  __syncthreads();
  if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) { y[row] = shared[0][threadIdx.x]; }
}

/* use 1 block per slice, suitable for large slice width */
template <int BLOCKY>
__global__ void matmultadd_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[32][BLOCKY];
  PetscInt             i, row, slice_id = blockIdx.x;
  int                  tid = threadIdx.x + threadIdx.y * 32;
  /* transposed index */
  int         tidx = tid % BLOCKY;
  int         tidy = tid / BLOCKY;
  PetscScalar t    = 0.0;

  row = slice_id * sliceheight + threadIdx.x % sliceheight;
  if (row < nrows) {
    for (i = sliidx[slice_id] + threadIdx.x + 32 * threadIdx.y; i < sliidx[slice_id + 1]; i += 32 * BLOCKY) t += aval[i] * x[acolidx[i]];
  }
  #pragma unroll
  for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
  /* transpose layout to reduce each row using warp shfl */
  if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t;
  __syncthreads();
  if (tidy < sliceheight) t = shared[tidy][tidx];
  #pragma unroll
  for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); }
  if (tidx == 0 && tidy < sliceheight) { shared[0][tidy] = t; }
  __syncthreads();
  if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) { z[row] = y[row] + shared[0][threadIdx.x]; }
}

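/*
  Segmented scan over the half-warps of a block. flag[] holds, per 16-thread half-warp, the
  slice id its entries belong to; partial sums are shifted through shared memory and added
  only between half-warps that work on the same slice. The return value marks the first
  half-warp of each segment, i.e. the threads that must write the combined result.
*/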
template <int BLOCKY>
__device__ __forceinline__ static bool segment_scan(PetscInt flag[], MatScalar shared[], PetscScalar *val)
{
  bool head = true;
  #pragma unroll
  for (int i = 1; i < BLOCKY * 2; i <<= 1) {
    int halfwarpid                         = threadIdx.y * 2 + threadIdx.x / 16;
    shared[threadIdx.x + threadIdx.y * 32] = 0;
    if (halfwarpid >= i && flag[halfwarpid - i] == flag[halfwarpid]) {
      shared[threadIdx.x + threadIdx.y * 32] = *val;
      if (i == 1) head = false;
    }
    __syncthreads();
    if (halfwarpid < BLOCKY * 2 - i) *val += shared[threadIdx.x + threadIdx.y * 32 + i * 16];
    __syncthreads();
  }
  return head;
}

/* load-balancing version. Chunksize is equal to the number of threads per block */
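/*
  The nonzeros are split into fixed-size chunks of BLOCKY*32 entries, processed one chunk per
  block iteration, so work stays balanced even when slice widths vary wildly.
  chunk_slice_map[cid] records the slice in which chunk cid starts. If a chunk stays inside
  one slice, a regular two-stage shuffle reduction is used; if it straddles slices, each
  thread locates its slice by scanning sliidx and a segmented scan combines the partial sums
  per slice, with the results accumulated into the output vector by atomic adds.
*/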
template <int BLOCKY>
__global__ void matmult_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunksperblock, PetscInt totalchunks, const PetscInt *chunk_slice_map, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[BLOCKY * 32];
  PetscInt             gid, row, start_slice, cid;
  PetscScalar          t = 0.0;
  AtomicAdd<MatScalar> atomAdd;
  /* zero out y */
  for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * 32 * BLOCKY); iter++) {
    gid = gridDim.x * 32 * BLOCKY * iter + blockIdx.x * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
    if (gid < nrows) y[gid] = 0.0;
  }
  for (int iter = 0; iter < chunksperblock; iter++) {
    cid = blockIdx.x * chunksperblock + iter; /* chunk id */
    if (cid < totalchunks) {
      start_slice = chunk_slice_map[cid]; /* starting slice at each iteration */
      gid         = cid * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
      if ((cid + 1) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* this iteration covers more than one slice */
        __shared__ PetscInt flag[BLOCKY * 2];
        bool                write;
        PetscInt            slice_id = start_slice, totalslices = PetscCeilInt(nrows, sliceheight), totalentries = sliidx[totalslices];
        /* find out the slice that this element belongs to */
        while (gid < totalentries && gid >= sliidx[slice_id + 1]) slice_id++;
        if (threadIdx.x % 16 == 0) flag[threadIdx.y * 2 + threadIdx.x / 16] = slice_id;
        row = slice_id * sliceheight + threadIdx.x % sliceheight;
        if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]];
        __syncthreads();
        write = segment_scan<BLOCKY>(flag, shared, &t);
        if (row < nrows && gid < totalentries && write) atomAdd(y[row], t);
        t = 0.0;
      } else { /* this iteration covers only one slice */
        row = start_slice * sliceheight + threadIdx.x % sliceheight;
        if (row < nrows) t += aval[gid] * x[acolidx[gid]];
        if (iter == chunksperblock - 1 || (cid + 2) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* last iteration or next iteration covers more than one slice */
          int tid = threadIdx.x + threadIdx.y * 32, tidx = tid % BLOCKY, tidy = tid / BLOCKY;
  /* reduction and write to output vector */
  #pragma unroll
          for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
          /* transpose layout to reduce each row using warp shfl */
          if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx.x][threadIdx.y] = t */
          __syncthreads();
          if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */
  #pragma unroll
          for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); }
          if (tidx == 0 && tidy < sliceheight) { shared[tidy] = t; /* shared[0][tidy] = t */ }
          __syncthreads();
          if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomAdd(y[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */
          t = 0.0;
        }
      }
    }
  }
}

/* load-balancing version. Chunksize is equal to the number of threads per block */
template <int BLOCKY>
__global__ void matmultadd_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunksperblock, PetscInt totalchunks, const PetscInt *chunk_slice_map, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[BLOCKY * 32];
  PetscInt             gid, row, start_slice, cid;
  PetscScalar          t = 0.0;
  AtomicAdd<MatScalar> atomAdd;
  /* copy y to z */
  for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * 32 * BLOCKY); iter++) {
    gid = gridDim.x * 32 * BLOCKY * iter + blockIdx.x * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
    if (gid < nrows) z[gid] = y[gid];
  }
  for (int iter = 0; iter < chunksperblock; iter++) {
    cid = blockIdx.x * chunksperblock + iter; /* chunk id */
    if (cid < totalchunks) {
      start_slice = chunk_slice_map[cid]; /* starting slice at each iteration */
      gid         = cid * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
      if ((cid + 1) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* this iteration covers more than one slice */
        __shared__ PetscInt flag[BLOCKY * 2];
        bool                write;
        PetscInt            slice_id = start_slice, totalslices = PetscCeilInt(nrows, sliceheight), totalentries = sliidx[totalslices];
        /* find out the slice that this element belongs to */
        while (gid < totalentries && gid >= sliidx[slice_id + 1]) slice_id++;
        if (threadIdx.x % 16 == 0) flag[threadIdx.y * 2 + threadIdx.x / 16] = slice_id;
        row = slice_id * sliceheight + threadIdx.x % sliceheight;
        if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]];
        __syncthreads();
        write = segment_scan<BLOCKY>(flag, shared, &t);
        if (row < nrows && gid < totalentries && write) atomAdd(z[row], t);
        t = 0.0;
      } else { /* this iteration covers only one slice */
        row = start_slice * sliceheight + threadIdx.x % sliceheight;
        if (row < nrows) t += aval[gid] * x[acolidx[gid]];
        if (iter == chunksperblock - 1 || (cid + 2) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* last iteration or next iteration covers more than one slice */
          int tid = threadIdx.x + threadIdx.y * 32, tidx = tid % BLOCKY, tidy = tid / BLOCKY;
  /* reduction and write to output vector */
  #pragma unroll
          for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
          /* transpose layout to reduce each row using warp shfl */
          if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx.x][threadIdx.y] = t */
          __syncthreads();
          if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */
  #pragma unroll
          for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); }
          if (tidx == 0 && tidy < sliceheight) { shared[tidy] = t; /* shared[0][tidy] = t */ }
          __syncthreads();
          if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomAdd(z[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */
          t = 0.0;
        }
      }
    }
  }
}

/* use 1 warp per slice, suitable for small slice width */
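/*
  One 32-thread warp handles one slice (blockDim.y warps per block). Each lane accumulates
  every 32nd entry of the slice, then a shuffle reduction with shrinking offsets
  (16, 8, ..., sliceheight) folds the 32 partial sums down to one sum per row, which ends up
  in lanes 0..sliceheight-1.
*/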
static __global__ void matmult_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  PetscInt i, row, slice_id;
  slice_id      = blockIdx.x * blockDim.y + threadIdx.y;
  row           = slice_id * sliceheight + threadIdx.x % sliceheight;
  PetscScalar t = 0.0;
  if (row < nrows) {
    for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += 32) t += aval[i] * x[acolidx[i]];
  }
  #pragma unroll
  for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
  if (row < nrows && threadIdx.x < sliceheight) { y[row] = t; }
}

/* use 1 warp per slice, suitable for small slice width */
static __global__ void matmultadd_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  PetscInt i, row, slice_id;
  slice_id      = blockIdx.x * blockDim.y + threadIdx.y;
  row           = slice_id * sliceheight + threadIdx.x % sliceheight;
  PetscScalar t = 0.0;
  if (row < nrows) {
    for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += 32) t += aval[i] * x[acolidx[i]];
  }
  #pragma unroll
  for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
  if (row < nrows && threadIdx.x < sliceheight) { z[row] = y[row] + t; }
}
#endif

/***********  Kernels 2-6 are tied to slice height 16. They are kept only for performance comparison  **********/

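/*
  Kernels 2-6 assign blockDim.y threads to each row of a 16-row slice and reduce the
  per-thread partial sums with a shared-memory binary tree: at each step the upper half of
  the y dimension is folded onto the lower half. Kernel 6 starts at blockDim.y = 32 (five
  folding steps), kernel 5 at 16, and so on down to kernel 2 with blockDim.y = 2 and a
  single step.
*/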
static __global__ void matmult_seqsell_tiled_kernel6(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 16) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}

static __global__ void matmult_seqsell_tiled_kernel5(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}

static __global__ void matmult_seqsell_tiled_kernel4(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}

static __global__ void matmult_seqsell_tiled_kernel3(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}

static __global__ void matmult_seqsell_tiled_kernel2(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}

static __global__ void matmultadd_seqsell_tiled_kernel6(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 16) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}

static __global__ void matmultadd_seqsell_tiled_kernel5(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}

static __global__ void matmultadd_seqsell_tiled_kernel4(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}

static __global__ void matmultadd_seqsell_tiled_kernel3(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}

static __global__ void matmultadd_seqsell_tiled_kernel2(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}

static PetscErrorCode MatMult_SeqSELLCUDA(Mat A, Vec xx, Vec yy)
{
  Mat_SeqSELL       *a          = (Mat_SeqSELL *)A->data;
  Mat_SeqSELLCUDA   *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
  PetscScalar       *y;
  const PetscScalar *x;
  PetscInt           nrows = A->rmap->n, sliceheight = a->sliceheight;
  MatScalar         *aval;
  PetscInt          *acolidx;
  PetscInt          *sliidx;
  PetscInt           nblocks, blocksize = 512; /* blocksize must be a multiple of SLICE_HEIGHT*32 */
  dim3               block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32);
#if !defined(PETSC_USE_COMPLEX)
  PetscInt  chunksperblock, nchunks, *chunk_slice_map;
  PetscReal maxoveravg;
#endif

  PetscFunctionBegin;
  PetscCheck(32 % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernels require the slice height to be a divisor of 32, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight);
  PetscCheck(!(cudastruct->kernelchoice >= 2 && cudastruct->kernelchoice <= 6 && sliceheight != SLICE_HEIGHT), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices {2-6} require the slice height of the matrix to be 16, but the current slice height is %" PetscInt_FMT, sliceheight);
  PetscCall(MatSeqSELLCUDACopyToGPU(A));
  /* the device arrays may be (re)allocated by MatSeqSELLCUDACopyToGPU(), so read the pointers only after it has been called */
  aval    = cudastruct->val;
  acolidx = cudastruct->colidx;
  sliidx  = cudastruct->sliidx;

  PetscCall(VecCUDAGetArrayRead(xx, &x));
  PetscCall(VecCUDAGetArrayWrite(yy, &y));
  PetscCall(PetscLogGpuTimeBegin());

  switch (cudastruct->kernelchoice) {
#if !defined(PETSC_USE_COMPLEX)
  case 9:
    nblocks = 1 + (nrows - 1) / sliceheight;
    if (cudastruct->blocky == 2) {
      matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (cudastruct->blocky == 4) {
      matmult_seqsell_tiled_kernel9<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (cudastruct->blocky == 8) {
      matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (cudastruct->blocky == 16) {
      matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (cudastruct->blocky == 32) {
      matmult_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else {
      matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    }
    break;
  case 7:
    nblocks = 1 + (nrows - 1) / (2 * sliceheight);
    if (cudastruct->blocky == 2) {
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (cudastruct->blocky == 4) {
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (cudastruct->blocky == 8) {
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (cudastruct->blocky == 16) {
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (cudastruct->blocky == 32) {
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else {
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    }
    break;
#endif
  case 6:
    nblocks = 1 + (nrows - 1) / (blocksize / 32); /* 1 slice per block if blocksize=512 */
    matmult_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, acolidx, aval, sliidx, x, y);
    break;
  case 5:
    nblocks = 1 + (nrows - 1) / (blocksize / 16); /* 2 slices per block if blocksize=512 */
    matmult_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, acolidx, aval, sliidx, x, y);
    break;
  case 4:
    nblocks = 1 + (nrows - 1) / (blocksize / 8); /* 4 slices per block if blocksize=512 */
    matmult_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, acolidx, aval, sliidx, x, y);
    break;
  case 3:
    nblocks = 1 + (nrows - 1) / (blocksize / 4); /* 8 slices per block if blocksize=512 */
    matmult_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, acolidx, aval, sliidx, x, y);
    break;
  case 2: /* 16 slices per block if blocksize=512 */
    nblocks = 1 + (nrows - 1) / (blocksize / 2);
    matmult_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, acolidx, aval, sliidx, x, y);
    break;
  case 1: /* 32 slices per block if blocksize=512 */
    nblocks = 1 + (nrows - 1) / blocksize;
    matmult_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    break;
#if !defined(PETSC_USE_COMPLEX)
  case 0:
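    /*
      Automatic selection: if the widest slice is much wider than the average (max/avg > 12
      and significant relative to nrows), the matrix is highly irregular and the
      load-balanced chunk kernel 8 is used; otherwise choose between the warp-per-slice
      kernel 7 and the block-per-slice kernel 9 (with growing BLOCKY) based on the average
      number of entries per slice.
    */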
    maxoveravg = a->maxslicewidth / a->avgslicewidth;
    if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* important threshold */
      /* each block handles approximately one slice */
      PetscInt blocky = a->chunksize / 32;
      nchunks         = cudastruct->totalchunks;
      chunksperblock  = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize;
      nblocks         = 1 + (nchunks - 1) / chunksperblock;
      chunk_slice_map = cudastruct->chunk_slice_map;
      if (blocky == 2) {
        matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
      } else if (blocky == 4) {
        matmult_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
      } else if (blocky == 8) {
        matmult_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
      } else if (blocky == 16) {
        matmult_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
      } else if (blocky == 32) {
        matmult_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
      } else {
        matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
      }
    } else {
      PetscInt avgslicesize = sliceheight * a->avgslicewidth;
      if (avgslicesize <= 432) {
        if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) {
          nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */
          matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
        } else {
          nblocks = 1 + (nrows - 1) / sliceheight;
          matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
        }
      } else if (avgslicesize <= 2400) {
        nblocks = 1 + (nrows - 1) / sliceheight;
        matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
      } else {
        nblocks = 1 + (nrows - 1) / sliceheight;
        matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
      }
    }
    break;
#endif
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Unsupported kernel choice %" PetscInt_FMT " for MatMult_SeqSELLCUDA.", cudastruct->kernelchoice);
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(VecCUDARestoreArrayRead(xx, &x));
  PetscCall(VecCUDARestoreArrayWrite(yy, &y));
  PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatMultAdd_SeqSELLCUDA(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_SeqSELL       *a          = (Mat_SeqSELL *)A->data;
  Mat_SeqSELLCUDA   *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
  PetscScalar       *z;
  const PetscScalar *y, *x;
  PetscInt           nrows = A->rmap->n, sliceheight = a->sliceheight;
  MatScalar         *aval;
  PetscInt          *acolidx;
  PetscInt          *sliidx;
#if !defined(PETSC_USE_COMPLEX)
  PetscReal maxoveravg;
  PetscInt  chunksperblock, nchunks, *chunk_slice_map;
  PetscInt  blocky = cudastruct->blocky;
#endif

  PetscFunctionBegin;
  PetscCheck(32 % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernels require the slice height to be a divisor of 32, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight);
  PetscCheck(!(cudastruct->kernelchoice >= 2 && cudastruct->kernelchoice <= 6 && sliceheight != SLICE_HEIGHT), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices {2-6} require the slice height of the matrix to be 16, but the current slice height is %" PetscInt_FMT, sliceheight);
  PetscCall(MatSeqSELLCUDACopyToGPU(A));
  /* the device arrays may be (re)allocated by MatSeqSELLCUDACopyToGPU(), so read the pointers only after it has been called */
  aval    = cudastruct->val;
  acolidx = cudastruct->colidx;
  sliidx  = cudastruct->sliidx;
  if (a->nz) {
    PetscInt nblocks, blocksize = 512;
    dim3     block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32);
    PetscCall(VecCUDAGetArrayRead(xx, &x));
    PetscCall(VecCUDAGetArrayRead(yy, &y));
    PetscCall(VecCUDAGetArrayWrite(zz, &z));
    PetscCall(PetscLogGpuTimeBegin());

    switch (cudastruct->kernelchoice) {
#if !defined(PETSC_USE_COMPLEX)
    case 9:
      nblocks = 1 + (nrows - 1) / sliceheight;
      if (blocky == 2) {
        matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 4) {
        matmultadd_seqsell_tiled_kernel9<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 8) {
        matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 16) {
        matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 32) {
        matmultadd_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else {
        matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      }
      break;
    case 8:
      /* each block handles approximately one slice */
      nchunks         = cudastruct->totalchunks;
      blocky          = a->chunksize / 32;
      chunksperblock  = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize;
      nblocks         = 1 + (nchunks - 1) / chunksperblock;
      chunk_slice_map = cudastruct->chunk_slice_map;
      if (blocky == 2) {
        matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 4) {
        matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 8) {
        matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 16) {
        matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 32) {
        matmultadd_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
      } else {
        matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
      }
      break;
    case 7:
      nblocks = 1 + (nrows - 1) / (2 * sliceheight);
      if (blocky == 2) {
        matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 4) {
        matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 8) {
        matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 16) {
        matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 32) {
        matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else {
        matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      }
      break;
#endif
    case 6:
      nblocks = 1 + (nrows - 1) / (blocksize / 32);
      matmultadd_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, acolidx, aval, sliidx, x, y, z);
      break;
    case 5:
      nblocks = 1 + (nrows - 1) / (blocksize / 16);
      matmultadd_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, acolidx, aval, sliidx, x, y, z);
      break;
    case 4:
      nblocks = 1 + (nrows - 1) / (blocksize / 8);
      matmultadd_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, acolidx, aval, sliidx, x, y, z);
      break;
    case 3:
      nblocks = 1 + (nrows - 1) / (blocksize / 4);
      matmultadd_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, acolidx, aval, sliidx, x, y, z);
      break;
    case 2:
      nblocks = 1 + (nrows - 1) / (blocksize / 2);
      matmultadd_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, acolidx, aval, sliidx, x, y, z);
      break;
    case 1:
      nblocks = 1 + (nrows - 1) / blocksize;
      matmultadd_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      break;
#if !defined(PETSC_USE_COMPLEX)
    case 0:
      maxoveravg = a->maxslicewidth / a->avgslicewidth;
      if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* important threshold */
        /* each block handles approximately one slice */
        nchunks         = cudastruct->totalchunks;
        blocky          = a->chunksize / 32;
        chunksperblock  = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize;
        nblocks         = 1 + (nchunks - 1) / chunksperblock;
        chunk_slice_map = cudastruct->chunk_slice_map;
        if (blocky == 2) {
          matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
        } else if (blocky == 4) {
          matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
        } else if (blocky == 8) {
          matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
        } else if (blocky == 16) {
          matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
        } else if (blocky == 32) {
          matmultadd_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
        } else {
          matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
        }
      } else {
        PetscInt avgslicesize = sliceheight * a->avgslicewidth;
        if (avgslicesize <= 432) {
          if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) {
            nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */
            matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
          } else {
            nblocks = 1 + (nrows - 1) / sliceheight;
            matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
          }
        } else if (avgslicesize <= 2400) {
          nblocks = 1 + (nrows - 1) / sliceheight;
          matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
        } else {
          nblocks = 1 + (nrows - 1) / sliceheight;
          matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
        }
      }
      break;
#endif
    default:
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Unsupported kernel choice %" PetscInt_FMT " for MatMultAdd_SeqSELLCUDA.", cudastruct->kernelchoice);
    }
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(VecCUDARestoreArrayRead(xx, &x));
    PetscCall(VecCUDARestoreArrayRead(yy, &y));
    PetscCall(VecCUDARestoreArrayWrite(zz, &z));
    PetscCall(PetscLogGpuFlops(2.0 * a->nz));
  } else {
    PetscCall(VecCopy(yy, zz));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSetFromOptions_SeqSELLCUDA(Mat A, PetscOptionItems *PetscOptionsObject)
{
  Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
  PetscInt         kernel, blocky;
  PetscBool        flg;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqSELLCUDA options");
  PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_blocky", &blocky, &flg));
  if (flg) {
    PetscCheck(blocky == 2 || blocky == 4 || blocky == 8 || blocky == 16 || blocky == 32, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Unsupported blocky: %" PetscInt_FMT ", it should be in {2,4,8,16,32}", blocky);
    cudastruct->blocky = blocky;
  }
  PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_kernel", &kernel, &flg));
  if (flg) {
    PetscCheck(kernel >= 0 && kernel <= 9, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Wrong kernel choice: %" PetscInt_FMT ", it should be in [0,9]", kernel);
    cudastruct->kernelchoice = kernel;
    if (kernel == 8) { PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_chunksperblock", &cudastruct->chunksperblock, &flg)); }
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatAssemblyEnd_SpMV_Preprocessing_Private(Mat A)
{
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;

  PetscFunctionBegin;
  PetscCall(MatSeqSELLGetAvgSliceWidth(A, &a->avgslicewidth));
  PetscCall(MatSeqSELLGetMaxSliceWidth(A, &a->maxslicewidth));
  PetscCall(MatSeqSELLGetFillRatio(A, &a->fillratio));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatAssemblyEnd_SeqSELLCUDA(Mat A, MatAssemblyType mode)
{
  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqSELL(A, mode));
  PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(A));
  if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->factortype == MAT_FACTOR_NONE) { PetscCall(MatSeqSELLCUDACopyToGPU(A)); }
  A->ops->mult    = MatMult_SeqSELLCUDA;
  A->ops->multadd = MatMultAdd_SeqSELLCUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatZeroEntries_SeqSELLCUDA(Mat A)
{
  PetscBool    both = PETSC_FALSE;
  Mat_SeqSELL *a    = (Mat_SeqSELL *)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
    if (cudastruct->val) {
      both = PETSC_TRUE;
      PetscCallCUDA(cudaMemset(cudastruct->val, 0, a->sliidx[a->totalslices] * sizeof(*cudastruct->val)));
    }
  }
  PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices]));
  PetscCall(MatSeqSELLInvalidateDiagonal(A));
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatDestroy_SeqSELLCUDA(Mat A)
{
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE && A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) PetscCall(MatSeqSELLCUDA_Destroy((Mat_SeqSELLCUDA **)&A->spptr));
  PetscCall(MatDestroy_SeqSELL(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat);
static PetscErrorCode       MatDuplicate_SeqSELLCUDA(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqSELL(A, cpvalues, B));
  PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(*B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat B)
{
  Mat_SeqSELLCUDA *cudastruct;

  PetscFunctionBegin;
  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

  if (!B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      PetscCall(PetscNew(&cudastruct));
      B->spptr = cudastruct;
    }
  }

  B->ops->assemblyend    = MatAssemblyEnd_SeqSELLCUDA;
  B->ops->destroy        = MatDestroy_SeqSELLCUDA;
  B->ops->setfromoptions = MatSetFromOptions_SeqSELLCUDA;
  B->ops->mult           = MatMult_SeqSELLCUDA;
  B->ops->multadd        = MatMultAdd_SeqSELLCUDA;
  B->ops->duplicate      = MatDuplicate_SeqSELLCUDA;
  B->ops->zeroentries    = MatZeroEntries_SeqSELLCUDA;

  /* No need to assemble SeqSELL, but need to do the preprocessing for SpMV */
  PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(B));

  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELLCUDA));
  B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*MC
  MATSEQSELLCUDA - MATSELLCUDA = "(seq)sellcuda" - A matrix type to be used for sparse matrices on NVIDIA GPUs.

  Options Database Keys:
+  -mat_type seqsellcuda - sets the matrix type to "seqsellcuda" during a call to `MatSetFromOptions()`
.  -mat_sell_spmv_cuda_kernel - selects an SpMV kernel for `MATSELLCUDA`
-  -mat_sell_spmv_cuda_blocky - sets the y dimension of the block size of the SpMV kernels. These kernels use a 2D block with the x dimension being 32

  Level: beginner

.seealso: [](ch_matrices), `Mat`, `MATSELLCUDA`
M*/

PETSC_EXTERN PetscErrorCode MatCreate_SeqSELLCUDA(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqSELL(B));
  PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(B));
  PetscCall(MatSetFromOptions(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}