/* xref: /petsc/src/mat/impls/sell/seq/seqcuda/sellcuda.cu (revision e8c0849ab8fe171bed529bea27238c9b402db591) */
#include <petscdevice_cuda.h>
#include <petsc/private/cupmatomics.hpp>
#include <../src/mat/impls/sell/seq/sell.h> /*I   "petscmat.h"  I*/

#define SLICE_HEIGHT 16

typedef struct {
  PetscInt   maxallocmat;
  PetscInt   totalentries;
  PetscInt  *colidx; /* column index array, device pointer */
  MatScalar *val;    /* value array, device pointer */
  PetscInt   totalslices;
  PetscInt  *sliidx; /* slice index array, device pointer */
  PetscInt   nonzerostate;
  PetscInt   kernelchoice;
  PetscInt   blocky;
  PetscInt   chunksperblock;
  PetscInt   totalchunks;
  PetscInt  *chunk_slice_map; /* starting slice for each chunk, device pointer */
} Mat_SeqSELLCUDA;

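/*
  Device mirror of the sliced ELLPACK (SELL) storage in Mat_SeqSELL: rows are
  grouped into slices of a->sliceheight rows, each slice is padded to its
  widest row and stored column-major, and sliidx[s] gives the offset of slice
  s in val/colidx. A minimal sketch with sliceheight = 2 and rows {10 20 0; 0 30 0}:
    val = {10, 30, 20, P}, colidx = {0, 1, 1, C}, sliidx = {0, 4},
  where P is a padding zero and C is assumed to be some valid column index so
  that x[acolidx[i]] stays in bounds in the kernels below.
*/
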
static PetscErrorCode MatSeqSELLCUDA_Destroy(Mat_SeqSELLCUDA **cudastruct)
{
  PetscFunctionBegin;
  if (*cudastruct) {
    if ((*cudastruct)->colidx) PetscCallCUDA(cudaFree((*cudastruct)->colidx));
    if ((*cudastruct)->val) PetscCallCUDA(cudaFree((*cudastruct)->val));
    if ((*cudastruct)->sliidx) PetscCallCUDA(cudaFree((*cudastruct)->sliidx));
    if ((*cudastruct)->chunk_slice_map) PetscCallCUDA(cudaFree((*cudastruct)->chunk_slice_map));
    PetscCall(PetscFree(*cudastruct));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqSELLCUDACopyToGPU(Mat A)
{
  Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
  Mat_SeqSELL     *a          = (Mat_SeqSELL *)A->data;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    PetscCall(PetscLogEventBegin(MAT_CUDACopyToGPU, A, 0, 0, 0));
    if (A->assembled && A->nonzerostate == cudastruct->nonzerostate) {
      /* copy values only */
      PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), cudaMemcpyHostToDevice));
      PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar))));
    } else {
      if (cudastruct->colidx) PetscCallCUDA(cudaFree(cudastruct->colidx));
      if (cudastruct->val) PetscCallCUDA(cudaFree(cudastruct->val));
      if (cudastruct->sliidx) PetscCallCUDA(cudaFree(cudastruct->sliidx));
      if (cudastruct->chunk_slice_map) PetscCallCUDA(cudaFree(cudastruct->chunk_slice_map));
      cudastruct->maxallocmat  = a->maxallocmat;
      cudastruct->totalentries = a->sliidx[a->totalslices];
      cudastruct->totalslices  = a->totalslices;
      cudastruct->totalchunks  = a->totalchunks;
      PetscCallCUDA(cudaMalloc((void **)&cudastruct->colidx, a->maxallocmat * sizeof(*cudastruct->colidx)));
      PetscCallCUDA(cudaMalloc((void **)&cudastruct->val, a->maxallocmat * sizeof(*cudastruct->val)));
      /* copy values, nz or maxallocmat? */
      PetscCallCUDA(cudaMemcpy(cudastruct->colidx, a->colidx, a->sliidx[a->totalslices] * sizeof(*a->colidx), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(*a->val), cudaMemcpyHostToDevice));

      PetscCallCUDA(cudaMalloc((void **)&cudastruct->sliidx, (a->totalslices + 1) * sizeof(*cudastruct->sliidx)));
      PetscCallCUDA(cudaMemcpy(cudastruct->sliidx, a->sliidx, (a->totalslices + 1) * sizeof(*a->sliidx), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMalloc((void **)&cudastruct->chunk_slice_map, a->totalchunks * sizeof(*cudastruct->chunk_slice_map)));
      PetscCallCUDA(cudaMemcpy(cudastruct->chunk_slice_map, a->chunk_slice_map, a->totalchunks * sizeof(*a->chunk_slice_map), cudaMemcpyHostToDevice));
      PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar) + sizeof(PetscInt)) + (a->totalslices + 1 + a->totalchunks) * sizeof(PetscInt)));
    }
    PetscCallCUDA(WaitForCUDA());
    PetscCall(PetscLogEventEnd(MAT_CUDACopyToGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

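/*
  Offload bookkeeping sketch (summarizing the routine above): a matrix starts
  as PETSC_OFFLOAD_UNALLOCATED; the first copy allocates the device mirror and
  leaves the mask at PETSC_OFFLOAD_BOTH. Host-side changes downgrade the mask
  to PETSC_OFFLOAD_CPU, and the next copy either re-sends just val (when the
  nonzerostate is unchanged) or rebuilds all four device arrays.
*/
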
static __global__ void matmult_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  PetscInt  i, row, slice_id, row_in_slice;
  MatScalar sum;
  /* one thread per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / sliceheight;
    row_in_slice = row % sliceheight;
    sum          = 0.0;
    for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]];
    y[row] = sum;
  }
}

static __global__ void matmultadd_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  PetscInt  i, row, slice_id, row_in_slice;
  MatScalar sum;
  /* one thread per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / sliceheight;
    row_in_slice = row % sliceheight;
    sum          = 0.0;
    for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]];
    z[row] = y[row] + sum;
  }
}

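/*
  Access-pattern note for the basic kernels above: a slice is stored
  column-major with stride sliceheight, so at every step of the loop the
  sliceheight consecutive threads of a slice read consecutive aval/acolidx
  entries and the loads coalesce. E.g. with sliceheight = 16, step j for rows
  0..15 of slice 0 touches aval[sliidx[0] + 16 * j + 0 .. 15].
*/
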
#if !defined(PETSC_USE_COMPLEX)
/* use 1 block per slice, suitable for large slice width */
template <int BLOCKY>
__global__ void matmult_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[32][BLOCKY];
  PetscInt             i, row, slice_id = blockIdx.x;
  int                  tid = threadIdx.x + threadIdx.y * 32;
  /* transposed index */
  int         tidx = tid % BLOCKY;
  int         tidy = tid / BLOCKY;
  PetscScalar t    = 0.0;

  row = slice_id * sliceheight + threadIdx.x % sliceheight;
  if (row < nrows) {
    for (i = sliidx[slice_id] + threadIdx.x + 32 * threadIdx.y; i < sliidx[slice_id + 1]; i += 32 * BLOCKY) t += aval[i] * x[acolidx[i]];
  }
  #pragma unroll
  for (int offset = 16; offset >= sliceheight; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset);
  /* transpose layout to reduce each row using warp shfl */
  if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t;
  __syncthreads();
  if (tidy < sliceheight) t = shared[tidy][tidx];
  #pragma unroll
  for (int offset = BLOCKY / 2; offset > 0; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY);
  if (tidx == 0 && tidy < sliceheight) shared[0][tidy] = t;
  __syncthreads();
  if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) y[row] = shared[0][threadIdx.x];
}

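/*
  Reduction scheme of kernel9, stage by stage (each block owns one slice and
  is shaped 32 x BLOCKY):
    1. every thread accumulates a strided partial sum t over the slice;
    2. __shfl_down_sync folds the 32 lanes of each warp down to sliceheight
       partials, so lane r holds a partial for row r of the slice;
    3. the partials are transposed through shared memory so the BLOCKY
       partials of one row become contiguous within a warp;
    4. a second shfl reduction of width BLOCKY produces the final row sums,
       which the threads with threadIdx.y == 0 write out.
*/
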
/* use 1 block per slice, suitable for large slice width */
template <int BLOCKY>
__global__ void matmultadd_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[32][BLOCKY];
  PetscInt             i, row, slice_id = blockIdx.x;
  int                  tid = threadIdx.x + threadIdx.y * 32;
  /* transposed index */
  int         tidx = tid % BLOCKY;
  int         tidy = tid / BLOCKY;
  PetscScalar t    = 0.0;

  row = slice_id * sliceheight + threadIdx.x % sliceheight;
  if (row < nrows) {
    for (i = sliidx[slice_id] + threadIdx.x + 32 * threadIdx.y; i < sliidx[slice_id + 1]; i += 32 * BLOCKY) t += aval[i] * x[acolidx[i]];
  }
  #pragma unroll
  for (int offset = 16; offset >= sliceheight; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset);
  /* transpose layout to reduce each row using warp shfl */
  if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t;
  __syncthreads();
  if (tidy < sliceheight) t = shared[tidy][tidx];
  #pragma unroll
  for (int offset = BLOCKY / 2; offset > 0; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY);
  if (tidx == 0 && tidy < sliceheight) shared[0][tidy] = t;
  __syncthreads();
  if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) z[row] = y[row] + shared[0][threadIdx.x];
}

template <int BLOCKY>
__device__ __forceinline__ static bool segment_scan(PetscInt flag[], MatScalar shared[], PetscScalar *val)
{
  bool head = true;
  #pragma unroll
  for (int i = 1; i < BLOCKY * 2; i <<= 1) {
    int halfwarpid                         = threadIdx.y * 2 + threadIdx.x / 16;
    shared[threadIdx.x + threadIdx.y * 32] = 0;
    if (halfwarpid >= i && flag[halfwarpid - i] == flag[halfwarpid]) {
      shared[threadIdx.x + threadIdx.y * 32] = *val;
      if (i == 1) head = false;
    }
    __syncthreads();
    if (halfwarpid < BLOCKY * 2 - i) *val += shared[threadIdx.x + threadIdx.y * 32 + i * 16];
    __syncthreads();
  }
  return head;
}

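/*
  segment_scan combines the 2 * BLOCKY half-warp partial sums of a chunk by
  slice: flag[h] holds the slice id owning half-warp h, and at doubling
  distances a partial is pulled in from above only while the slice id matches,
  so after the loop the first half-warp of each segment holds the segment
  total. The return value marks that segment head, i.e. the threads that must
  write the total. Sketch with BLOCKY = 2 and flags {5, 5, 6, 6}: per-half-warp
  values {a, b, c, d} end as {a + b, b, c + d, d}, with heads at half-warps 0
  and 2.
*/
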
/* load-balancing version. Chunksize is equal to the number of threads per block */
template <int BLOCKY>
__global__ void matmult_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunksperblock, PetscInt totalchunks, const PetscInt *chunk_slice_map, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[BLOCKY * 32];
  PetscInt             gid, row, start_slice, cid;
  PetscScalar          t = 0.0;
  AtomicAdd<MatScalar> atomAdd;
  /* zero out y */
  for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * 32 * BLOCKY); iter++) {
    gid = gridDim.x * 32 * BLOCKY * iter + blockIdx.x * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
    if (gid < nrows) y[gid] = 0.0;
  }
  for (int iter = 0; iter < chunksperblock; iter++) {
    cid = blockIdx.x * chunksperblock + iter; /* chunk id */
    if (cid < totalchunks) {
      start_slice = chunk_slice_map[cid]; /* starting slice at each iteration */
      gid         = cid * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
      if ((cid + 1) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* this iteration covers more than one slice */
        __shared__ PetscInt flag[BLOCKY * 2];
        bool                write;
        PetscInt            slice_id = start_slice, totalslices = PetscCeilIntMacro(nrows, sliceheight), totalentries = sliidx[totalslices];
        /* find out the slice that this element belongs to */
        while (gid < totalentries && gid >= sliidx[slice_id + 1]) slice_id++;
        if (threadIdx.x % 16 == 0) flag[threadIdx.y * 2 + threadIdx.x / 16] = slice_id;
        row = slice_id * sliceheight + threadIdx.x % sliceheight;
        if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]];
        __syncthreads();
        write = segment_scan<BLOCKY>(flag, shared, &t);
        if (row < nrows && gid < totalentries && write) atomAdd(y[row], t);
        t = 0.0;
      } else { /* this iteration covers only one slice */
        row = start_slice * sliceheight + threadIdx.x % sliceheight;
        if (row < nrows) t += aval[gid] * x[acolidx[gid]];
        if (iter == chunksperblock - 1 || (cid + 2) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* last iteration or next iteration covers more than one slice */
          int tid = threadIdx.x + threadIdx.y * 32, tidx = tid % BLOCKY, tidy = tid / BLOCKY;
  /* reduction and write to output vector */
  #pragma unroll
          for (int offset = 16; offset >= sliceheight; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset);
          /* transpose layout to reduce each row using warp shfl */
          if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx.x][threadIdx.y] = t */
          __syncthreads();
          if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */
  #pragma unroll
          for (int offset = BLOCKY / 2; offset > 0; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY);
          if (tidx == 0 && tidy < sliceheight) shared[tidy] = t; /* shared[0][tidy] = t */
          __syncthreads();
          if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomAdd(y[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */
          t = 0.0;
        }
      }
    }
  }
}

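/*
  Load-balancing idea of kernel8: the nonzero arrays are cut into fixed chunks
  of 32 * BLOCKY entries and chunk_slice_map[cid] records the slice in which
  chunk cid starts. A chunk that stays inside one slice uses the fast shfl
  reduction above; a chunk that crosses slice boundaries locates each entry's
  slice by a forward search and combines partial row sums with segment_scan
  plus an atomic add, so no block idles on short slices while another drains a
  very long one.
*/
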
/* load-balancing version. Chunksize is equal to the number of threads per block */
template <int BLOCKY>
__global__ void matmultadd_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunksperblock, PetscInt totalchunks, const PetscInt *chunk_slice_map, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[BLOCKY * 32];
  PetscInt             gid, row, start_slice, cid;
  PetscScalar          t = 0.0;
  AtomicAdd<MatScalar> atomAdd;
  /* copy y to z */
  for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * 32 * BLOCKY); iter++) {
    gid = gridDim.x * 32 * BLOCKY * iter + blockIdx.x * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
    if (gid < nrows) z[gid] = y[gid];
  }
  for (int iter = 0; iter < chunksperblock; iter++) {
    cid = blockIdx.x * chunksperblock + iter; /* chunk id */
    if (cid < totalchunks) {
      start_slice = chunk_slice_map[cid]; /* starting slice at each iteration */
      gid         = cid * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
      if ((cid + 1) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* this iteration covers more than one slice */
        __shared__ PetscInt flag[BLOCKY * 2];
        bool                write;
        PetscInt            slice_id = start_slice, totalslices = PetscCeilIntMacro(nrows, sliceheight), totalentries = sliidx[totalslices];
        /* find out the slice that this element belongs to */
        while (gid < totalentries && gid >= sliidx[slice_id + 1]) slice_id++;
        if (threadIdx.x % 16 == 0) flag[threadIdx.y * 2 + threadIdx.x / 16] = slice_id;
        row = slice_id * sliceheight + threadIdx.x % sliceheight;
        if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]];
        __syncthreads();
        write = segment_scan<BLOCKY>(flag, shared, &t);
        if (row < nrows && gid < totalentries && write) atomAdd(z[row], t);
        t = 0.0;
      } else { /* this iteration covers only one slice */
        row = start_slice * sliceheight + threadIdx.x % sliceheight;
        if (row < nrows) t += aval[gid] * x[acolidx[gid]];
        if (iter == chunksperblock - 1 || (cid + 2) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* last iteration or next iteration covers more than one slice */
          int tid = threadIdx.x + threadIdx.y * 32, tidx = tid % BLOCKY, tidy = tid / BLOCKY;
  /* reduction and write to output vector */
  #pragma unroll
          for (int offset = 16; offset >= sliceheight; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset);
          /* transpose layout to reduce each row using warp shfl */
          if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx.x][threadIdx.y] = t */
          __syncthreads();
          if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */
  #pragma unroll
          for (int offset = BLOCKY / 2; offset > 0; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY);
          if (tidx == 0 && tidy < sliceheight) shared[tidy] = t; /* shared[0][tidy] = t */
          __syncthreads();
          if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomAdd(z[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */
          t = 0.0;
        }
      }
    }
  }
}

/* use 1 warp per slice, suitable for small slice width */
static __global__ void matmult_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  PetscInt i, row, slice_id;
  slice_id = blockIdx.x * blockDim.y + threadIdx.y;
  row      = slice_id * sliceheight + threadIdx.x % sliceheight;
  double t = 0.0;
  if (row < nrows) {
    for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += 32) t += aval[i] * x[acolidx[i]];
  }
  #pragma unroll
  for (int offset = 16; offset >= sliceheight; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset);
  if (row < nrows && threadIdx.x < sliceheight) y[row] = t;
}

/* use 1 warp per slice, suitable for small slice width */
static __global__ void matmultadd_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  PetscInt i, row, slice_id;
  slice_id = blockIdx.x * blockDim.y + threadIdx.y;
  row      = slice_id * sliceheight + threadIdx.x % sliceheight;
  double t = 0.0;
  if (row < nrows) {
    for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += 32) t += aval[i] * x[acolidx[i]];
  }
  #pragma unroll
  for (int offset = 16; offset >= sliceheight; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset);
  if (row < nrows && threadIdx.x < sliceheight) z[row] = y[row] + t;
}
#endif

/***********  Kernels 2-6 are tied to slice height 16. They are kept only for performance comparison  **********/

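/*
  Scheme shared by kernels 2-6: a 512-thread block is shaped (512 / blocky,
  blocky); threadIdx.x selects the row and the blocky threads along y split
  that row's entries, each writing a partial sum to shared memory. The
  if-ladders below then tree-reduce over y, halving the active partials per
  step, e.g. kernel4 (blocky = 8) reduces 8 partials -> 4 -> 2 -> 1 before the
  thread with y == 0 stores the result.
*/
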
static __global__ void matmult_seqsell_tiled_kernel6(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 16) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 8) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}

static __global__ void matmult_seqsell_tiled_kernel5(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 8) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}

static __global__ void matmult_seqsell_tiled_kernel4(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}

static __global__ void matmult_seqsell_tiled_kernel3(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}

static __global__ void matmult_seqsell_tiled_kernel2(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}

static __global__ void matmultadd_seqsell_tiled_kernel6(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 16) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 8) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}

static __global__ void matmultadd_seqsell_tiled_kernel5(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 8) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}

static __global__ void matmultadd_seqsell_tiled_kernel4(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}

static __global__ void matmultadd_seqsell_tiled_kernel3(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}

static __global__ void matmultadd_seqsell_tiled_kernel2(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}

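/*
  Host-side dispatch notes for the MatMult/MatMultAdd wrappers below: grid
  sizes are computed as 1 + (n - 1) / k, i.e. ceil(n / k) blocks for n work
  items and k items per block. kernelchoice 0 (the default, since the struct
  is zero-initialized) picks a kernel heuristically from the slice-width
  statistics gathered at assembly; choices 1-9 force a specific kernel.
*/
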
static PetscErrorCode MatMult_SeqSELLCUDA(Mat A, Vec xx, Vec yy)
{
  Mat_SeqSELL       *a          = (Mat_SeqSELL *)A->data;
  Mat_SeqSELLCUDA   *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
  PetscScalar       *y;
  const PetscScalar *x;
  PetscInt           nrows = A->rmap->n, sliceheight = a->sliceheight;
  MatScalar         *aval;
  PetscInt          *acolidx;
  PetscInt          *sliidx;
  PetscInt           nblocks, blocksize = 512; /* blocksize must be multiple of SLICE_HEIGHT*32 */
  dim3               block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32);
#if !defined(PETSC_USE_COMPLEX)
  PetscInt  chunksperblock, nchunks, *chunk_slice_map;
  PetscReal maxoveravg;
#endif

  PetscFunctionBegin;
  PetscCheck(32 % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires the slice height to be a divisor of 32, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight);
  PetscCheck(!(cudastruct->kernelchoice >= 2 && cudastruct->kernelchoice <= 6 && sliceheight != SLICE_HEIGHT), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices {2-6} require the slice height of the matrix to be 16, but the current slice height is %" PetscInt_FMT, sliceheight);
  PetscCall(MatSeqSELLCUDACopyToGPU(A));
  /* the device arrays in cudastruct may not be allocated until MatSeqSELLCUDACopyToGPU() has been called */
  aval    = cudastruct->val;
  acolidx = cudastruct->colidx;
  sliidx  = cudastruct->sliidx;

  PetscCall(VecCUDAGetArrayRead(xx, &x));
  PetscCall(VecCUDAGetArrayWrite(yy, &y));
  PetscCall(PetscLogGpuTimeBegin());

  switch (cudastruct->kernelchoice) {
#if !defined(PETSC_USE_COMPLEX)
  case 9:
    nblocks = 1 + (nrows - 1) / sliceheight;
    if (cudastruct->blocky == 2) {
      matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (cudastruct->blocky == 4) {
      matmult_seqsell_tiled_kernel9<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (cudastruct->blocky == 8) {
      matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (cudastruct->blocky == 16) {
      matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (cudastruct->blocky == 32) {
      matmult_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else {
      matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    }
    break;
  case 7:
    nblocks = 1 + (nrows - 1) / (2 * sliceheight);
    if (cudastruct->blocky == 2) {
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (cudastruct->blocky == 4) {
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (cudastruct->blocky == 8) {
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (cudastruct->blocky == 16) {
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (cudastruct->blocky == 32) {
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else {
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    }
    break;
#endif
  case 6:
    nblocks = 1 + (nrows - 1) / (blocksize / 32); /* 1 slice per block if blocksize=512 */
    matmult_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, acolidx, aval, sliidx, x, y);
    break;
  case 5:
    nblocks = 1 + (nrows - 1) / (blocksize / 16); /* 2 slices per block if blocksize=512 */
    matmult_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, acolidx, aval, sliidx, x, y);
    break;
  case 4:
    nblocks = 1 + (nrows - 1) / (blocksize / 8); /* 4 slices per block if blocksize=512 */
    matmult_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, acolidx, aval, sliidx, x, y);
    break;
  case 3:
    nblocks = 1 + (nrows - 1) / (blocksize / 4); /* 8 slices per block if blocksize=512 */
    matmult_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, acolidx, aval, sliidx, x, y);
    break;
  case 2: /* 16 slices per block if blocksize=512 */
    nblocks = 1 + (nrows - 1) / (blocksize / 2);
    matmult_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, acolidx, aval, sliidx, x, y);
    break;
  case 1: /* 32 slices per block if blocksize=512 */
    nblocks = 1 + (nrows - 1) / blocksize;
    matmult_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    break;
#if !defined(PETSC_USE_COMPLEX)
  case 0:
    maxoveravg = a->maxslicewidth / a->avgslicewidth;
    if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* important threshold */
      /* each block handles approximately one slice */
      PetscInt blocky = a->chunksize / 32;
      nchunks         = cudastruct->totalchunks;
      chunksperblock  = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize;
      nblocks         = 1 + (nchunks - 1) / chunksperblock;
      chunk_slice_map = cudastruct->chunk_slice_map;
      if (blocky == 2) {
        matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
      } else if (blocky == 4) {
        matmult_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
      } else if (blocky == 8) {
        matmult_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
      } else if (blocky == 16) {
        matmult_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
      } else if (blocky == 32) {
        matmult_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
      } else {
        matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
      }
    } else {
      PetscInt avgslicesize = sliceheight * a->avgslicewidth;
      if (avgslicesize <= 432) {
        if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) {
          nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */
          matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
        } else {
          nblocks = 1 + (nrows - 1) / sliceheight;
          matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
        }
      } else if (avgslicesize <= 2400) {
        nblocks = 1 + (nrows - 1) / sliceheight;
        matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
      } else {
        nblocks = 1 + (nrows - 1) / sliceheight;
        matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
      }
    }
    break;
#endif
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported kernel choice %" PetscInt_FMT " for MatMult_SeqSELLCUDA.", cudastruct->kernelchoice);
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(VecCUDARestoreArrayRead(xx, &x));
  PetscCall(VecCUDARestoreArrayWrite(yy, &y));
  PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
  PetscFunctionReturn(PETSC_SUCCESS);
}

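/*
  Flop accounting note: y = A*x performs one multiply per stored nonzero and
  one add per nonzero beyond the first in each nonempty row, hence the
  2.0 * a->nz - a->nonzerorowcnt logged above; padding entries multiplied on
  the GPU are not counted as useful flops.
*/
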
static PetscErrorCode MatMultAdd_SeqSELLCUDA(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_SeqSELL       *a          = (Mat_SeqSELL *)A->data;
  Mat_SeqSELLCUDA   *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
  PetscScalar       *z;
  const PetscScalar *y, *x;
  PetscInt           nrows = A->rmap->n, sliceheight = a->sliceheight;
  MatScalar         *aval;
  PetscInt          *acolidx;
  PetscInt          *sliidx;
#if !defined(PETSC_USE_COMPLEX)
  PetscReal maxoveravg;
  PetscInt  chunksperblock, nchunks, *chunk_slice_map;
  PetscInt  blocky = cudastruct->blocky;
#endif

  PetscFunctionBegin;
  PetscCheck(32 % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires the slice height to be a divisor of 32, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight);
  PetscCheck(!(cudastruct->kernelchoice >= 2 && cudastruct->kernelchoice <= 6 && sliceheight != SLICE_HEIGHT), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices {2-6} require the slice height of the matrix to be 16, but the current slice height is %" PetscInt_FMT, sliceheight);
  PetscCall(MatSeqSELLCUDACopyToGPU(A));
  /* the device arrays in cudastruct may not be allocated (or may be reallocated) until MatSeqSELLCUDACopyToGPU() has been called, so fetch the pointers afterwards */
  aval    = cudastruct->val;
  acolidx = cudastruct->colidx;
  sliidx  = cudastruct->sliidx;
  if (a->nz) {
    PetscInt nblocks, blocksize = 512;
    dim3     block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32);
    PetscCall(VecCUDAGetArrayRead(xx, &x));
    PetscCall(VecCUDAGetArrayRead(yy, &y));
    PetscCall(VecCUDAGetArrayWrite(zz, &z));
    PetscCall(PetscLogGpuTimeBegin());

    switch (cudastruct->kernelchoice) {
#if !defined(PETSC_USE_COMPLEX)
    case 9:
      nblocks = 1 + (nrows - 1) / sliceheight;
      if (blocky == 2) {
        matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 4) {
        matmultadd_seqsell_tiled_kernel9<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 8) {
        matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 16) {
        matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 32) {
        matmultadd_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else {
        matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      }
      break;
    case 8:
      /* each block handles approximately one slice */
      nchunks         = cudastruct->totalchunks;
      blocky          = a->chunksize / 32;
      chunksperblock  = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize;
      nblocks         = 1 + (nchunks - 1) / chunksperblock;
      chunk_slice_map = cudastruct->chunk_slice_map;
      if (blocky == 2) {
        matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 4) {
        matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 8) {
        matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 16) {
        matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 32) {
        matmultadd_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
      } else {
        matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
      }
      break;
    case 7:
      nblocks = 1 + (nrows - 1) / (2 * sliceheight);
      if (blocky == 2) {
        matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 4) {
        matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 8) {
        matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 16) {
        matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 32) {
        matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else {
        matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      }
      break;
#endif
    case 6:
      nblocks = 1 + (nrows - 1) / (blocksize / 32);
      matmultadd_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, acolidx, aval, sliidx, x, y, z);
      break;
    case 5:
      nblocks = 1 + (nrows - 1) / (blocksize / 16);
      matmultadd_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, acolidx, aval, sliidx, x, y, z);
      break;
    case 4:
      nblocks = 1 + (nrows - 1) / (blocksize / 8);
      matmultadd_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, acolidx, aval, sliidx, x, y, z);
      break;
    case 3:
      nblocks = 1 + (nrows - 1) / (blocksize / 4);
      matmultadd_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, acolidx, aval, sliidx, x, y, z);
      break;
    case 2:
      nblocks = 1 + (nrows - 1) / (blocksize / 2);
      matmultadd_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, acolidx, aval, sliidx, x, y, z);
      break;
    case 1:
      nblocks = 1 + (nrows - 1) / blocksize;
      matmultadd_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      break;
#if !defined(PETSC_USE_COMPLEX)
    case 0:
      maxoveravg = a->maxslicewidth / a->avgslicewidth;
      if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* important threshold */
        /* each block handles approximately one slice */
        nchunks         = cudastruct->totalchunks;
        blocky          = a->chunksize / 32;
        chunksperblock  = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize;
        nblocks         = 1 + (nchunks - 1) / chunksperblock;
        chunk_slice_map = cudastruct->chunk_slice_map;
        if (blocky == 2) {
          matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
        } else if (blocky == 4) {
          matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
        } else if (blocky == 8) {
          matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
        } else if (blocky == 16) {
          matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
        } else if (blocky == 32) {
          matmultadd_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
        } else {
          matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
        }
      } else {
        PetscInt avgslicesize = sliceheight * a->avgslicewidth;
        if (avgslicesize <= 432) {
          if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) {
            nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */
            matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
          } else {
            nblocks = 1 + (nrows - 1) / sliceheight;
            matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
          }
        } else if (avgslicesize <= 2400) {
          nblocks = 1 + (nrows - 1) / sliceheight;
          matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
        } else {
          nblocks = 1 + (nrows - 1) / sliceheight;
          matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
        }
      }
      break;
#endif
    default:
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported kernel choice %" PetscInt_FMT " for MatMultAdd_SeqSELLCUDA.", cudastruct->kernelchoice);
    }
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(VecCUDARestoreArrayRead(xx, &x));
    PetscCall(VecCUDARestoreArrayRead(yy, &y));
    PetscCall(VecCUDARestoreArrayWrite(zz, &z));
    PetscCall(PetscLogGpuFlops(2.0 * a->nz));
  } else {
    PetscCall(VecCopy(yy, zz));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSetFromOptions_SeqSELLCUDA(Mat A, PetscOptionItems PetscOptionsObject)
{
  Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
  PetscInt         kernel, blocky;
  PetscBool        flg;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqSELLCUDA options");
  PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_blocky", &blocky, &flg));
  if (flg) {
    PetscCheck(blocky == 2 || blocky == 4 || blocky == 8 || blocky == 16 || blocky == 32, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Unsupported blocky: %" PetscInt_FMT ", it should be in {2,4,8,16,32}", blocky);
    cudastruct->blocky = blocky;
  }
  PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_kernel", &kernel, &flg));
  if (flg) {
    PetscCheck(kernel >= 0 && kernel <= 9, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Wrong kernel choice: %" PetscInt_FMT ", it should be in [0,9]", kernel);
    cudastruct->kernelchoice = kernel;
    if (kernel == 8) PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_chunksperblock", &cudastruct->chunksperblock, &flg));
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}

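/*
  Runtime usage sketch (./app is a hypothetical PETSc executable): force the
  one-block-per-slice kernel with a 32 x 8 thread block,
    ./app -mat_type seqsellcuda -mat_sell_spmv_cuda_kernel 9 -mat_sell_spmv_cuda_blocky 8
  or the one-warp-per-slice kernel:
    ./app -mat_type seqsellcuda -mat_sell_spmv_cuda_kernel 7
  -mat_sell_spmv_cuda_chunksperblock is consulted only when kernel 8 is
  selected.
*/
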
PETSC_INTERN PetscErrorCode MatAssemblyEnd_SpMV_Preprocessing_Private(Mat A)
{
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;

  PetscFunctionBegin;
  PetscCall(MatSeqSELLGetAvgSliceWidth(A, &a->avgslicewidth));
  PetscCall(MatSeqSELLGetMaxSliceWidth(A, &a->maxslicewidth));
  PetscCall(MatSeqSELLGetFillRatio(A, &a->fillratio));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatAssemblyEnd_SeqSELLCUDA(Mat A, MatAssemblyType mode)
{
  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqSELL(A, mode));
  PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(A));
  if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->factortype == MAT_FACTOR_NONE) PetscCall(MatSeqSELLCUDACopyToGPU(A));
  A->ops->mult    = MatMult_SeqSELLCUDA;
  A->ops->multadd = MatMultAdd_SeqSELLCUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatZeroEntries_SeqSELLCUDA(Mat A)
{
  PetscBool    both = PETSC_FALSE;
  Mat_SeqSELL *a    = (Mat_SeqSELL *)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
    if (cudastruct->val) {
      both = PETSC_TRUE;
      PetscCallCUDA(cudaMemset(cudastruct->val, 0, a->sliidx[a->totalslices] * sizeof(*cudastruct->val)));
    }
  }
  PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices]));
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatDestroy_SeqSELLCUDA(Mat A)
{
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE && A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) PetscCall(MatSeqSELLCUDA_Destroy((Mat_SeqSELLCUDA **)&A->spptr));
  PetscCall(MatDestroy_SeqSELL(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat);
static PetscErrorCode       MatDuplicate_SeqSELLCUDA(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqSELL(A, cpvalues, B));
  PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(*B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat B)
{
  Mat_SeqSELLCUDA *cudastruct;

  PetscFunctionBegin;
  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

  if (!B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      PetscCall(PetscNew(&cudastruct));
      B->spptr = cudastruct;
    }
  }

  B->ops->assemblyend    = MatAssemblyEnd_SeqSELLCUDA;
  B->ops->destroy        = MatDestroy_SeqSELLCUDA;
  B->ops->setfromoptions = MatSetFromOptions_SeqSELLCUDA;
  B->ops->mult           = MatMult_SeqSELLCUDA;
  B->ops->multadd        = MatMultAdd_SeqSELLCUDA;
  B->ops->duplicate      = MatDuplicate_SeqSELLCUDA;
  B->ops->zeroentries    = MatZeroEntries_SeqSELLCUDA;

  /* No need to assemble SeqSELL, but need to do the preprocessing for SpMV */
  PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(B));

  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELLCUDA));
  B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*MC
  MATSEQSELLCUDA - MATSELLCUDA = "(seq)sellcuda" - A matrix type to be used for sparse matrices on NVIDIA GPUs.

  Options Database Keys:
+  -mat_type seqsellcuda - sets the matrix type to "seqsellcuda" during a call to `MatSetFromOptions()`
.  -mat_sell_spmv_cuda_kernel - selects an SpMV kernel for `MATSELLCUDA`
-  -mat_sell_spmv_cuda_blocky - sets the y dimension of the block size of the SpMV kernels; these kernels use a 2D thread block whose x dimension is 32

  Level: beginner

.seealso: [](ch_matrices), `Mat`, `MATSELLCUDA`
M*/

PETSC_EXTERN PetscErrorCode MatCreate_SeqSELLCUDA(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqSELL(B));
  PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(B));
  PetscCall(MatSetFromOptions(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}