1 #include <petscdevice_cuda.h>
2 #include <petsc/private/cupmatomics.hpp>
3 #include <../src/mat/impls/sell/seq/sell.h> /*I "petscmat.h" I*/
4
5 #define SLICE_HEIGHT 16
6
/* Device-side mirror of the sequential SELL (sliced ELLPACK) matrix storage.
   Pointers are device pointers; scalars cache the host-side sizes so the GPU
   copy can be validated/reallocated when the host matrix changes. */
typedef struct {
  PetscInt maxallocmat;      /* allocated length of the colidx/val device arrays */
  PetscInt totalentries;     /* number of stored entries, equals host sliidx[totalslices] at copy time */
  PetscInt *colidx;          /* column index array, device pointer */
  MatScalar *val;            /* value array, device pointer */
  PetscInt totalslices;      /* number of slices */
  PetscInt *sliidx;          /* slice index array, device pointer */
  PetscInt nonzerostate;     /* matrix nonzero state when the device arrays were last built; used to detect pattern changes */
  PetscInt kernelchoice;     /* selects which SpMV kernel MatMult/MatMultAdd launch */
  PetscInt blocky;           /* blockDim.y used by the tiled kernels */
  PetscInt chunksperblock;   /* chunks processed per thread block in the load-balancing kernels */
  PetscInt totalchunks;      /* total number of chunks the entries are partitioned into */
  PetscInt *chunk_slice_map; /* starting slice for each chunk, device pointer */
} Mat_SeqSELLCUDA;
21
/* Release every device array held by the Mat_SeqSELLCUDA structure, then free
   the structure itself. Safe to call when *cudastruct is NULL. */
static PetscErrorCode MatSeqSELLCUDA_Destroy(Mat_SeqSELLCUDA **cudastruct)
{
  Mat_SeqSELLCUDA *cs = *cudastruct;

  PetscFunctionBegin;
  if (cs) {
    if (cs->colidx) PetscCallCUDA(cudaFree(cs->colidx));
    if (cs->val) PetscCallCUDA(cudaFree(cs->val));
    if (cs->sliidx) PetscCallCUDA(cudaFree(cs->sliidx));
    if (cs->chunk_slice_map) PetscCallCUDA(cudaFree(cs->chunk_slice_map));
    PetscCall(PetscFree(*cudastruct));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
34
/* Copy the SELL matrix from host to device when the host copy is newer.
   If the matrix is assembled and its nonzero pattern is unchanged since the
   last copy, only the values are transferred; otherwise all device arrays are
   freed, reallocated, and refilled from the host. */
static PetscErrorCode MatSeqSELLCUDACopyToGPU(Mat A)
{
  Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    PetscCall(PetscLogEventBegin(MAT_CUDACopyToGPU, A, 0, 0, 0));
    if (A->assembled && A->nonzerostate == cudastruct->nonzerostate) {
      /* copy values only */
      PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), cudaMemcpyHostToDevice));
      PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar))));
    } else {
      /* nonzero pattern changed (or first assembly): rebuild all device arrays */
      if (cudastruct->colidx) PetscCallCUDA(cudaFree(cudastruct->colidx));
      if (cudastruct->val) PetscCallCUDA(cudaFree(cudastruct->val));
      if (cudastruct->sliidx) PetscCallCUDA(cudaFree(cudastruct->sliidx));
      if (cudastruct->chunk_slice_map) PetscCallCUDA(cudaFree(cudastruct->chunk_slice_map));
      cudastruct->maxallocmat = a->maxallocmat;
      cudastruct->totalentries = a->sliidx[a->totalslices];
      cudastruct->totalslices = a->totalslices;
      cudastruct->totalchunks = a->totalchunks;
      /* record the nonzero state of this rebuild; without this the values-only
         fast path above can never trigger and every copy reallocates everything */
      cudastruct->nonzerostate = A->nonzerostate;
      PetscCallCUDA(cudaMalloc((void **)&cudastruct->colidx, a->maxallocmat * sizeof(*cudastruct->colidx)));
      PetscCallCUDA(cudaMalloc((void **)&cudastruct->val, a->maxallocmat * sizeof(*cudastruct->val)));
      /* copy values, nz or maxallocmat? */
      PetscCallCUDA(cudaMemcpy(cudastruct->colidx, a->colidx, a->sliidx[a->totalslices] * sizeof(*a->colidx), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(*a->val), cudaMemcpyHostToDevice));

      PetscCallCUDA(cudaMalloc((void **)&cudastruct->sliidx, (a->totalslices + 1) * sizeof(*cudastruct->sliidx)));
      PetscCallCUDA(cudaMemcpy(cudastruct->sliidx, a->sliidx, (a->totalslices + 1) * sizeof(*a->sliidx), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMalloc((void **)&cudastruct->chunk_slice_map, a->totalchunks * sizeof(*cudastruct->chunk_slice_map)));
      PetscCallCUDA(cudaMemcpy(cudastruct->chunk_slice_map, a->chunk_slice_map, a->totalchunks * sizeof(*a->chunk_slice_map), cudaMemcpyHostToDevice));
      PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar) + sizeof(PetscInt)) + (a->totalslices + 1 + a->totalchunks) * sizeof(PetscInt)));
    }
    PetscCallCUDA(WaitForCUDA());
    PetscCall(PetscLogEventEnd(MAT_CUDACopyToGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
74
/* SpMV y = A*x, one thread per row: each thread walks its row's entries inside
   the slice with stride sliceheight and accumulates the dot product. */
static __global__ void matmult_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  const PetscInt r = blockIdx.x * blockDim.x + threadIdx.x;

  if (r < nrows) {
    const PetscInt sid  = r / sliceheight; /* slice containing this row */
    const PetscInt roff = r % sliceheight; /* row position within the slice */
    MatScalar      acc  = 0.0;

    for (PetscInt k = sliidx[sid] + roff; k < sliidx[sid + 1]; k += sliceheight) acc += aval[k] * x[acolidx[k]];
    y[r] = acc;
  }
}
89
/* SpMV-add z = y + A*x, one thread per row: same traversal as the basic
   multiply kernel, with the result added to y and stored in z. */
static __global__ void matmultadd_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  const PetscInt r = blockIdx.x * blockDim.x + threadIdx.x;

  if (r < nrows) {
    const PetscInt sid  = r / sliceheight; /* slice containing this row */
    const PetscInt roff = r % sliceheight; /* row position within the slice */
    MatScalar      acc  = 0.0;

    for (PetscInt k = sliidx[sid] + roff; k < sliidx[sid + 1]; k += sliceheight) acc += aval[k] * x[acolidx[k]];
    z[r] = y[r] + acc;
  }
}
104
105 #if !defined(PETSC_USE_COMPLEX)
/* SpMV y = A*x. One (32, BLOCKY) thread block per slice, suitable for large
   slice width. Each thread accumulates a strided partial sum over the slice;
   a warp shuffle folds lanes belonging to the same row (sliceheight is assumed
   to divide 32 -- checked by the caller), a shared-memory transpose regroups
   the per-warp partials, and a second shuffle of width BLOCKY produces one
   value per row. */
template <int BLOCKY>
__global__ void matmult_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[32][BLOCKY];
  PetscInt i, row, slice_id = blockIdx.x;
  int tid = threadIdx.x + threadIdx.y * 32;
  /* transposed index */
  int tidx = tid % BLOCKY;
  int tidy = tid / BLOCKY;
  PetscScalar t = 0.0;

  row = slice_id * sliceheight + threadIdx.x % sliceheight;
  if (row < nrows) {
    /* strided accumulation: the whole block sweeps the slice in steps of 32*BLOCKY */
    for (i = sliidx[slice_id] + threadIdx.x + 32 * threadIdx.y; i < sliidx[slice_id + 1]; i += 32 * BLOCKY) t += aval[i] * x[acolidx[i]];
  }
  /* fold warp lanes that handle the same row down to the first sliceheight lanes */
#pragma unroll
  for (int offset = 16; offset >= sliceheight; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset);
  /* transpose layout to reduce each row using warp shfl */
  if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t;
  __syncthreads();
  if (tidy < sliceheight) t = shared[tidy][tidx];
  /* reduce the BLOCKY partials of each row within a shuffle group of width BLOCKY */
#pragma unroll
  for (int offset = BLOCKY / 2; offset > 0; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY);
  if (tidx == 0 && tidy < sliceheight) shared[0][tidy] = t;
  __syncthreads();
  if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) y[row] = shared[0][threadIdx.x];
}
134
/* SpMV-add z = y + A*x. One (32, BLOCKY) thread block per slice, suitable for
   large slice width. Identical reduction scheme to matmult_seqsell_tiled_kernel9
   (strided partials, warp shuffle within the slice, shared-memory transpose,
   shuffle of width BLOCKY), with the result added to y and written to z. */
template <int BLOCKY>
__global__ void matmultadd_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[32][BLOCKY];
  PetscInt i, row, slice_id = blockIdx.x;
  int tid = threadIdx.x + threadIdx.y * 32;
  /* transposed index */
  int tidx = tid % BLOCKY;
  int tidy = tid / BLOCKY;
  PetscScalar t = 0.0;

  row = slice_id * sliceheight + threadIdx.x % sliceheight;
  if (row < nrows) {
    /* strided accumulation: the whole block sweeps the slice in steps of 32*BLOCKY */
    for (i = sliidx[slice_id] + threadIdx.x + 32 * threadIdx.y; i < sliidx[slice_id + 1]; i += 32 * BLOCKY) t += aval[i] * x[acolidx[i]];
  }
  /* fold warp lanes that handle the same row down to the first sliceheight lanes */
#pragma unroll
  for (int offset = 16; offset >= sliceheight; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset);
  /* transpose layout to reduce each row using warp shfl */
  if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t;
  __syncthreads();
  if (tidy < sliceheight) t = shared[tidy][tidx];
  /* reduce the BLOCKY partials of each row within a shuffle group of width BLOCKY */
#pragma unroll
  for (int offset = BLOCKY / 2; offset > 0; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY);
  if (tidx == 0 && tidy < sliceheight) shared[0][tidy] = t;
  __syncthreads();
  if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) z[row] = y[row] + shared[0][threadIdx.x];
}
163
/* Block-wide segmented sum over the 2*BLOCKY half-warps of a (32, BLOCKY)
   block. flag[h] holds the slice id claimed by half-warp h; at each doubling
   step a half-warp whose flag matches the half-warp i positions earlier
   publishes its value through `shared`, and earlier half-warps accumulate it,
   so values propagate toward the first ("head") half-warp of each equal-flag
   segment. Returns true iff this thread's half-warp is its segment head.
   NOTE(review): contains __syncthreads(); callers must invoke it with the
   whole block on the same path, as kernel8 does. */
template <int BLOCKY>
__device__ __forceinline__ static bool segment_scan(PetscInt flag[], MatScalar shared[], PetscScalar *val)
{
  bool head = true;
#pragma unroll
  for (int i = 1; i < BLOCKY * 2; i <<= 1) {
    int halfwarpid = threadIdx.y * 2 + threadIdx.x / 16;
    shared[threadIdx.x + threadIdx.y * 32] = 0;
    if (halfwarpid >= i && flag[halfwarpid - i] == flag[halfwarpid]) {
      shared[threadIdx.x + threadIdx.y * 32] = *val;
      if (i == 1) head = false; /* a matching immediate predecessor means we are not a segment head */
    }
    __syncthreads();
    if (halfwarpid < BLOCKY * 2 - i) *val += shared[threadIdx.x + threadIdx.y * 32 + i * 16];
    __syncthreads();
  }
  return head;
}
182
/* Load-balancing SpMV y = A*x. The entries are partitioned into chunks of
   32*BLOCKY (the number of threads per block); each block processes
   chunksperblock consecutive chunks, one entry per thread per chunk.
   y is first zeroed with a grid-stride loop. Chunks that stay inside one slice
   accumulate across iterations and reduce with warp shuffles; chunks spanning
   several slices use a segmented scan keyed by slice id. Results go to y via
   atomic adds because a slice may straddle chunks owned by different blocks. */
template <int BLOCKY>
__global__ void matmult_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunksperblock, PetscInt totalchunks, const PetscInt *chunk_slice_map, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[BLOCKY * 32];
  PetscInt gid, row, start_slice, cid;
  PetscScalar t = 0.0;
  AtomicAdd<MatScalar> atomAdd;
  /* zero out y */
  for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * 32 * BLOCKY); iter++) {
    gid = gridDim.x * 32 * BLOCKY * iter + blockIdx.x * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
    if (gid < nrows) y[gid] = 0.0;
  }
  for (int iter = 0; iter < chunksperblock; iter++) {
    cid = blockIdx.x * chunksperblock + iter; /* chunk id */
    if (cid < totalchunks) {
      start_slice = chunk_slice_map[cid]; /* starting slice at each iteration */
      gid = cid * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
      if ((cid + 1) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* this iteration covers more than one slice */
        __shared__ PetscInt flag[BLOCKY * 2];
        bool write;
        PetscInt slice_id = start_slice, totalslices = PetscCeilIntMacro(nrows, sliceheight), totalentries = sliidx[totalslices];
        /* find out the slice that this element belongs to */
        while (gid < totalentries && gid >= sliidx[slice_id + 1]) slice_id++;
        if (threadIdx.x % 16 == 0) flag[threadIdx.y * 2 + threadIdx.x / 16] = slice_id;
        row = slice_id * sliceheight + threadIdx.x % sliceheight;
        if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]];
        __syncthreads();
        /* only the head half-warp of each slice segment writes its sum */
        write = segment_scan<BLOCKY>(flag, shared, &t);
        if (row < nrows && gid < totalentries && write) atomAdd(y[row], t);
        t = 0.0;
      } else { /* this iteration covers only one slice */
        row = start_slice * sliceheight + threadIdx.x % sliceheight;
        if (row < nrows) t += aval[gid] * x[acolidx[gid]];
        if (iter == chunksperblock - 1 || (cid + 2) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* last iteration or next iteration covers more than one slice */
          int tid = threadIdx.x + threadIdx.y * 32, tidx = tid % BLOCKY, tidy = tid / BLOCKY;
          /* reduction and write to output vector */
#pragma unroll
          for (int offset = 16; offset >= sliceheight; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset);
          /* transpose layout to reduce each row using warp shfl */
          if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx.x][threadIdx.y] = t */
          __syncthreads();
          if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */
#pragma unroll
          for (int offset = BLOCKY / 2; offset > 0; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY);
          if (tidx == 0 && tidy < sliceheight) shared[tidy] = t; /* shared[0][tidy] = t */
          __syncthreads();
          if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomAdd(y[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */
          t = 0.0;
        }
      }
    }
  }
}
237
/* Load-balancing SpMV-add z = y + A*x. Same chunked scheme as
   matmult_seqsell_tiled_kernel8 (chunks of 32*BLOCKY entries, chunksperblock
   per block, segmented scan for multi-slice chunks, warp-shuffle reduction for
   single-slice chunks, atomic adds into the output), except z is first
   initialized to y with a grid-stride loop and the products accumulate into z. */
template <int BLOCKY>
__global__ void matmultadd_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunksperblock, PetscInt totalchunks, const PetscInt *chunk_slice_map, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[BLOCKY * 32];
  PetscInt gid, row, start_slice, cid;
  PetscScalar t = 0.0;
  AtomicAdd<MatScalar> atomAdd;
  /* copy y to z */
  for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * 32 * BLOCKY); iter++) {
    gid = gridDim.x * 32 * BLOCKY * iter + blockIdx.x * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
    if (gid < nrows) z[gid] = y[gid];
  }
  for (int iter = 0; iter < chunksperblock; iter++) {
    cid = blockIdx.x * chunksperblock + iter; /* chunk id */
    if (cid < totalchunks) {
      start_slice = chunk_slice_map[cid]; /* starting slice at each iteration */
      gid = cid * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
      if ((cid + 1) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* this iteration covers more than one slice */
        __shared__ PetscInt flag[BLOCKY * 2];
        bool write;
        PetscInt slice_id = start_slice, totalslices = PetscCeilIntMacro(nrows, sliceheight), totalentries = sliidx[totalslices];
        /* find out the slice that this element belongs to */
        while (gid < totalentries && gid >= sliidx[slice_id + 1]) slice_id++;
        if (threadIdx.x % 16 == 0) flag[threadIdx.y * 2 + threadIdx.x / 16] = slice_id;
        row = slice_id * sliceheight + threadIdx.x % sliceheight;
        if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]];
        __syncthreads();
        /* only the head half-warp of each slice segment writes its sum */
        write = segment_scan<BLOCKY>(flag, shared, &t);
        if (row < nrows && gid < totalentries && write) atomAdd(z[row], t);
        t = 0.0;
      } else { /* this iteration covers only one slice */
        row = start_slice * sliceheight + threadIdx.x % sliceheight;
        if (row < nrows) t += aval[gid] * x[acolidx[gid]];
        if (iter == chunksperblock - 1 || (cid + 2) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* last iteration or next iteration covers more than one slice */
          int tid = threadIdx.x + threadIdx.y * 32, tidx = tid % BLOCKY, tidy = tid / BLOCKY;
          /* reduction and write to output vector */
#pragma unroll
          for (int offset = 16; offset >= sliceheight; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset);
          /* transpose layout to reduce each row using warp shfl */
          if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx.x][threadIdx.y] = t */
          __syncthreads();
          if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */
#pragma unroll
          for (int offset = BLOCKY / 2; offset > 0; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY);
          if (tidx == 0 && tidy < sliceheight) shared[tidy] = t; /* shared[0][tidy] = t */
          __syncthreads();
          if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomAdd(z[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */
          t = 0.0;
        }
      }
    }
  }
}
292
/* SpMV y = A*x. One warp (32 threads, blockDim.x) per slice, suitable for
   small slice width; blockDim.y slices per block. Partial sums are folded with
   warp shuffles down to the first sliceheight lanes (sliceheight is assumed to
   divide 32 -- checked by the caller). */
static __global__ void matmult_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  PetscInt i, row, slice_id;
  slice_id = blockIdx.x * blockDim.y + threadIdx.y;
  row = slice_id * sliceheight + threadIdx.x % sliceheight;
  /* accumulate in PetscScalar (not double) so the precision matches the matrix
     data in single-precision builds, consistent with the other kernels */
  PetscScalar t = 0.0;
  if (row < nrows) {
    for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += 32) t += aval[i] * x[acolidx[i]];
  }
#pragma unroll
  for (int offset = 16; offset >= sliceheight; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset);
  if (row < nrows && threadIdx.x < sliceheight) y[row] = t;
}
307
/* SpMV-add z = y + A*x. One warp per slice, suitable for small slice width;
   same reduction scheme as matmult_seqsell_tiled_kernel7 with the result added
   to y and stored in z. */
static __global__ void matmultadd_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  PetscInt i, row, slice_id;
  slice_id = blockIdx.x * blockDim.y + threadIdx.y;
  row = slice_id * sliceheight + threadIdx.x % sliceheight;
  /* accumulate in PetscScalar (not double) so the precision matches the matrix
     data in single-precision builds, consistent with the other kernels */
  PetscScalar t = 0.0;
  if (row < nrows) {
    for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += 32) t += aval[i] * x[acolidx[i]];
  }
#pragma unroll
  for (int offset = 16; offset >= sliceheight; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset);
  if (row < nrows && threadIdx.x < sliceheight) z[row] = y[row] + t;
}
322 #endif
323
324 /*********** Kernel 2-6 are tied to slice height 16. They are kept only for performance comparison **********/
325
/* Legacy SpMV y = A*x for slice height 16; 32 threads (blockDim.y) cooperate
   per row and partial sums are combined by a shared-memory tree reduction
   (first fold at offset 16 implies blockDim.y == 32, blockDim.x * blockDim.y
   == 512). NOTE(review): the __syncthreads() calls are inside the divergent
   "row < nrows" branch -- assumes the launch makes the branch uniform per
   block; confirm at the call site. */
static __global__ void matmult_seqsell_tiled_kernel6(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[512];
  PetscInt i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    /* tree reduction over threadIdx.y: 32 -> 16 -> 8 -> 4 -> 2 -> 1 partials per row */
    if (threadIdx.y < 16) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 8) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}
353
/* Legacy SpMV y = A*x for slice height 16 with 16 threads per row (first fold
   at offset 8 implies blockDim.y == 16); shared-memory tree reduction.
   NOTE(review): __syncthreads() inside the divergent "row < nrows" branch --
   assumes uniform branch per block; confirm at the call site. */
static __global__ void matmult_seqsell_tiled_kernel5(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[512];
  PetscInt i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    /* tree reduction over threadIdx.y: 16 -> 8 -> 4 -> 2 -> 1 partials per row */
    if (threadIdx.y < 8) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}
379
/* Legacy SpMV y = A*x for slice height 16 with 8 threads per row (first fold
   at offset 4 implies blockDim.y == 8); shared-memory tree reduction.
   NOTE(review): __syncthreads() inside the divergent "row < nrows" branch --
   assumes uniform branch per block; confirm at the call site. */
static __global__ void matmult_seqsell_tiled_kernel4(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[512];
  PetscInt i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    /* tree reduction over threadIdx.y: 8 -> 4 -> 2 -> 1 partials per row */
    if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}
403
/* Legacy SpMV y = A*x for slice height 16 with 4 threads per row (first fold
   at offset 2 implies blockDim.y == 4); shared-memory tree reduction.
   NOTE(review): __syncthreads() inside the divergent "row < nrows" branch --
   assumes uniform branch per block; confirm at the call site. */
static __global__ void matmult_seqsell_tiled_kernel3(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[512];
  PetscInt i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    /* tree reduction over threadIdx.y: 4 -> 2 -> 1 partials per row */
    if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}
425
/* Legacy SpMV y = A*x for slice height 16 with 2 threads per row (final
   combine folds the single second row of partials, so blockDim.y == 2);
   shared-memory reduction. NOTE(review): __syncthreads() inside the divergent
   "row < nrows" branch -- assumes uniform branch per block; confirm at the
   call site. */
static __global__ void matmult_seqsell_tiled_kernel2(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[512];
  PetscInt i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}
445
/* Legacy SpMV-add z = y + A*x for slice height 16; same 32-threads-per-row
   shared-memory tree reduction as matmult_seqsell_tiled_kernel6, adding the
   result to y. NOTE(review): __syncthreads() inside the divergent
   "row < nrows" branch -- assumes uniform branch per block; confirm at the
   call site. */
static __global__ void matmultadd_seqsell_tiled_kernel6(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[512];
  PetscInt i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    /* tree reduction over threadIdx.y: 32 -> 16 -> 8 -> 4 -> 2 -> 1 partials per row */
    if (threadIdx.y < 16) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 8) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}
473
/* Legacy SpMV-add z = y + A*x for slice height 16; same 16-threads-per-row
   reduction as matmult_seqsell_tiled_kernel5, adding the result to y.
   NOTE(review): __syncthreads() inside the divergent "row < nrows" branch --
   assumes uniform branch per block; confirm at the call site. */
static __global__ void matmultadd_seqsell_tiled_kernel5(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[512];
  PetscInt i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    /* tree reduction over threadIdx.y: 16 -> 8 -> 4 -> 2 -> 1 partials per row */
    if (threadIdx.y < 8) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}
499
/* Legacy SpMV-add z = y + A*x for slice height 16; same 8-threads-per-row
   reduction as matmult_seqsell_tiled_kernel4, adding the result to y.
   NOTE(review): __syncthreads() inside the divergent "row < nrows" branch --
   assumes uniform branch per block; confirm at the call site. */
static __global__ void matmultadd_seqsell_tiled_kernel4(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[512];
  PetscInt i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    /* tree reduction over threadIdx.y: 8 -> 4 -> 2 -> 1 partials per row */
    if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}
523
/* Legacy SpMV-add z = y + A*x for slice height 16; same 4-threads-per-row
   reduction as matmult_seqsell_tiled_kernel3, adding the result to y.
   NOTE(review): __syncthreads() inside the divergent "row < nrows" branch --
   assumes uniform branch per block; confirm at the call site. */
static __global__ void matmultadd_seqsell_tiled_kernel3(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[512];
  PetscInt i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    /* tree reduction over threadIdx.y: 4 -> 2 -> 1 partials per row */
    if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}
545
/* Legacy SpMV-add z = y + A*x for slice height 16; same 2-threads-per-row
   reduction as matmult_seqsell_tiled_kernel2, adding the result to y.
   NOTE(review): __syncthreads() inside the divergent "row < nrows" branch --
   assumes uniform branch per block; confirm at the call site. */
static __global__ void matmultadd_seqsell_tiled_kernel2(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[512];
  PetscInt i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}
565
/* y = A*x for a sequential SELL matrix on the GPU.

   A kernel is selected via cudastruct->kernelchoice:
     0   - heuristic auto-selection driven by slice-width statistics (real scalars only)
     1   - basic kernel, one thread per row
     2-6 - tiled kernels with 2/4/8/16/32 threads per row; require slice height 16
     7,9 - tiled kernels parameterized by the block y-dimension blocky (real scalars only)

   NOTE(review): kernel launches are not followed by cudaGetLastError(); a bad launch
   configuration would only surface at the next synchronizing CUDA call -- confirm this
   matches the file's error-checking convention. */
static PetscErrorCode MatMult_SeqSELLCUDA(Mat A, Vec xx, Vec yy)
{
  Mat_SeqSELL       *a          = (Mat_SeqSELL *)A->data;
  Mat_SeqSELLCUDA   *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
  PetscScalar       *y;
  const PetscScalar *x;
  PetscInt           nrows = A->rmap->n, sliceheight = a->sliceheight;
  MatScalar         *aval;
  PetscInt          *acolidx;
  PetscInt          *sliidx;
  PetscInt           nblocks, blocksize = 512; /* blocksize must be multiple of SLICE_HEIGHT*32 */
  dim3               block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32);
#if !defined(PETSC_USE_COMPLEX)
  PetscInt  chunksperblock, nchunks, *chunk_slice_map;
  PetscReal maxoveravg;
#endif

  PetscFunctionBegin;
  /* the tiled kernels reduce within 32-lane warps, so the slice height must divide 32 */
  PetscCheck(32 % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height be a divisor of 32, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight);
  PetscCheck(!(cudastruct->kernelchoice >= 2 && cudastruct->kernelchoice <= 6 && sliceheight != SLICE_HEIGHT), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices {2-6} requires the slice height of the matrix be 16, but the current slice height is %" PetscInt_FMT, sliceheight);
  PetscCall(MatSeqSELLCUDACopyToGPU(A));
  /* cudastruct may not be available until MatSeqSELLCUDACopyToGPU() is called */
  aval    = cudastruct->val;
  acolidx = cudastruct->colidx;
  sliidx  = cudastruct->sliidx;

  PetscCall(VecCUDAGetArrayRead(xx, &x));
  PetscCall(VecCUDAGetArrayWrite(yy, &y));
  PetscCall(PetscLogGpuTimeBegin());

  switch (cudastruct->kernelchoice) {
#if !defined(PETSC_USE_COMPLEX)
  case 9: /* one slice per block; blocky warps cooperate on a slice (default blocky=2 if unset/unsupported) */
    nblocks = 1 + (nrows - 1) / sliceheight;
    if (cudastruct->blocky == 2) {
      matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (cudastruct->blocky == 4) {
      matmult_seqsell_tiled_kernel9<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (cudastruct->blocky == 8) {
      matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (cudastruct->blocky == 16) {
      matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (cudastruct->blocky == 32) {
      matmult_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else {
      matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    }
    break;
  case 7: /* two slices per block; block y-dimension from blocky (default 2) */
    nblocks = 1 + (nrows - 1) / (2 * sliceheight);
    if (cudastruct->blocky == 2) {
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (cudastruct->blocky == 4) {
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (cudastruct->blocky == 8) {
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (cudastruct->blocky == 16) {
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (cudastruct->blocky == 32) {
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else {
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    }
    break;
#endif
  case 6:
    nblocks = 1 + (nrows - 1) / (blocksize / 32); /* 1 slice per block if blocksize=512 */
    matmult_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, acolidx, aval, sliidx, x, y);
    break;
  case 5:
    nblocks = 1 + (nrows - 1) / (blocksize / 16); /* 2 slices per block if blocksize=512*/
    matmult_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, acolidx, aval, sliidx, x, y);
    break;
  case 4:
    nblocks = 1 + (nrows - 1) / (blocksize / 8); /* 4 slices per block if blocksize=512 */
    matmult_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, acolidx, aval, sliidx, x, y);
    break;
  case 3:
    nblocks = 1 + (nrows - 1) / (blocksize / 4); /* 8 slices per block if blocksize=512 */
    matmult_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, acolidx, aval, sliidx, x, y);
    break;
  case 2: /* 16 slices per block if blocksize=512 */
    nblocks = 1 + (nrows - 1) / (blocksize / 2);
    matmult_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, acolidx, aval, sliidx, x, y);
    break;
  case 1: /* 32 slices per block if blocksize=512 */
    nblocks = 1 + (nrows - 1) / blocksize;
    matmult_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    break;
#if !defined(PETSC_USE_COMPLEX)
  case 0: /* auto-select a kernel from slice-width statistics gathered at assembly time */
    maxoveravg = a->maxslicewidth / a->avgslicewidth;
    if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* important threshold */
      /* highly irregular slice widths: chunk-based kernel8, each block handles approximately one slice */
      PetscInt blocky = a->chunksize / 32;
      nchunks         = cudastruct->totalchunks;
      chunksperblock  = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize;
      nblocks         = 1 + (nchunks - 1) / chunksperblock;
      chunk_slice_map = cudastruct->chunk_slice_map;
      if (blocky == 2) {
        matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
      } else if (blocky == 4) {
        matmult_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
      } else if (blocky == 8) {
        matmult_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
      } else if (blocky == 16) {
        matmult_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
      } else if (blocky == 32) {
        matmult_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
      } else {
        matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
      }
    } else {
      /* regular slice widths: pick a tiled kernel by average slice size (thresholds are empirical) */
      PetscInt avgslicesize = sliceheight * a->avgslicewidth;
      if (avgslicesize <= 432) {
        if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) {
          nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */
          matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
        } else {
          nblocks = 1 + (nrows - 1) / sliceheight;
          matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
        }
      } else if (avgslicesize <= 2400) {
        nblocks = 1 + (nrows - 1) / sliceheight;
        matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
      } else {
        nblocks = 1 + (nrows - 1) / sliceheight;
        matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
      }
    }
    break;
#endif
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported kernel choice %" PetscInt_FMT " for MatMult_SeqSELLCUDA.", cudastruct->kernelchoice);
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(VecCUDARestoreArrayRead(xx, &x));
  PetscCall(VecCUDARestoreArrayWrite(yy, &y));
  /* one multiply per stored entry plus one add per entry beyond the first in each nonempty row */
  PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
  PetscFunctionReturn(PETSC_SUCCESS);
}
707
/* z = y + A*x for a sequential SELL matrix on the GPU.

   Kernel selection mirrors MatMult_SeqSELLCUDA(); see the kernelchoice table there.
   Fixes applied:
   (1) the device pointers aval/acolidx/sliidx are now read AFTER
       MatSeqSELLCUDACopyToGPU(), which may reallocate the device arrays when the
       nonzero state changes -- caching them at declaration used stale pointers;
   (2) the unsupported-kernel error message previously named MatMult_SeqSELLCUDA. */
static PetscErrorCode MatMultAdd_SeqSELLCUDA(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_SeqSELL       *a          = (Mat_SeqSELL *)A->data;
  Mat_SeqSELLCUDA   *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
  PetscScalar       *z;
  const PetscScalar *y, *x;
  PetscInt           nrows = A->rmap->n, sliceheight = a->sliceheight;
  MatScalar         *aval;
  PetscInt          *acolidx;
  PetscInt          *sliidx;
#if !defined(PETSC_USE_COMPLEX)
  PetscReal maxoveravg;
  PetscInt  chunksperblock, nchunks, *chunk_slice_map;
  PetscInt  blocky = cudastruct->blocky;
#endif

  PetscFunctionBegin;
  /* the tiled kernels reduce within 32-lane warps, so the slice height must divide 32 */
  PetscCheck(32 % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height be a divisor of 32, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight);
  PetscCheck(!(cudastruct->kernelchoice >= 2 && cudastruct->kernelchoice <= 6 && sliceheight != SLICE_HEIGHT), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices {2-6} requires the slice height of the matrix be 16, but the current slice height is %" PetscInt_FMT, sliceheight);
  PetscCall(MatSeqSELLCUDACopyToGPU(A));
  /* the device arrays may be (re)allocated by MatSeqSELLCUDACopyToGPU(), so read the pointers only afterwards */
  aval    = cudastruct->val;
  acolidx = cudastruct->colidx;
  sliidx  = cudastruct->sliidx;
  if (a->nz) {
    PetscInt nblocks, blocksize = 512;
    dim3     block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32);
    PetscCall(VecCUDAGetArrayRead(xx, &x));
    PetscCall(VecCUDAGetArrayRead(yy, &y));
    PetscCall(VecCUDAGetArrayWrite(zz, &z));
    PetscCall(PetscLogGpuTimeBegin());

    switch (cudastruct->kernelchoice) {
#if !defined(PETSC_USE_COMPLEX)
    case 9: /* one slice per block; blocky warps cooperate on a slice (default blocky=2 if unset/unsupported) */
      nblocks = 1 + (nrows - 1) / sliceheight;
      if (blocky == 2) {
        matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 4) {
        matmultadd_seqsell_tiled_kernel9<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 8) {
        matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 16) {
        matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 32) {
        matmultadd_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else {
        matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      }
      break;
    case 8:
      /* chunk-based kernel: each block handles approximately one slice */
      nchunks = cudastruct->totalchunks;
      blocky  = a->chunksize / 32; /* block y-dimension is dictated by the chunk size here */
      chunksperblock  = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize;
      nblocks         = 1 + (nchunks - 1) / chunksperblock;
      chunk_slice_map = cudastruct->chunk_slice_map;
      if (blocky == 2) {
        matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 4) {
        matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 8) {
        matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 16) {
        matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 32) {
        matmultadd_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
      } else {
        matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
      }
      break;
    case 7: /* two slices per block; block y-dimension from blocky (default 2) */
      nblocks = 1 + (nrows - 1) / (2 * sliceheight);
      if (blocky == 2) {
        matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 4) {
        matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 8) {
        matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 16) {
        matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 32) {
        matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else {
        matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      }
      break;
#endif
    case 6:
      nblocks = 1 + (nrows - 1) / (blocksize / 32); /* 1 slice per block if blocksize=512 */
      matmultadd_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, acolidx, aval, sliidx, x, y, z);
      break;
    case 5:
      nblocks = 1 + (nrows - 1) / (blocksize / 16); /* 2 slices per block if blocksize=512 */
      matmultadd_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, acolidx, aval, sliidx, x, y, z);
      break;
    case 4:
      nblocks = 1 + (nrows - 1) / (blocksize / 8); /* 4 slices per block if blocksize=512 */
      matmultadd_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, acolidx, aval, sliidx, x, y, z);
      break;
    case 3:
      nblocks = 1 + (nrows - 1) / (blocksize / 4); /* 8 slices per block if blocksize=512 */
      matmultadd_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, acolidx, aval, sliidx, x, y, z);
      break;
    case 2:
      nblocks = 1 + (nrows - 1) / (blocksize / 2); /* 16 slices per block if blocksize=512 */
      matmultadd_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, acolidx, aval, sliidx, x, y, z);
      break;
    case 1:
      nblocks = 1 + (nrows - 1) / blocksize; /* 32 slices per block if blocksize=512 */
      matmultadd_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      break;
#if !defined(PETSC_USE_COMPLEX)
    case 0: /* auto-select a kernel from slice-width statistics gathered at assembly time */
      maxoveravg = a->maxslicewidth / a->avgslicewidth;
      if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* important threshold */
        /* highly irregular slice widths: chunk-based kernel8, each block handles approximately one slice */
        nchunks = cudastruct->totalchunks;
        blocky  = a->chunksize / 32;
        chunksperblock  = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize;
        nblocks         = 1 + (nchunks - 1) / chunksperblock;
        chunk_slice_map = cudastruct->chunk_slice_map;
        if (blocky == 2) {
          matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
        } else if (blocky == 4) {
          matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
        } else if (blocky == 8) {
          matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
        } else if (blocky == 16) {
          matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
        } else if (blocky == 32) {
          matmultadd_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
        } else {
          matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
        }
      } else {
        /* regular slice widths: pick a tiled kernel by average slice size (thresholds are empirical) */
        PetscInt avgslicesize = sliceheight * a->avgslicewidth;
        if (avgslicesize <= 432) {
          if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) {
            nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */
            matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
          } else {
            nblocks = 1 + (nrows - 1) / sliceheight;
            matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
          }
        } else if (avgslicesize <= 2400) {
          nblocks = 1 + (nrows - 1) / sliceheight;
          matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
        } else {
          nblocks = 1 + (nrows - 1) / sliceheight;
          matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
        }
      }
      break;
#endif
    default:
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported kernel choice %" PetscInt_FMT " for MatMultAdd_SeqSELLCUDA.", cudastruct->kernelchoice);
    }
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(VecCUDARestoreArrayRead(xx, &x));
    PetscCall(VecCUDARestoreArrayRead(yy, &y));
    PetscCall(VecCUDARestoreArrayWrite(zz, &z));
    PetscCall(PetscLogGpuFlops(2.0 * a->nz));
  } else {
    /* empty matrix: z = y + 0*x */
    PetscCall(VecCopy(yy, zz));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
872
/* Process the SeqSELLCUDA runtime options:
     -mat_sell_spmv_cuda_blocky          block y-dimension for the tiled SpMV kernels (2,4,8,16,32)
     -mat_sell_spmv_cuda_kernel          SpMV kernel choice in [0,9]
     -mat_sell_spmv_cuda_chunksperblock  chunks per block (only consulted for kernel 8) */
static PetscErrorCode MatSetFromOptions_SeqSELLCUDA(Mat A, PetscOptionItems PetscOptionsObject)
{
  Mat_SeqSELLCUDA *cuda = (Mat_SeqSELLCUDA *)A->spptr;
  PetscInt         kchoice, ydim;
  PetscBool        set;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqSELLCUDA options");
  PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_blocky", &ydim, &set));
  if (set) {
    /* only the template instantiations {2,4,8,16,32} exist */
    PetscCheck(ydim == 2 || ydim == 4 || ydim == 8 || ydim == 16 || ydim == 32, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Unsupported blocky: %" PetscInt_FMT " it should be in {2,4,8,16,32}", ydim);
    cuda->blocky = ydim;
  }
  PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_kernel", &kchoice, &set));
  if (set) {
    PetscCheck(kchoice >= 0 && kchoice <= 9, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Wrong kernel choice: %" PetscInt_FMT " it should be in [0,9]", kchoice);
    cuda->kernelchoice = kchoice;
    /* chunksperblock is meaningful only for the chunk-based kernel */
    if (kchoice == 8) PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_chunksperblock", &cuda->chunksperblock, &set));
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}
895
/* Cache the slice statistics (average/maximum slice width, fill ratio) on the
   Mat_SeqSELL data; the SpMV kernel-selection heuristics consult these values. */
PETSC_INTERN PetscErrorCode MatAssemblyEnd_SpMV_Preprocessing_Private(Mat A)
{
  Mat_SeqSELL *sell = (Mat_SeqSELL *)A->data;

  PetscFunctionBegin;
  PetscCall(MatSeqSELLGetAvgSliceWidth(A, &sell->avgslicewidth));
  PetscCall(MatSeqSELLGetMaxSliceWidth(A, &sell->maxslicewidth));
  PetscCall(MatSeqSELLGetFillRatio(A, &sell->fillratio));
  PetscFunctionReturn(PETSC_SUCCESS);
}
906
/* Finish assembly for a SeqSELLCUDA matrix: run the host SELL assembly, refresh
   the SpMV statistics, and -- for a final (non-flush) assembly of an unfactored
   matrix -- push the data to the GPU and install the CUDA SpMV operations. */
static PetscErrorCode MatAssemblyEnd_SeqSELLCUDA(Mat A, MatAssemblyType mode)
{
  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqSELL(A, mode));
  PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(A));
  if (mode != MAT_FLUSH_ASSEMBLY) {
    if (A->factortype == MAT_FACTOR_NONE) PetscCall(MatSeqSELLCUDACopyToGPU(A));
    A->ops->mult    = MatMult_SeqSELLCUDA;
    A->ops->multadd = MatMultAdd_SeqSELLCUDA;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
918
/* Zero all stored entries of a SeqSELLCUDA matrix on both the host and, when a
   device copy exists, on the GPU; the offload mask records which copies are valid. */
static PetscErrorCode MatZeroEntries_SeqSELLCUDA(Mat A)
{
  Mat_SeqSELL *sell             = (Mat_SeqSELL *)A->data;
  PetscBool    zeroed_on_device = PETSC_FALSE;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqSELLCUDA *cuda = (Mat_SeqSELLCUDA *)A->spptr;
    if (cuda->val) {
      /* byte-wise memset is valid here because the fill value is zero */
      PetscCallCUDA(cudaMemset(cuda->val, 0, sell->sliidx[sell->totalslices] * sizeof(*cuda->val)));
      zeroed_on_device = PETSC_TRUE;
    }
  }
  PetscCall(PetscArrayzero(sell->val, sell->sliidx[sell->totalslices]));
  /* both copies are in sync only if the device array was zeroed too */
  A->offloadmask = zeroed_on_device ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
937
/* Destroy a SeqSELLCUDA matrix: free the device-side mirror first (it exists only
   for unfactored matrices whose data has been allocated on the GPU), then the
   host-side SELL data. */
static PetscErrorCode MatDestroy_SeqSELLCUDA(Mat A)
{
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE && A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) PetscCall(MatSeqSELLCUDA_Destroy((Mat_SeqSELLCUDA **)&A->spptr));
  PetscCall(MatDestroy_SeqSELL(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
945
946 PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat);
/* Duplicate a SeqSELLCUDA matrix: duplicate the host SELL data, then convert the
   copy so it carries the CUDA ops and device structures as well. */
static PetscErrorCode MatDuplicate_SeqSELLCUDA(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqSELL(A, cpvalues, B));
  PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(*B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
954
/* In-place conversion of a SeqSELL matrix to SeqSELLCUDA: switch the default
   vector type to CUDA, attach the (empty) device-side structure, install the
   CUDA method table, and refresh the SpMV statistics. The actual device copy is
   deferred until first use (offload mask starts as UNALLOCATED). */
PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat B)
{
  PetscFunctionBegin;
  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

  /* attach the device-side structure once, and only for unfactored matrices */
  if (!B->spptr && B->factortype == MAT_FACTOR_NONE) {
    Mat_SeqSELLCUDA *cuda;
    PetscCall(PetscNew(&cuda));
    B->spptr = cuda;
  }

  B->ops->assemblyend    = MatAssemblyEnd_SeqSELLCUDA;
  B->ops->destroy        = MatDestroy_SeqSELLCUDA;
  B->ops->setfromoptions = MatSetFromOptions_SeqSELLCUDA;
  B->ops->mult           = MatMult_SeqSELLCUDA;
  B->ops->multadd        = MatMultAdd_SeqSELLCUDA;
  B->ops->duplicate      = MatDuplicate_SeqSELLCUDA;
  B->ops->zeroentries    = MatZeroEntries_SeqSELLCUDA;

  /* No need to assemble SeqSELL, but need to do the preprocessing for SpMV */
  PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(B));

  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELLCUDA));
  B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  PetscFunctionReturn(PETSC_SUCCESS);
}
985
986 /*MC
987 MATSEQSELLCUDA - MATSELLCUDA = "(seq)sellcuda" - A matrix type to be used for sparse matrices on NVIDIA GPUs.
988
989 Options Database Keys:
990 + -mat_type seqsellcuda - sets the matrix type to "seqsellcuda" during a call to `MatSetFromOptions()`
991 . -mat_sell_spmv_cuda_kernel - selects a spmv kernel for MatSELLCUDA
992 - -mat_sell_spmv_cuda_blocky - sets the y dimension of the block size of the spmv kernels. These kernels use a 2D block with the x dimension being 32
993
994 Level: beginner
995
996 .seealso: [](ch_matrices), `Mat`, `MATSELLCUDA`
997 M*/
998
/* Constructor registered for MATSEQSELLCUDA: build a host SeqSELL matrix, convert
   it in place to the CUDA type, then apply any runtime options. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqSELLCUDA(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqSELL(B));
  PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(B));
  PetscCall(MatSetFromOptions(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1007