#include <hip/hip_runtime.h>

#include <petscdevice_hip.h>
#include <petsc/private/cupmatomics.hpp>
#include <../src/mat/impls/sell/seq/sell.h> /*I "petscmat.h" I*/

#define WARP_SIZE 64

typedef struct {
  PetscInt   maxallocmat;
  PetscInt   totalentries;
  PetscInt  *colidx; /* column index array, device pointer */
  MatScalar *val;    /* value array, device pointer */
  PetscInt   totalslices;
  PetscInt  *sliidx; /* slice index array, device pointer */
  PetscInt   nonzerostate;
  PetscInt   kernelchoice;
  PetscInt   blocky;
  PetscInt   chunksperblock;
  PetscInt   totalchunks;
  PetscInt  *chunk_slice_map; /* starting slice for each chunk, device pointer */
} Mat_SeqSELLHIP;

static PetscErrorCode MatSeqSELLHIP_Destroy(Mat_SeqSELLHIP **hipstruct)
{
  PetscFunctionBegin;
  if (*hipstruct) {
    if ((*hipstruct)->colidx) PetscCallHIP(hipFree((*hipstruct)->colidx));
    if ((*hipstruct)->val) PetscCallHIP(hipFree((*hipstruct)->val));
    if ((*hipstruct)->sliidx) PetscCallHIP(hipFree((*hipstruct)->sliidx));
    if ((*hipstruct)->chunk_slice_map) PetscCallHIP(hipFree((*hipstruct)->chunk_slice_map));
    PetscCall(PetscFree(*hipstruct));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqSELLHIPCopyToGPU(Mat A)
{
  Mat_SeqSELLHIP *hipstruct = (Mat_SeqSELLHIP *)A->spptr;
  Mat_SeqSELL    *a         = (Mat_SeqSELL *)A->data;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    PetscCall(PetscLogEventBegin(MAT_HIPCopyToGPU, A, 0, 0, 0));
    if (A->assembled && A->nonzerostate == hipstruct->nonzerostate) {
      /* copy values only */
      PetscCallHIP(hipMemcpy(hipstruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), hipMemcpyHostToDevice));
      PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * sizeof(MatScalar)));
    } else {
      if (hipstruct->colidx) PetscCallHIP(hipFree(hipstruct->colidx));
      if (hipstruct->val) PetscCallHIP(hipFree(hipstruct->val));
      if (hipstruct->sliidx) PetscCallHIP(hipFree(hipstruct->sliidx));
      if (hipstruct->chunk_slice_map) PetscCallHIP(hipFree(hipstruct->chunk_slice_map));
      hipstruct->maxallocmat  = a->maxallocmat;
      hipstruct->totalentries = a->sliidx[a->totalslices];
      hipstruct->totalslices  = a->totalslices;
      hipstruct->totalchunks  = a->totalchunks;
      PetscCallHIP(hipMalloc((void **)&hipstruct->colidx, a->maxallocmat * sizeof(*hipstruct->colidx)));
      PetscCallHIP(hipMalloc((void **)&hipstruct->val, a->maxallocmat * sizeof(*hipstruct->val)));
      /* copy values, nz or maxallocmat? */
      PetscCallHIP(hipMemcpy(hipstruct->colidx, a->colidx, a->sliidx[a->totalslices] * sizeof(*a->colidx), hipMemcpyHostToDevice));
      PetscCallHIP(hipMemcpy(hipstruct->val, a->val, a->sliidx[a->totalslices] * sizeof(*a->val), hipMemcpyHostToDevice));

      PetscCallHIP(hipMalloc((void **)&hipstruct->sliidx, (a->totalslices + 1) * sizeof(*hipstruct->sliidx)));
      PetscCallHIP(hipMemcpy(hipstruct->sliidx, a->sliidx, (a->totalslices + 1) * sizeof(*a->sliidx), hipMemcpyHostToDevice));
      PetscCallHIP(hipMalloc((void **)&hipstruct->chunk_slice_map, a->totalchunks * sizeof(*hipstruct->chunk_slice_map)));
      PetscCallHIP(hipMemcpy(hipstruct->chunk_slice_map, a->chunk_slice_map, a->totalchunks * sizeof(*a->chunk_slice_map), hipMemcpyHostToDevice));
      PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar) + sizeof(PetscInt)) + (a->totalslices + 1 + a->totalchunks) * sizeof(PetscInt)));
    }
    PetscCallHIP(WaitForHIP());
    PetscCall(PetscLogEventEnd(MAT_HIPCopyToGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

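/*
  Storage layout assumed by all kernels below: SELL (sliced ELLPACK) groups rows
  into slices of `sliceheight` rows. Within a slice the (padded) nonzeros are
  stored column-major, so entry j of the row at position k within slice s lives
  at index sliidx[s] + j * sliceheight + k, and sliidx[s + 1] marks the end of
  the slice. For example, with sliceheight = 2 and rows 0 and 1 holding 3 and 2
  nonzeros, the slice is padded to width 3 and stored as
    val = [a00 a10 a01 a11 a02 pad],
  which is why a thread assigned to row_in_slice walks its slice with stride
  sliceheight in the basic kernels.
*/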
static __global__ void matmult_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  PetscInt  i, row, slice_id, row_in_slice;
  MatScalar sum;
  /* one thread per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / sliceheight;
    row_in_slice = row % sliceheight;
    sum          = 0.0;
    for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]];
    y[row] = sum;
  }
}

static __global__ void matmultadd_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  PetscInt  i, row, slice_id, row_in_slice;
  MatScalar sum;
  /* one thread per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / sliceheight;
    row_in_slice = row % sliceheight;
    sum          = 0.0;
    for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]];
    z[row] = y[row] + sum;
  }
}

#if !defined(PETSC_USE_COMPLEX)
PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wpass-failed")
/* use 1 block per slice, suitable for large slice width */
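/*
  Reduction strategy of kernel 9: the block is shaped (WARP_SIZE, BLOCKY) and
  owns one slice. Every thread accumulates a partial sum with stride
  WARP_SIZE * BLOCKY, then a warp-level __shfl_down folds each warp onto its
  first `sliceheight` lanes (lanes sharing row = lane % sliceheight are
  combined). The BLOCKY per-warp partials of each row are transposed through
  shared memory into consecutive lanes, a second __shfl_down of width BLOCKY
  finishes the row sums, and threadIdx.y == 0 writes them out.
*/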
template <int BLOCKY>
__global__ void matmult_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[WARP_SIZE][BLOCKY];
  PetscInt             i, row, slice_id = blockIdx.x;
  int                  tid = threadIdx.x + threadIdx.y * WARP_SIZE;
  /* transposed index */
  int         tidx = tid % BLOCKY;
  int         tidy = tid / BLOCKY;
  PetscScalar t = 0.0;

  row = slice_id * sliceheight + threadIdx.x % sliceheight;
  if (row < nrows) {
    for (i = sliidx[slice_id] + threadIdx.x + WARP_SIZE * threadIdx.y; i < sliidx[slice_id + 1]; i += WARP_SIZE * BLOCKY) t += aval[i] * x[acolidx[i]];
  }
#pragma unroll
  for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) t += __shfl_down(t, offset);
  /* transpose layout to reduce each row using warp shfl */
  if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t;
  __syncthreads();
  if (tidy < sliceheight) t = shared[tidy][tidx];
#pragma unroll
  for (int offset = BLOCKY / 2; offset > 0; offset /= 2) t += __shfl_down(t, offset, BLOCKY);
  if (tidx == 0 && tidy < sliceheight) shared[0][tidy] = t;
  __syncthreads();
  if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) y[row] = shared[0][threadIdx.x];
}

/* use 1 block per slice, suitable for large slice width */
template <int BLOCKY>
__global__ void matmultadd_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[WARP_SIZE][BLOCKY];
  PetscInt             i, row, slice_id = blockIdx.x;
  int                  tid = threadIdx.x + threadIdx.y * WARP_SIZE;
  /* transposed index */
  int         tidx = tid % BLOCKY;
  int         tidy = tid / BLOCKY;
  PetscScalar t = 0.0;

  row = slice_id * sliceheight + threadIdx.x % sliceheight;
  if (row < nrows) {
    for (i = sliidx[slice_id] + threadIdx.x + WARP_SIZE * threadIdx.y; i < sliidx[slice_id + 1]; i += WARP_SIZE * BLOCKY) t += aval[i] * x[acolidx[i]];
  }
#pragma unroll
  for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) t += __shfl_down(t, offset);
  /* transpose layout to reduce each row using warp shfl */
  if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t;
  __syncthreads();
  if (tidy < sliceheight) t = shared[tidy][tidx];
#pragma unroll
  for (int offset = BLOCKY / 2; offset > 0; offset /= 2) t += __shfl_down(t, offset, BLOCKY);
  if (tidx == 0 && tidy < sliceheight) shared[0][tidy] = t;
  __syncthreads();
  if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) z[row] = y[row] + shared[0][threadIdx.x];
}

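/*
  Segmented sum across the 2*BLOCKY half-warps of a block. flag[] holds the
  slice id owned by each half-warp; consecutive half-warps with equal flags
  form a segment. After the scan the first half-warp of each segment (the
  "head", for which true is returned) holds in *val the sum over its whole
  segment, so only head threads need to write their result back.
*/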
template <int BLOCKY>
__device__ __forceinline__ static bool segment_scan(PetscInt flag[], MatScalar shared[], PetscScalar *val)
{
  bool head = true;
#pragma unroll
  for (int i = 1; i < BLOCKY * 2; i <<= 1) {
    int halfwarpid = threadIdx.y * 2 + threadIdx.x / (WARP_SIZE / 2);
    shared[threadIdx.x + threadIdx.y * WARP_SIZE] = 0;
    if (halfwarpid >= i && flag[halfwarpid - i] == flag[halfwarpid]) {
      shared[threadIdx.x + threadIdx.y * WARP_SIZE] = *val;
      if (i == 1) head = false;
    }
    __syncthreads();
    /* read from the half-warp i positions ahead; a half-warp spans WARP_SIZE/2 threads */
    if (halfwarpid < BLOCKY * 2 - i) *val += shared[threadIdx.x + threadIdx.y * WARP_SIZE + i * (WARP_SIZE / 2)];
    __syncthreads();
  }
  return head;
}

/* load-balancing version. Chunksize is equal to the number of threads per block */
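/*
  Kernel 8 partitions the nonzeros, not the rows: each chunk is one block-sized
  window of BLOCKY*WARP_SIZE consecutive entries, and chunk_slice_map[] records
  the first slice a chunk touches, so the work per block is even regardless of
  slice widths. A chunk that stays inside one slice is reduced as in kernel 9;
  a chunk that straddles slices falls back to the segmented scan above. Results
  are combined with atomic adds because a wide slice may be shared by several
  chunks, and hence by several blocks.
*/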
template <int BLOCKY>
__global__ void matmult_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunksperblock, PetscInt totalchunks, const PetscInt *chunk_slice_map, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[BLOCKY * WARP_SIZE];
  PetscInt             gid, row, start_slice, cid;
  PetscScalar          t = 0.0;
  AtomicAdd<MatScalar> atomAdd;
  /* zero out y */
  for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * WARP_SIZE * BLOCKY); iter++) {
    gid = gridDim.x * WARP_SIZE * BLOCKY * iter + blockIdx.x * BLOCKY * WARP_SIZE + threadIdx.y * WARP_SIZE + threadIdx.x;
    if (gid < nrows) y[gid] = 0.0;
  }
  for (int iter = 0; iter < chunksperblock; iter++) {
    cid = blockIdx.x * chunksperblock + iter; /* chunk id */
    if (cid < totalchunks) {
      start_slice = chunk_slice_map[cid]; /* starting slice at each iteration */
      gid         = cid * BLOCKY * WARP_SIZE + threadIdx.y * WARP_SIZE + threadIdx.x;
      if ((cid + 1) * BLOCKY * WARP_SIZE > sliidx[start_slice + 1]) { /* this iteration covers more than one slice */
        __shared__ PetscInt flag[BLOCKY * 2];
        bool                write;
        PetscInt            slice_id = start_slice, totalslices = PetscCeilIntMacro(nrows, sliceheight), totalentries = sliidx[totalslices];
        /* find out the slice that this element belongs to */
        while (gid < totalentries && gid >= sliidx[slice_id + 1]) slice_id++;
        if (threadIdx.x % (WARP_SIZE / 2) == 0) flag[threadIdx.y * 2 + threadIdx.x / (WARP_SIZE / 2)] = slice_id;
        row = slice_id * sliceheight + threadIdx.x % sliceheight;
        if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]];
        __syncthreads();
        write = segment_scan<BLOCKY>(flag, shared, &t);
        if (row < nrows && gid < totalentries && write) atomAdd(y[row], t);
        t = 0.0;
      } else { /* this iteration covers only one slice */
        row = start_slice * sliceheight + threadIdx.x % sliceheight;
        if (row < nrows) t += aval[gid] * x[acolidx[gid]];
        if (iter == chunksperblock - 1 || (cid + 2) * BLOCKY * WARP_SIZE > sliidx[start_slice + 1]) { /* last iteration or next iteration covers more than one slice */
          int tid = threadIdx.x + threadIdx.y * WARP_SIZE, tidx = tid % BLOCKY, tidy = tid / BLOCKY;
          /* reduction and write to output vector */
#pragma unroll
          for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) t += __shfl_down(t, offset);
          /* transpose layout to reduce each row using warp shfl */
          if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx.x][threadIdx.y] = t */
          __syncthreads();
          if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */
#pragma unroll
          for (int offset = BLOCKY / 2; offset > 0; offset /= 2) t += __shfl_down(t, offset, BLOCKY);
          if (tidx == 0 && tidy < sliceheight) shared[tidy] = t; /* shared[0][tidy] = t */
          __syncthreads();
          if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomAdd(y[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */
          t = 0.0;
        }
      }
    }
  }
}

/* load-balancing version. Chunksize is equal to the number of threads per block */
template <int BLOCKY>
__global__ void matmultadd_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunksperblock, PetscInt totalchunks, const PetscInt *chunk_slice_map, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[BLOCKY * WARP_SIZE];
  PetscInt             gid, row, start_slice, cid;
  PetscScalar          t = 0.0;
  AtomicAdd<MatScalar> atomAdd;
  /* copy y to z */
  for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * WARP_SIZE * BLOCKY); iter++) {
    gid = gridDim.x * WARP_SIZE * BLOCKY * iter + blockIdx.x * BLOCKY * WARP_SIZE + threadIdx.y * WARP_SIZE + threadIdx.x;
    if (gid < nrows) z[gid] = y[gid];
  }
  for (int iter = 0; iter < chunksperblock; iter++) {
    cid = blockIdx.x * chunksperblock + iter; /* chunk id */
    if (cid < totalchunks) {
      start_slice = chunk_slice_map[cid]; /* starting slice at each iteration */
      gid         = cid * BLOCKY * WARP_SIZE + threadIdx.y * WARP_SIZE + threadIdx.x;
      if ((cid + 1) * BLOCKY * WARP_SIZE > sliidx[start_slice + 1]) { /* this iteration covers more than one slice */
        __shared__ PetscInt flag[BLOCKY * 2];
        bool                write;
        PetscInt            slice_id = start_slice, totalslices = PetscCeilIntMacro(nrows, sliceheight), totalentries = sliidx[totalslices];
        /* find out the slice that this element belongs to */
        while (gid < totalentries && gid >= sliidx[slice_id + 1]) slice_id++;
        if (threadIdx.x % (WARP_SIZE / 2) == 0) flag[threadIdx.y * 2 + threadIdx.x / (WARP_SIZE / 2)] = slice_id;
        row = slice_id * sliceheight + threadIdx.x % sliceheight;
        if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]];
        __syncthreads();
        write = segment_scan<BLOCKY>(flag, shared, &t);
        if (row < nrows && gid < totalentries && write) atomAdd(z[row], t);
        t = 0.0;
      } else { /* this iteration covers only one slice */
        row = start_slice * sliceheight + threadIdx.x % sliceheight;
        if (row < nrows) t += aval[gid] * x[acolidx[gid]];
        if (iter == chunksperblock - 1 || (cid + 2) * BLOCKY * WARP_SIZE > sliidx[start_slice + 1]) { /* last iteration or next iteration covers more than one slice */
          int tid = threadIdx.x + threadIdx.y * WARP_SIZE, tidx = tid % BLOCKY, tidy = tid / BLOCKY;
          /* reduction and write to output vector */
#pragma unroll
          for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) t += __shfl_down(t, offset);
          /* transpose layout to reduce each row using warp shfl */
          if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx.x][threadIdx.y] = t */
          __syncthreads();
          if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */
#pragma unroll
          for (int offset = BLOCKY / 2; offset > 0; offset /= 2) t += __shfl_down(t, offset, BLOCKY);
          if (tidx == 0 && tidy < sliceheight) shared[tidy] = t; /* shared[0][tidy] = t */
          __syncthreads();
          if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomAdd(z[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */
          t = 0.0;
        }
      }
    }
  }
}

/* use 1 warp per slice, suitable for small slice width */
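/*
  Kernel 7 shapes the block as (WARP_SIZE, blocky): each warp owns one slice
  and strides through it WARP_SIZE entries at a time, then a single __shfl_down
  sweep folds the warp onto its first `sliceheight` lanes, which write the rows
  directly; no shared memory or atomics are needed.
*/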
static __global__ void matmult_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  PetscInt i, row, slice_id;
  slice_id = blockIdx.x * blockDim.y + threadIdx.y;
  row      = slice_id * sliceheight + threadIdx.x % sliceheight;
  PetscScalar t = 0.0;
  if (row < nrows) {
    for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += WARP_SIZE) t += aval[i] * x[acolidx[i]];
  }
#pragma unroll
  for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) t += __shfl_down(t, offset);
  if (row < nrows && threadIdx.x < sliceheight) y[row] = t;
}

/* use 1 warp per slice, suitable for small slice width */
static __global__ void matmultadd_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  PetscInt i, row, slice_id;
  slice_id = blockIdx.x * blockDim.y + threadIdx.y;
  row      = slice_id * sliceheight + threadIdx.x % sliceheight;
  PetscScalar t = 0.0;
  if (row < nrows) {
    for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += WARP_SIZE) t += aval[i] * x[acolidx[i]];
  }
#pragma unroll
  for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) t += __shfl_down(t, offset);
  if (row < nrows && threadIdx.x < sliceheight) z[row] = y[row] + t;
}
PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()
#endif

/*********** Kernels 2-6 require a slice height smaller than 512, 256, 128, 64, 32, respectively. They are kept only for performance comparison **********/
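/*
  Kernels 2-6 assign one row per thread in the x dimension, with blockDim.y =
  2, 4, 8, 16, or 32 threads cooperating on each row (blockDim.x * blockDim.y
  is always 512). Each y layer accumulates a strided partial sum in shared
  memory, and the layers are then folded pairwise, with __syncthreads() between
  steps, until layer 0 holds the row total.
*/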

static __global__ void matmult_seqsell_tiled_kernel6(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[32 * 16];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / sliceheight;
    row_in_slice = row % sliceheight;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 16) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 8) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}

static __global__ void matmult_seqsell_tiled_kernel5(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[32 * 16];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / sliceheight;
    row_in_slice = row % sliceheight;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 8) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}

static __global__ void matmult_seqsell_tiled_kernel4(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[32 * 16];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / sliceheight;
    row_in_slice = row % sliceheight;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}

static __global__ void matmult_seqsell_tiled_kernel3(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[32 * 16];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / sliceheight;
    row_in_slice = row % sliceheight;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}

static __global__ void matmult_seqsell_tiled_kernel2(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[32 * 16];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / sliceheight;
    row_in_slice = row % sliceheight;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}

static __global__ void matmultadd_seqsell_tiled_kernel6(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[32 * 16];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / sliceheight;
    row_in_slice = row % sliceheight;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 16) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 8) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}

static __global__ void matmultadd_seqsell_tiled_kernel5(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[32 * 16];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / sliceheight;
    row_in_slice = row % sliceheight;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 8) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}

static __global__ void matmultadd_seqsell_tiled_kernel4(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[32 * 16];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / sliceheight;
    row_in_slice = row % sliceheight;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}

static __global__ void matmultadd_seqsell_tiled_kernel3(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[32 * 16];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / sliceheight;
    row_in_slice = row % sliceheight;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}

static __global__ void matmultadd_seqsell_tiled_kernel2(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[32 * 16];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / sliceheight;
    row_in_slice = row % sliceheight;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}

static PetscErrorCode MatMult_SeqSELLHIP(Mat A, Vec xx, Vec yy)
{
  Mat_SeqSELL       *a         = (Mat_SeqSELL *)A->data;
  Mat_SeqSELLHIP    *hipstruct = (Mat_SeqSELLHIP *)A->spptr;
  PetscScalar       *y;
  const PetscScalar *x;
  PetscInt           nrows = A->rmap->n, sliceheight = a->sliceheight;
  MatScalar         *aval;
  PetscInt          *acolidx;
  PetscInt          *sliidx;
  PetscInt           nblocks, blocksize = 512; /* blocksize is fixed to be 512 */
  dim3               block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32);
#if !defined(PETSC_USE_COMPLEX)
  PetscInt  chunksperblock, nchunks, *chunk_slice_map;
  PetscReal maxoveravg;
#endif

  PetscFunctionBegin;
  PetscCheck(WARP_SIZE % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernels require the slice height to be a divisor of WARP_SIZE, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight);
  PetscCheck(!(hipstruct->kernelchoice >= 2 && hipstruct->kernelchoice <= 6 && sliceheight > 32), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices {2-6} require the slice height of the matrix to be no greater than 32, but the current slice height is %" PetscInt_FMT, sliceheight);
  PetscCall(MatSeqSELLHIPCopyToGPU(A));
  /* hipstruct may not be available until MatSeqSELLHIPCopyToGPU() is called */
  aval    = hipstruct->val;
  acolidx = hipstruct->colidx;
  sliidx  = hipstruct->sliidx;

  PetscCall(VecHIPGetArrayRead(xx, &x));
  PetscCall(VecHIPGetArrayWrite(yy, &y));
  PetscCall(PetscLogGpuTimeBegin());

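  /* kernelchoice 0 (the default) picks a kernel heuristically: a large
     max/avg slice-width ratio signals load imbalance, which the chunk-based
     kernel 8 handles best; otherwise kernel 7 or a blocky variant of kernel 9
     is chosen from the average slice size. A nonzero kernelchoice forces the
     corresponding kernel. */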
  switch (hipstruct->kernelchoice) {
#if !defined(PETSC_USE_COMPLEX)
  case 9: /* 1 slice per block */
    nblocks = 1 + (nrows - 1) / sliceheight;
    if (hipstruct->blocky == 2) {
      matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (hipstruct->blocky == 4) {
      matmult_seqsell_tiled_kernel9<4><<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (hipstruct->blocky == 8) {
      matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (hipstruct->blocky == 16) {
      matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else {
      matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    }
    break;
  case 7: /* each block handles blocky slices */
    nblocks = 1 + (nrows - 1) / (hipstruct->blocky * sliceheight);
    if (hipstruct->blocky == 2) {
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (hipstruct->blocky == 4) {
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (hipstruct->blocky == 8) {
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (hipstruct->blocky == 16) {
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else {
      nblocks = 1 + (nrows - 1) / (2 * sliceheight);
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    }
    break;
#endif
  case 6:
    nblocks = 1 + (nrows - 1) / (blocksize / 32); /* 1 slice per block if sliceheight=32 */
    matmult_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    break;
  case 5:
    nblocks = 1 + (nrows - 1) / (blocksize / 16); /* 2 slices per block if sliceheight=32 */
    matmult_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    break;
  case 4:
    nblocks = 1 + (nrows - 1) / (blocksize / 8); /* 4 slices per block if sliceheight=32 */
    matmult_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    break;
  case 3:
    nblocks = 1 + (nrows - 1) / (blocksize / 4); /* 8 slices per block if sliceheight=32 */
    matmult_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    break;
  case 2: /* 16 slices per block if sliceheight=32 */
    nblocks = 1 + (nrows - 1) / (blocksize / 2);
    matmult_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    break;
  case 1: /* 32 slices per block if sliceheight=32 */
    nblocks = 1 + (nrows - 1) / blocksize;
    matmult_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    break;
#if !defined(PETSC_USE_COMPLEX)
  case 0:
    maxoveravg = a->maxslicewidth / a->avgslicewidth;
    if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* important threshold */
      /* each block handles approximately one slice */
      PetscInt blocky = a->chunksize / 32;
      nchunks         = hipstruct->totalchunks;
      chunksperblock  = hipstruct->chunksperblock ? hipstruct->chunksperblock : 1 + (hipstruct->totalentries / hipstruct->totalslices - 1) / a->chunksize;
      nblocks         = 1 + (nchunks - 1) / chunksperblock;
      chunk_slice_map = hipstruct->chunk_slice_map;
      if (blocky == 2) {
        matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
      } else if (blocky == 4) {
        matmult_seqsell_tiled_kernel8<4><<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
      } else if (blocky == 8) {
        matmult_seqsell_tiled_kernel8<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
      } else if (blocky == 16) {
        matmult_seqsell_tiled_kernel8<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
      } else {
        matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
      }
    } else {
      PetscInt avgslicesize = sliceheight * a->avgslicewidth;
      if (avgslicesize <= 432) {
        if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) {
          nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */
          matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
        } else {
          nblocks = 1 + (nrows - 1) / sliceheight;
          matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
        }
      } else if (avgslicesize <= 2400) {
        nblocks = 1 + (nrows - 1) / sliceheight;
        matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
      } else {
        nblocks = 1 + (nrows - 1) / sliceheight;
        matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
      }
    }
    break;
#endif
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported kernel choice %" PetscInt_FMT " for MatMult_SeqSELLHIP.", hipstruct->kernelchoice);
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(VecHIPRestoreArrayRead(xx, &x));
  PetscCall(VecHIPRestoreArrayWrite(yy, &y));
  PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatMultAdd_SeqSELLHIP(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_SeqSELL       *a         = (Mat_SeqSELL *)A->data;
  Mat_SeqSELLHIP    *hipstruct = (Mat_SeqSELLHIP *)A->spptr;
  PetscScalar       *z;
  const PetscScalar *y, *x;
  PetscInt           nrows = A->rmap->n, sliceheight = a->sliceheight;
  MatScalar         *aval    = hipstruct->val;
  PetscInt          *acolidx = hipstruct->colidx;
  PetscInt          *sliidx  = hipstruct->sliidx;
#if !defined(PETSC_USE_COMPLEX)
  PetscReal maxoveravg;
  PetscInt  chunksperblock, nchunks, *chunk_slice_map;
  PetscInt  blocky = hipstruct->blocky;
#endif

  PetscFunctionBegin;
  PetscCheck(WARP_SIZE % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernels require the slice height to be a divisor of WARP_SIZE, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight);
  PetscCheck(!(hipstruct->kernelchoice >= 2 && hipstruct->kernelchoice <= 6 && sliceheight > 32), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices {2-6} require the slice height of the matrix to be no greater than 32, but the current slice height is %" PetscInt_FMT, sliceheight);
  PetscCall(MatSeqSELLHIPCopyToGPU(A));
  if (a->nz) {
    PetscInt nblocks, blocksize = 512;
    dim3     block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32);
    PetscCall(VecHIPGetArrayRead(xx, &x));
    PetscCall(VecHIPGetArrayRead(yy, &y));
    PetscCall(VecHIPGetArrayWrite(zz, &z));
    PetscCall(PetscLogGpuTimeBegin());

    switch (hipstruct->kernelchoice) {
#if !defined(PETSC_USE_COMPLEX)
    case 9:
      nblocks = 1 + (nrows - 1) / sliceheight;
      if (blocky == 2) {
        matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 4) {
        matmultadd_seqsell_tiled_kernel9<4><<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 8) {
        matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 16) {
        matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else {
        matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      }
      break;
    case 8:
      /* each block handles approximately one slice */
      nchunks         = hipstruct->totalchunks;
      blocky          = a->chunksize / 32;
      chunksperblock  = hipstruct->chunksperblock ? hipstruct->chunksperblock : 1 + (hipstruct->totalentries / hipstruct->totalslices - 1) / a->chunksize;
      nblocks         = 1 + (nchunks - 1) / chunksperblock;
      chunk_slice_map = hipstruct->chunk_slice_map;
      if (blocky == 2) {
        matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 4) {
        matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 8) {
        matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 16) {
        matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
      } else {
        matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
      }
      break;
    case 7:
      nblocks = 1 + (nrows - 1) / (blocky * sliceheight);
      if (blocky == 2) {
        matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 4) {
        matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 8) {
        matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 16) {
        matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else {
        nblocks = 1 + (nrows - 1) / (2 * sliceheight);
        matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      }
      break;
#endif
    case 6:
      nblocks = 1 + (nrows - 1) / (blocksize / 32);
      matmultadd_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      break;
    case 5:
      nblocks = 1 + (nrows - 1) / (blocksize / 16);
      matmultadd_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      break;
    case 4:
      nblocks = 1 + (nrows - 1) / (blocksize / 8);
      matmultadd_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      break;
    case 3:
      nblocks = 1 + (nrows - 1) / (blocksize / 4);
      matmultadd_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      break;
    case 2:
      nblocks = 1 + (nrows - 1) / (blocksize / 2);
      matmultadd_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      break;
    case 1:
      nblocks = 1 + (nrows - 1) / blocksize;
      matmultadd_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      break;
#if !defined(PETSC_USE_COMPLEX)
    case 0:
      maxoveravg = a->maxslicewidth / a->avgslicewidth;
      if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* important threshold */
        /* each block handles approximately one slice */
        nchunks         = hipstruct->totalchunks;
        blocky          = a->chunksize / 32;
        chunksperblock  = hipstruct->chunksperblock ? hipstruct->chunksperblock : 1 + (hipstruct->totalentries / hipstruct->totalslices - 1) / a->chunksize;
        nblocks         = 1 + (nchunks - 1) / chunksperblock;
        chunk_slice_map = hipstruct->chunk_slice_map;
        if (blocky == 2) {
          matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
        } else if (blocky == 4) {
          matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
        } else if (blocky == 8) {
          matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
        } else if (blocky == 16) {
          matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
        } else {
          matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
        }
      } else {
        PetscInt avgslicesize = sliceheight * a->avgslicewidth;
        if (avgslicesize <= 432) {
          if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) {
            nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */
            matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
          } else {
            nblocks = 1 + (nrows - 1) / sliceheight;
            matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
          }
        } else if (avgslicesize <= 2400) {
          nblocks = 1 + (nrows - 1) / sliceheight;
          matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
        } else {
          nblocks = 1 + (nrows - 1) / sliceheight;
          matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
        }
      }
      break;
#endif
    default:
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported kernel choice %" PetscInt_FMT " for MatMultAdd_SeqSELLHIP.", hipstruct->kernelchoice);
    }
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(VecHIPRestoreArrayRead(xx, &x));
    PetscCall(VecHIPRestoreArrayRead(yy, &y));
    PetscCall(VecHIPRestoreArrayWrite(zz, &z));
    PetscCall(PetscLogGpuFlops(2.0 * a->nz));
  } else {
    PetscCall(VecCopy(yy, zz));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSetFromOptions_SeqSELLHIP(Mat A, PetscOptionItems PetscOptionsObject)
{
  Mat_SeqSELLHIP *hipstruct = (Mat_SeqSELLHIP *)A->spptr;
  PetscInt        kernel, blocky;
  PetscBool       flg;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqSELLHIP options");
  PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_hip_blocky", &blocky, &flg));
  if (flg) {
    PetscCheck(blocky == 2 || blocky == 4 || blocky == 8 || blocky == 16 || blocky == 32, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Unsupported blocky: %" PetscInt_FMT ", it should be in {2,4,8,16,32}", blocky);
    hipstruct->blocky = blocky;
  }
  PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_hip_kernel", &kernel, &flg));
  if (flg) {
    PetscCheck(kernel >= 0 && kernel <= 9, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Wrong kernel choice: %" PetscInt_FMT ", it should be in [0,9]", kernel);
    hipstruct->kernelchoice = kernel;
    if (kernel == 8) PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_hip_chunksperblock", &hipstruct->chunksperblock, &flg));
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}

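/*
  Example of the runtime options handled above (the executable name is
  illustrative):
    ./app -mat_type seqsellhip -mat_sell_spmv_hip_kernel 9 -mat_sell_spmv_hip_blocky 8
  Kernel 8 additionally reads -mat_sell_spmv_hip_chunksperblock; kernelchoice 0,
  the default, selects a kernel automatically.
*/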
PETSC_INTERN PetscErrorCode MatAssemblyEnd_SpMV_Preprocessing_Private(Mat A)
{
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;

  PetscFunctionBegin;
  PetscCall(MatSeqSELLGetAvgSliceWidth(A, &a->avgslicewidth));
  PetscCall(MatSeqSELLGetMaxSliceWidth(A, &a->maxslicewidth));
  PetscCall(MatSeqSELLGetFillRatio(A, &a->fillratio));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatAssemblyEnd_SeqSELLHIP(Mat A, MatAssemblyType mode)
{
  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqSELL(A, mode));
  PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(A));
  if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->factortype == MAT_FACTOR_NONE) PetscCall(MatSeqSELLHIPCopyToGPU(A));
  A->ops->mult    = MatMult_SeqSELLHIP;
  A->ops->multadd = MatMultAdd_SeqSELLHIP;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatZeroEntries_SeqSELLHIP(Mat A)
{
  PetscBool    both = PETSC_FALSE;
  Mat_SeqSELL *a    = (Mat_SeqSELL *)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqSELLHIP *hipstruct = (Mat_SeqSELLHIP *)A->spptr;
    if (hipstruct->val) {
      both = PETSC_TRUE;
      PetscCallHIP(hipMemset(hipstruct->val, 0, a->sliidx[a->totalslices] * sizeof(*hipstruct->val)));
    }
  }
  PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices]));
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatDestroy_SeqSELLHIP(Mat A)
{
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE && A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) PetscCall(MatSeqSELLHIP_Destroy((Mat_SeqSELLHIP **)&A->spptr));
  PetscCall(MatDestroy_SeqSELL(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLHIP(Mat);
static PetscErrorCode MatDuplicate_SeqSELLHIP(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqSELL(A, cpvalues, B));
  PetscCall(MatConvert_SeqSELL_SeqSELLHIP(*B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLHIP(Mat B)
{
  Mat_SeqSELLHIP *hipstruct;

  PetscFunctionBegin;
  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECHIP, &B->defaultvectype));

  if (!B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      PetscCall(PetscNew(&hipstruct));
      B->spptr = hipstruct;
    }
  }

  B->ops->assemblyend    = MatAssemblyEnd_SeqSELLHIP;
  B->ops->destroy        = MatDestroy_SeqSELLHIP;
  B->ops->setfromoptions = MatSetFromOptions_SeqSELLHIP;
  B->ops->mult           = MatMult_SeqSELLHIP;
  B->ops->multadd        = MatMultAdd_SeqSELLHIP;
  B->ops->duplicate      = MatDuplicate_SeqSELLHIP;
  B->ops->zeroentries    = MatZeroEntries_SeqSELLHIP;

  /* No need to assemble SeqSELL, but need to do the preprocessing for SpMV */
  PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(B));

  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELLHIP));
  B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*MC
  MATSEQSELLHIP - MATSELLHIP = "(seq)sellhip" - A matrix type to be used for sparse matrices on AMD GPUs.

  Options Database Keys:
+ -mat_type seqsellhip - sets the matrix type to "seqsellhip" during a call to `MatSetFromOptions()`
. -mat_sell_spmv_hip_kernel - selects an SpMV kernel for MatSELLHIP
- -mat_sell_spmv_hip_blocky - sets the y dimension of the block size of the SpMV kernels. These kernels use a 2D block with the x dimension equal to the warp size (normally 64 for AMD GPUs)

  Level: beginner

.seealso: [](ch_matrices), `Mat`, `MATSELLHIP`
M*/

PETSC_EXTERN PetscErrorCode MatCreate_SeqSELLHIP(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqSELL(B));
  PetscCall(MatConvert_SeqSELL_SeqSELLHIP(B));
  PetscCall(MatSetFromOptions(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}