xref: /petsc/src/mat/impls/sell/seq/sell.c (revision 7f296bb328fcd4c99f2da7bfe8ba7ed8a4ebceee)
1 /*
2   Defines the basic matrix operations for the SELL matrix storage format.
3 */
4 #include <../src/mat/impls/sell/seq/sell.h> /*I   "petscmat.h"  I*/
5 #include <petscblaslapack.h>
6 #include <petsc/private/kernels/blocktranspose.h>
7 
8 static PetscBool  cited      = PETSC_FALSE;
9 static const char citation[] = "@inproceedings{ZhangELLPACK2018,\n"
10                                " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n"
11                                " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n"
12                                " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n"
13                                " year = 2018\n"
14                                "}\n";
15 
16 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
17 
18   #include <immintrin.h>
19 
20   #if !defined(_MM_SCALE_8)
21     #define _MM_SCALE_8 8
22   #endif
23 
24   #if defined(__AVX512F__)
25     /* these do not work
26    vec_idx  = _mm512_loadunpackhi_epi32(vec_idx,acolidx);
27    vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval);
28   */
29     #define AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
30       /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \
31       vec_idx  = _mm256_loadu_si256((__m256i const *)acolidx); \
32       vec_vals = _mm512_loadu_pd(aval); \
33       vec_x    = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); \
34       vec_y    = _mm512_fmadd_pd(vec_x, vec_vals, vec_y)
35   #elif defined(__AVX2__) && defined(__FMA__)
36     #define AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
37       vec_vals = _mm256_loadu_pd(aval); \
38       vec_idx  = _mm_loadu_si128((__m128i const *)acolidx); /* SSE2 */ \
39       vec_x    = _mm256_i32gather_pd(x, vec_idx, _MM_SCALE_8); \
40       vec_y    = _mm256_fmadd_pd(vec_x, vec_vals, vec_y)
41   #endif
42 #endif /* PETSC_HAVE_IMMINTRIN_H */
43 
44 /*@
45   MatSeqSELLSetPreallocation - For good matrix assembly performance
46   the user should preallocate the matrix storage by setting the parameter `nz`
47   (or the array `nnz`).
48 
49   Collective
50 
51   Input Parameters:
52 + B       - The `MATSEQSELL` matrix
53 . rlenmax - number of nonzeros per row (same for all rows), ignored if `rlen` is provided
54 - rlen    - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
55 
56   Level: intermediate
57 
58   Notes:
59   Specify the preallocated storage with either `rlenmax` or `rlen` (not both).
60   Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory
61   allocation.
62 
63   You can call `MatGetInfo()` to get information on how effective the preallocation was;
64   for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
65   You can also run with the option `-info` and look for messages with the string
66   malloc in them to see if additional memory allocation was needed.
67 
68   Developer Notes:
69   Use `rlenmax` of `MAT_SKIP_ALLOCATION` to not allocate any space for the matrix
70   entries or columns indices.
71 
72   The maximum number of nonzeos in any row should be as accurate as possible.
73   If it is underestimated, you will get bad performance due to reallocation
74   (`MatSeqXSELLReallocateSELL()`).
75 
76 .seealso: `Mat`, `MATSEQSELL`, `MATSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()`
77  @*/
78 PetscErrorCode MatSeqSELLSetPreallocation(Mat B, PetscInt rlenmax, const PetscInt rlen[])
79 {
80   PetscFunctionBegin;
81   PetscValidHeaderSpecific(B, MAT_CLASSID, 1);
82   PetscValidType(B, 1);
83   PetscTryMethod(B, "MatSeqSELLSetPreallocation_C", (Mat, PetscInt, const PetscInt[]), (B, rlenmax, rlen));
84   PetscFunctionReturn(PETSC_SUCCESS);
85 }
86 
87 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B, PetscInt maxallocrow, const PetscInt rlen[])
88 {
89   Mat_SeqSELL *b;
90   PetscInt     i, j, totalslices;
91 #if defined(PETSC_HAVE_CUPM)
92   PetscInt rlenmax = 0;
93 #endif
94   PetscBool skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE;
95 
96   PetscFunctionBegin;
97   if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE;
98   if (maxallocrow == MAT_SKIP_ALLOCATION) {
99     skipallocation = PETSC_TRUE;
100     maxallocrow    = 0;
101   }
102 
103   PetscCall(PetscLayoutSetUp(B->rmap));
104   PetscCall(PetscLayoutSetUp(B->cmap));
105 
106   /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */
107   if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5;
108   PetscCheck(maxallocrow >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "maxallocrow cannot be less than 0: value %" PetscInt_FMT, maxallocrow);
109   if (rlen) {
110     for (i = 0; i < B->rmap->n; i++) {
111       PetscCheck(rlen[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, rlen[i]);
112       PetscCheck(rlen[i] <= B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT, i, rlen[i], B->cmap->n);
113     }
114   }
115 
116   B->preallocated = PETSC_TRUE;
117 
118   b = (Mat_SeqSELL *)B->data;
119 
120   if (!b->sliceheight) { /* not set yet */
121 #if defined(PETSC_HAVE_CUPM)
122     b->sliceheight = 16;
123 #else
124     b->sliceheight = 8;
125 #endif
126   }
127   totalslices    = PetscCeilInt(B->rmap->n, b->sliceheight);
128   b->totalslices = totalslices;
129   if (!skipallocation) {
130     if (B->rmap->n % b->sliceheight) PetscCall(PetscInfo(B, "Padding rows to the SEQSELL matrix because the number of rows is not the multiple of the slice height (value %" PetscInt_FMT ")\n", B->rmap->n));
131 
132     if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */
133       PetscCall(PetscMalloc1(totalslices + 1, &b->sliidx));
134     }
135     if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */
136       if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10;
137       else if (maxallocrow < 0) maxallocrow = 1;
138 #if defined(PETSC_HAVE_CUPM)
139       rlenmax = maxallocrow;
140       /* Pad the slice to DEVICE_MEM_ALIGN */
141       while (b->sliceheight * maxallocrow % DEVICE_MEM_ALIGN) maxallocrow++;
142 #endif
143       for (i = 0; i <= totalslices; i++) b->sliidx[i] = b->sliceheight * i * maxallocrow;
144     } else {
145 #if defined(PETSC_HAVE_CUPM)
146       PetscInt mul = DEVICE_MEM_ALIGN / b->sliceheight;
147 #endif
148       maxallocrow  = 0;
149       b->sliidx[0] = 0;
150       for (i = 1; i < totalslices; i++) {
151         b->sliidx[i] = 0;
152         for (j = 0; j < b->sliceheight; j++) { b->sliidx[i] = PetscMax(b->sliidx[i], rlen[b->sliceheight * (i - 1) + j]); }
153 #if defined(PETSC_HAVE_CUPM)
154         if (mul != 0) { /* Pad the slice to DEVICE_MEM_ALIGN if sliceheight < DEVICE_MEM_ALIGN */
155           rlenmax      = PetscMax(b->sliidx[i], rlenmax);
156           b->sliidx[i] = ((b->sliidx[i] - 1) / mul + 1) * mul;
157         }
158 #endif
159         maxallocrow = PetscMax(b->sliidx[i], maxallocrow);
160         PetscCall(PetscIntSumError(b->sliidx[i - 1], b->sliceheight * b->sliidx[i], &b->sliidx[i]));
161       }
162       /* last slice */
163       b->sliidx[totalslices] = 0;
164       for (j = b->sliceheight * (totalslices - 1); j < B->rmap->n; j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices], rlen[j]);
165 #if defined(PETSC_HAVE_CUPM)
166       if (mul != 0) {
167         rlenmax                = PetscMax(b->sliidx[i], rlenmax);
168         b->sliidx[totalslices] = ((b->sliidx[totalslices] - 1) / mul + 1) * mul;
169       }
170 #endif
171       maxallocrow            = PetscMax(b->sliidx[totalslices], maxallocrow);
172       b->sliidx[totalslices] = b->sliidx[totalslices - 1] + b->sliceheight * b->sliidx[totalslices];
173     }
174 
175     /* allocate space for val, colidx, rlen */
176     /* FIXME: should B's old memory be unlogged? */
177     PetscCall(MatSeqXSELLFreeSELL(B, &b->val, &b->colidx));
178     /* FIXME: assuming an element of the bit array takes 8 bits */
179     PetscCall(PetscMalloc2(b->sliidx[totalslices], &b->val, b->sliidx[totalslices], &b->colidx));
180     /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */
181     PetscCall(PetscCalloc1(b->sliceheight * totalslices, &b->rlen));
182 
183     b->singlemalloc = PETSC_TRUE;
184     b->free_val     = PETSC_TRUE;
185     b->free_colidx  = PETSC_TRUE;
186   } else {
187     b->free_val    = PETSC_FALSE;
188     b->free_colidx = PETSC_FALSE;
189   }
190 
191   b->nz          = 0;
192   b->maxallocrow = maxallocrow;
193 #if defined(PETSC_HAVE_CUPM)
194   b->rlenmax = rlenmax;
195 #else
196   b->rlenmax = maxallocrow;
197 #endif
198   b->maxallocmat      = b->sliidx[totalslices];
199   B->info.nz_unneeded = (double)b->maxallocmat;
200   if (realalloc) PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));
201   PetscFunctionReturn(PETSC_SUCCESS);
202 }
203 
204 static PetscErrorCode MatGetRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
205 {
206   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
207   PetscInt     shift;
208 
209   PetscFunctionBegin;
210   PetscCheck(row >= 0 && row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range", row);
211   if (nz) *nz = a->rlen[row];
212   shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight);
213   if (!a->getrowcols) { PetscCall(PetscMalloc2(a->rlenmax, &a->getrowcols, a->rlenmax, &a->getrowvals)); }
214   if (idx) {
215     PetscInt j;
216     for (j = 0; j < a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift + a->sliceheight * j];
217     *idx = a->getrowcols;
218   }
219   if (v) {
220     PetscInt j;
221     for (j = 0; j < a->rlen[row]; j++) a->getrowvals[j] = a->val[shift + a->sliceheight * j];
222     *v = a->getrowvals;
223   }
224   PetscFunctionReturn(PETSC_SUCCESS);
225 }
226 
227 static PetscErrorCode MatRestoreRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
228 {
229   PetscFunctionBegin;
230   PetscFunctionReturn(PETSC_SUCCESS);
231 }
232 
233 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
234 {
235   Mat          B;
236   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
237   PetscInt     i;
238 
239   PetscFunctionBegin;
240   if (reuse == MAT_REUSE_MATRIX) {
241     B = *newmat;
242     PetscCall(MatZeroEntries(B));
243   } else {
244     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
245     PetscCall(MatSetSizes(B, A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N));
246     PetscCall(MatSetType(B, MATSEQAIJ));
247     PetscCall(MatSeqAIJSetPreallocation(B, 0, a->rlen));
248   }
249 
250   for (i = 0; i < A->rmap->n; i++) {
251     PetscInt     nz = 0, *cols = NULL;
252     PetscScalar *vals = NULL;
253 
254     PetscCall(MatGetRow_SeqSELL(A, i, &nz, &cols, &vals));
255     PetscCall(MatSetValues(B, 1, &i, nz, cols, vals, INSERT_VALUES));
256     PetscCall(MatRestoreRow_SeqSELL(A, i, &nz, &cols, &vals));
257   }
258 
259   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
260   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
261   B->rmap->bs = A->rmap->bs;
262 
263   if (reuse == MAT_INPLACE_MATRIX) {
264     PetscCall(MatHeaderReplace(A, &B));
265   } else {
266     *newmat = B;
267   }
268   PetscFunctionReturn(PETSC_SUCCESS);
269 }
270 
271 #include <../src/mat/impls/aij/seq/aij.h>
272 
273 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
274 {
275   Mat                B;
276   Mat_SeqAIJ        *a  = (Mat_SeqAIJ *)A->data;
277   PetscInt          *ai = a->i, m = A->rmap->N, n = A->cmap->N, i, *rowlengths, row, ncols;
278   const PetscInt    *cols;
279   const PetscScalar *vals;
280 
281   PetscFunctionBegin;
282   if (reuse == MAT_REUSE_MATRIX) {
283     B = *newmat;
284   } else {
285     if (PetscDefined(USE_DEBUG) || !a->ilen) {
286       PetscCall(PetscMalloc1(m, &rowlengths));
287       for (i = 0; i < m; i++) rowlengths[i] = ai[i + 1] - ai[i];
288     }
289     if (PetscDefined(USE_DEBUG) && a->ilen) {
290       PetscBool eq;
291       PetscCall(PetscMemcmp(rowlengths, a->ilen, m * sizeof(PetscInt), &eq));
292       PetscCheck(eq, PETSC_COMM_SELF, PETSC_ERR_PLIB, "SeqAIJ ilen array incorrect");
293       PetscCall(PetscFree(rowlengths));
294       rowlengths = a->ilen;
295     } else if (a->ilen) rowlengths = a->ilen;
296     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
297     PetscCall(MatSetSizes(B, m, n, m, n));
298     PetscCall(MatSetType(B, MATSEQSELL));
299     PetscCall(MatSeqSELLSetPreallocation(B, 0, rowlengths));
300     if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths));
301   }
302 
303   for (row = 0; row < m; row++) {
304     PetscCall(MatGetRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
305     PetscCall(MatSetValues_SeqSELL(B, 1, &row, ncols, cols, vals, INSERT_VALUES));
306     PetscCall(MatRestoreRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
307   }
308   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
309   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
310   B->rmap->bs = A->rmap->bs;
311 
312   if (reuse == MAT_INPLACE_MATRIX) {
313     PetscCall(MatHeaderReplace(A, &B));
314   } else {
315     *newmat = B;
316   }
317   PetscFunctionReturn(PETSC_SUCCESS);
318 }
319 
320 PetscErrorCode MatMult_SeqSELL(Mat A, Vec xx, Vec yy)
321 {
322   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
323   PetscScalar       *y;
324   const PetscScalar *x;
325   const MatScalar   *aval        = a->val;
326   PetscInt           totalslices = a->totalslices;
327   const PetscInt    *acolidx     = a->colidx;
328   PetscInt           i, j;
329 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
330   __m512d  vec_x, vec_y, vec_vals;
331   __m256i  vec_idx;
332   __mmask8 mask;
333   __m512d  vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
334   __m256i  vec_idx2, vec_idx3, vec_idx4;
335 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
336   __m128i   vec_idx;
337   __m256d   vec_x, vec_y, vec_y2, vec_vals;
338   MatScalar yval;
339   PetscInt  r, rows_left, row, nnz_in_row;
340 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
341   __m128d   vec_x_tmp;
342   __m256d   vec_x, vec_y, vec_y2, vec_vals;
343   MatScalar yval;
344   PetscInt  r, rows_left, row, nnz_in_row;
345 #else
346   PetscInt     k, sliceheight = a->sliceheight;
347   PetscScalar *sum;
348 #endif
349 
350 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
351   #pragma disjoint(*x, *y, *aval)
352 #endif
353 
354   PetscFunctionBegin;
355   PetscCall(VecGetArrayRead(xx, &x));
356   PetscCall(VecGetArray(yy, &y));
357 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
358   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
359   for (i = 0; i < totalslices; i++) { /* loop over slices */
360     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
361     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
362 
363     vec_y  = _mm512_setzero_pd();
364     vec_y2 = _mm512_setzero_pd();
365     vec_y3 = _mm512_setzero_pd();
366     vec_y4 = _mm512_setzero_pd();
367 
368     j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */
369     switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
370     case 3:
371       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
372       acolidx += 8;
373       aval += 8;
374       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
375       acolidx += 8;
376       aval += 8;
377       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
378       acolidx += 8;
379       aval += 8;
380       j += 3;
381       break;
382     case 2:
383       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
384       acolidx += 8;
385       aval += 8;
386       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
387       acolidx += 8;
388       aval += 8;
389       j += 2;
390       break;
391     case 1:
392       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
393       acolidx += 8;
394       aval += 8;
395       j += 1;
396       break;
397     }
398   #pragma novector
399     for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
400       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
401       acolidx += 8;
402       aval += 8;
403       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
404       acolidx += 8;
405       aval += 8;
406       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
407       acolidx += 8;
408       aval += 8;
409       AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
410       acolidx += 8;
411       aval += 8;
412     }
413 
414     vec_y = _mm512_add_pd(vec_y, vec_y2);
415     vec_y = _mm512_add_pd(vec_y, vec_y3);
416     vec_y = _mm512_add_pd(vec_y, vec_y4);
417     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
418       mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
419       _mm512_mask_storeu_pd(&y[8 * i], mask, vec_y);
420     } else {
421       _mm512_storeu_pd(&y[8 * i], vec_y);
422     }
423   }
424 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
425   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
426   for (i = 0; i < totalslices; i++) { /* loop over full slices */
427     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
428     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
429 
430     /* last slice may have padding rows. Don't use vectorization. */
431     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
432       rows_left = A->rmap->n - 8 * i;
433       for (r = 0; r < rows_left; ++r) {
434         yval       = (MatScalar)0;
435         row        = 8 * i + r;
436         nnz_in_row = a->rlen[row];
437         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
438         y[row] = yval;
439       }
440       break;
441     }
442 
443     vec_y  = _mm256_setzero_pd();
444     vec_y2 = _mm256_setzero_pd();
445 
446   /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
447   #pragma novector
448   #pragma unroll(2)
449     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
450       AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
451       aval += 4;
452       acolidx += 4;
453       AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y2);
454       aval += 4;
455       acolidx += 4;
456     }
457 
458     _mm256_storeu_pd(y + i * 8, vec_y);
459     _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
460   }
461 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
462   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
463   for (i = 0; i < totalslices; i++) { /* loop over full slices */
464     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
465     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
466 
467     vec_y  = _mm256_setzero_pd();
468     vec_y2 = _mm256_setzero_pd();
469 
470     /* last slice may have padding rows. Don't use vectorization. */
471     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
472       rows_left = A->rmap->n - 8 * i;
473       for (r = 0; r < rows_left; ++r) {
474         yval       = (MatScalar)0;
475         row        = 8 * i + r;
476         nnz_in_row = a->rlen[row];
477         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
478         y[row] = yval;
479       }
480       break;
481     }
482 
483   /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
484   #pragma novector
485   #pragma unroll(2)
486     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
487       vec_vals  = _mm256_loadu_pd(aval);
488       vec_x_tmp = _mm_setzero_pd();
489       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
490       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
491       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
492       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
493       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
494       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
495       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
496       aval += 4;
497 
498       vec_vals  = _mm256_loadu_pd(aval);
499       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
500       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
501       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
502       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
503       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
504       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
505       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
506       aval += 4;
507     }
508 
509     _mm256_storeu_pd(y + i * 8, vec_y);
510     _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
511   }
512 #else
513   PetscCall(PetscMalloc1(sliceheight, &sum));
514   for (i = 0; i < totalslices; i++) { /* loop over slices */
515     for (j = 0; j < sliceheight; j++) {
516       sum[j] = 0.0;
517       for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]];
518     }
519     if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { /* if last slice has padding rows */
520       for (j = 0; j < (A->rmap->n % sliceheight); j++) y[sliceheight * i + j] = sum[j];
521     } else {
522       for (j = 0; j < sliceheight; j++) y[sliceheight * i + j] = sum[j];
523     }
524   }
525   PetscCall(PetscFree(sum));
526 #endif
527 
528   PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); /* theoretical minimal FLOPs */
529   PetscCall(VecRestoreArrayRead(xx, &x));
530   PetscCall(VecRestoreArray(yy, &y));
531   PetscFunctionReturn(PETSC_SUCCESS);
532 }
533 
534 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h>
535 PetscErrorCode MatMultAdd_SeqSELL(Mat A, Vec xx, Vec yy, Vec zz)
536 {
537   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
538   PetscScalar       *y, *z;
539   const PetscScalar *x;
540   const MatScalar   *aval        = a->val;
541   PetscInt           totalslices = a->totalslices;
542   const PetscInt    *acolidx     = a->colidx;
543   PetscInt           i, j;
544 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
545   __m512d  vec_x, vec_y, vec_vals;
546   __m256i  vec_idx;
547   __mmask8 mask = 0;
548   __m512d  vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
549   __m256i  vec_idx2, vec_idx3, vec_idx4;
550 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
551   __m128d   vec_x_tmp;
552   __m256d   vec_x, vec_y, vec_y2, vec_vals;
553   MatScalar yval;
554   PetscInt  r, row, nnz_in_row;
555 #else
556   PetscInt     k, sliceheight = a->sliceheight;
557   PetscScalar *sum;
558 #endif
559 
560 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
561   #pragma disjoint(*x, *y, *aval)
562 #endif
563 
564   PetscFunctionBegin;
565   if (!a->nz) {
566     PetscCall(VecCopy(yy, zz));
567     PetscFunctionReturn(PETSC_SUCCESS);
568   }
569   PetscCall(VecGetArrayRead(xx, &x));
570   PetscCall(VecGetArrayPair(yy, zz, &y, &z));
571 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
572   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
573   for (i = 0; i < totalslices; i++) { /* loop over slices */
574     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
575     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
576 
577     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
578       mask  = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
579       vec_y = _mm512_mask_loadu_pd(vec_y, mask, &y[8 * i]);
580     } else {
581       vec_y = _mm512_loadu_pd(&y[8 * i]);
582     }
583     vec_y2 = _mm512_setzero_pd();
584     vec_y3 = _mm512_setzero_pd();
585     vec_y4 = _mm512_setzero_pd();
586 
587     j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */
588     switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
589     case 3:
590       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
591       acolidx += 8;
592       aval += 8;
593       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
594       acolidx += 8;
595       aval += 8;
596       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
597       acolidx += 8;
598       aval += 8;
599       j += 3;
600       break;
601     case 2:
602       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
603       acolidx += 8;
604       aval += 8;
605       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
606       acolidx += 8;
607       aval += 8;
608       j += 2;
609       break;
610     case 1:
611       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
612       acolidx += 8;
613       aval += 8;
614       j += 1;
615       break;
616     }
617   #pragma novector
618     for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
619       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
620       acolidx += 8;
621       aval += 8;
622       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
623       acolidx += 8;
624       aval += 8;
625       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
626       acolidx += 8;
627       aval += 8;
628       AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
629       acolidx += 8;
630       aval += 8;
631     }
632 
633     vec_y = _mm512_add_pd(vec_y, vec_y2);
634     vec_y = _mm512_add_pd(vec_y, vec_y3);
635     vec_y = _mm512_add_pd(vec_y, vec_y4);
636     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
637       _mm512_mask_storeu_pd(&z[8 * i], mask, vec_y);
638     } else {
639       _mm512_storeu_pd(&z[8 * i], vec_y);
640     }
641   }
642 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
643   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
644   for (i = 0; i < totalslices; i++) { /* loop over full slices */
645     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
646     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
647 
648     /* last slice may have padding rows. Don't use vectorization. */
649     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
650       for (r = 0; r < (A->rmap->n & 0x07); ++r) {
651         row        = 8 * i + r;
652         yval       = (MatScalar)0.0;
653         nnz_in_row = a->rlen[row];
654         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
655         z[row] = y[row] + yval;
656       }
657       break;
658     }
659 
660     vec_y  = _mm256_loadu_pd(y + 8 * i);
661     vec_y2 = _mm256_loadu_pd(y + 8 * i + 4);
662 
663     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
664     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
665       vec_vals  = _mm256_loadu_pd(aval);
666       vec_x_tmp = _mm_setzero_pd();
667       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
668       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
669       vec_x     = _mm256_setzero_pd();
670       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
671       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
672       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
673       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
674       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
675       aval += 4;
676 
677       vec_vals  = _mm256_loadu_pd(aval);
678       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
679       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
680       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
681       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
682       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
683       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
684       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
685       aval += 4;
686     }
687 
688     _mm256_storeu_pd(z + i * 8, vec_y);
689     _mm256_storeu_pd(z + i * 8 + 4, vec_y2);
690   }
691 #else
692   PetscCall(PetscMalloc1(sliceheight, &sum));
693   for (i = 0; i < totalslices; i++) { /* loop over slices */
694     for (j = 0; j < sliceheight; j++) {
695       sum[j] = 0.0;
696       for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]];
697     }
698     if (i == totalslices - 1 && (A->rmap->n % sliceheight)) {
699       for (j = 0; j < (A->rmap->n % sliceheight); j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j];
700     } else {
701       for (j = 0; j < sliceheight; j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j];
702     }
703   }
704   PetscCall(PetscFree(sum));
705 #endif
706 
707   PetscCall(PetscLogFlops(2.0 * a->nz));
708   PetscCall(VecRestoreArrayRead(xx, &x));
709   PetscCall(VecRestoreArrayPair(yy, zz, &y, &z));
710   PetscFunctionReturn(PETSC_SUCCESS);
711 }
712 
713 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A, Vec xx, Vec zz, Vec yy)
714 {
715   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
716   PetscScalar       *y;
717   const PetscScalar *x;
718   const MatScalar   *aval    = a->val;
719   const PetscInt    *acolidx = a->colidx;
720   PetscInt           i, j, r, row, nnz_in_row, totalslices = a->totalslices, sliceheight = a->sliceheight;
721 
722 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
723   #pragma disjoint(*x, *y, *aval)
724 #endif
725 
726   PetscFunctionBegin;
727   if (A->symmetric == PETSC_BOOL3_TRUE) {
728     PetscCall(MatMultAdd_SeqSELL(A, xx, zz, yy));
729     PetscFunctionReturn(PETSC_SUCCESS);
730   }
731   if (zz != yy) PetscCall(VecCopy(zz, yy));
732 
733   if (a->nz) {
734     PetscCall(VecGetArrayRead(xx, &x));
735     PetscCall(VecGetArray(yy, &y));
736     for (i = 0; i < a->totalslices; i++) { /* loop over slices */
737       if (i == totalslices - 1 && (A->rmap->n % sliceheight)) {
738         for (r = 0; r < (A->rmap->n % sliceheight); ++r) {
739           row        = sliceheight * i + r;
740           nnz_in_row = a->rlen[row];
741           for (j = 0; j < nnz_in_row; ++j) y[acolidx[sliceheight * j + r]] += aval[sliceheight * j + r] * x[row];
742         }
743         break;
744       }
745       for (r = 0; r < sliceheight; ++r)
746         for (j = a->sliidx[i] + r; j < a->sliidx[i + 1]; j += sliceheight) y[acolidx[j]] += aval[j] * x[sliceheight * i + r];
747     }
748     PetscCall(PetscLogFlops(2.0 * a->nz));
749     PetscCall(VecRestoreArrayRead(xx, &x));
750     PetscCall(VecRestoreArray(yy, &y));
751   }
752   PetscFunctionReturn(PETSC_SUCCESS);
753 }
754 
755 PetscErrorCode MatMultTranspose_SeqSELL(Mat A, Vec xx, Vec yy)
756 {
757   PetscFunctionBegin;
758   if (A->symmetric == PETSC_BOOL3_TRUE) {
759     PetscCall(MatMult_SeqSELL(A, xx, yy));
760   } else {
761     PetscCall(VecSet(yy, 0.0));
762     PetscCall(MatMultTransposeAdd_SeqSELL(A, xx, yy, yy));
763   }
764   PetscFunctionReturn(PETSC_SUCCESS);
765 }
766 
767 /*
768      Checks for missing diagonals
769 */
770 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A, PetscBool *missing, PetscInt *d)
771 {
772   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
773   PetscInt    *diag, i;
774 
775   PetscFunctionBegin;
776   *missing = PETSC_FALSE;
777   if (A->rmap->n > 0 && !a->colidx) {
778     *missing = PETSC_TRUE;
779     if (d) *d = 0;
780     PetscCall(PetscInfo(A, "Matrix has no entries therefore is missing diagonal\n"));
781   } else {
782     diag = a->diag;
783     for (i = 0; i < A->rmap->n; i++) {
784       if (diag[i] == -1) {
785         *missing = PETSC_TRUE;
786         if (d) *d = i;
787         PetscCall(PetscInfo(A, "Matrix is missing diagonal number %" PetscInt_FMT "\n", i));
788         break;
789       }
790     }
791   }
792   PetscFunctionReturn(PETSC_SUCCESS);
793 }
794 
795 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A)
796 {
797   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
798   PetscInt     i, j, m = A->rmap->n, shift;
799 
800   PetscFunctionBegin;
801   if (!a->diag) {
802     PetscCall(PetscMalloc1(m, &a->diag));
803     a->free_diag = PETSC_TRUE;
804   }
805   for (i = 0; i < m; i++) {                                          /* loop over rows */
806     shift      = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
807     a->diag[i] = -1;
808     for (j = 0; j < a->rlen[i]; j++) {
809       if (a->colidx[shift + a->sliceheight * j] == i) {
810         a->diag[i] = shift + a->sliceheight * j;
811         break;
812       }
813     }
814   }
815   PetscFunctionReturn(PETSC_SUCCESS);
816 }
817 
818 /*
819   Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways
820 */
821 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A, PetscScalar omega, PetscScalar fshift)
822 {
823   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
824   PetscInt     i, *diag, m = A->rmap->n;
825   MatScalar   *val = a->val;
826   PetscScalar *idiag, *mdiag;
827 
828   PetscFunctionBegin;
829   if (a->idiagvalid) PetscFunctionReturn(PETSC_SUCCESS);
830   PetscCall(MatMarkDiagonal_SeqSELL(A));
831   diag = a->diag;
832   if (!a->idiag) {
833     PetscCall(PetscMalloc3(m, &a->idiag, m, &a->mdiag, m, &a->ssor_work));
834     val = a->val;
835   }
836   mdiag = a->mdiag;
837   idiag = a->idiag;
838 
839   if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) {
840     for (i = 0; i < m; i++) {
841       mdiag[i] = val[diag[i]];
842       if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */
843         PetscCheck(PetscRealPart(fshift), PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Zero diagonal on row %" PetscInt_FMT, i);
844         PetscCall(PetscInfo(A, "Zero diagonal on row %" PetscInt_FMT "\n", i));
845         A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
846         A->factorerror_zeropivot_value = 0.0;
847         A->factorerror_zeropivot_row   = i;
848       }
849       idiag[i] = 1.0 / val[diag[i]];
850     }
851     PetscCall(PetscLogFlops(m));
852   } else {
853     for (i = 0; i < m; i++) {
854       mdiag[i] = val[diag[i]];
855       idiag[i] = omega / (fshift + val[diag[i]]);
856     }
857     PetscCall(PetscLogFlops(2.0 * m));
858   }
859   a->idiagvalid = PETSC_TRUE;
860   PetscFunctionReturn(PETSC_SUCCESS);
861 }
862 
863 PetscErrorCode MatZeroEntries_SeqSELL(Mat A)
864 {
865   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
866 
867   PetscFunctionBegin;
868   PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices]));
869   PetscCall(MatSeqSELLInvalidateDiagonal(A));
870   PetscFunctionReturn(PETSC_SUCCESS);
871 }
872 
873 PetscErrorCode MatDestroy_SeqSELL(Mat A)
874 {
875   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
876 
877   PetscFunctionBegin;
878   PetscCall(PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->n, A->cmap->n, a->nz));
879   PetscCall(MatSeqXSELLFreeSELL(A, &a->val, &a->colidx));
880   PetscCall(ISDestroy(&a->row));
881   PetscCall(ISDestroy(&a->col));
882   PetscCall(PetscFree(a->diag));
883   PetscCall(PetscFree(a->rlen));
884   PetscCall(PetscFree(a->sliidx));
885   PetscCall(PetscFree3(a->idiag, a->mdiag, a->ssor_work));
886   PetscCall(PetscFree(a->solve_work));
887   PetscCall(ISDestroy(&a->icol));
888   PetscCall(PetscFree(a->saved_values));
889   PetscCall(PetscFree2(a->getrowcols, a->getrowvals));
890   PetscCall(PetscFree(A->data));
891 #if defined(PETSC_HAVE_CUPM)
892   PetscCall(PetscFree(a->chunk_slice_map));
893 #endif
894 
895   PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL));
896   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL));
897   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL));
898   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetPreallocation_C", NULL));
899   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetArray_C", NULL));
900   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLRestoreArray_C", NULL));
901   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqaij_C", NULL));
902 #if defined(PETSC_HAVE_CUDA)
903   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqsellcuda_C", NULL));
904 #endif
905 #if defined(PETSC_HAVE_HIP)
906   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqsellhip_C", NULL));
907 #endif
908   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetFillRatio_C", NULL));
909   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetMaxSliceWidth_C", NULL));
910   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetAvgSliceWidth_C", NULL));
911   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetVarSliceSize_C", NULL));
912   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetSliceHeight_C", NULL));
913   PetscFunctionReturn(PETSC_SUCCESS);
914 }
915 
916 PetscErrorCode MatSetOption_SeqSELL(Mat A, MatOption op, PetscBool flg)
917 {
918   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
919 
920   PetscFunctionBegin;
921   switch (op) {
922   case MAT_ROW_ORIENTED:
923     a->roworiented = flg;
924     break;
925   case MAT_KEEP_NONZERO_PATTERN:
926     a->keepnonzeropattern = flg;
927     break;
928   case MAT_NEW_NONZERO_LOCATIONS:
929     a->nonew = (flg ? 0 : 1);
930     break;
931   case MAT_NEW_NONZERO_LOCATION_ERR:
932     a->nonew = (flg ? -1 : 0);
933     break;
934   case MAT_NEW_NONZERO_ALLOCATION_ERR:
935     a->nonew = (flg ? -2 : 0);
936     break;
937   case MAT_UNUSED_NONZERO_LOCATION_ERR:
938     a->nounused = (flg ? -1 : 0);
939     break;
940   default:
941     break;
942   }
943   PetscFunctionReturn(PETSC_SUCCESS);
944 }
945 
946 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A, Vec v)
947 {
948   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
949   PetscInt     i, j, n, shift;
950   PetscScalar *x, zero = 0.0;
951 
952   PetscFunctionBegin;
953   PetscCall(VecGetLocalSize(v, &n));
954   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
955 
956   if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) {
957     PetscInt *diag = a->diag;
958     PetscCall(VecGetArray(v, &x));
959     for (i = 0; i < n; i++) x[i] = 1.0 / a->val[diag[i]];
960     PetscCall(VecRestoreArray(v, &x));
961     PetscFunctionReturn(PETSC_SUCCESS);
962   }
963 
964   PetscCall(VecSet(v, zero));
965   PetscCall(VecGetArray(v, &x));
966   for (i = 0; i < n; i++) {                                     /* loop over rows */
967     shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
968     x[i]  = 0;
969     for (j = 0; j < a->rlen[i]; j++) {
970       if (a->colidx[shift + a->sliceheight * j] == i) {
971         x[i] = a->val[shift + a->sliceheight * j];
972         break;
973       }
974     }
975   }
976   PetscCall(VecRestoreArray(v, &x));
977   PetscFunctionReturn(PETSC_SUCCESS);
978 }
979 
980 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A, Vec ll, Vec rr)
981 {
982   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
983   const PetscScalar *l, *r;
984   PetscInt           i, j, m, n, row;
985 
986   PetscFunctionBegin;
987   if (ll) {
988     /* The local size is used so that VecMPI can be passed to this routine
989        by MatDiagonalScale_MPISELL */
990     PetscCall(VecGetLocalSize(ll, &m));
991     PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
992     PetscCall(VecGetArrayRead(ll, &l));
993     for (i = 0; i < a->totalslices; i++) {                            /* loop over slices */
994       if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */
995         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) {
996           if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= l[a->sliceheight * i + row];
997         }
998       } else {
999         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) { a->val[j] *= l[a->sliceheight * i + row]; }
1000       }
1001     }
1002     PetscCall(VecRestoreArrayRead(ll, &l));
1003     PetscCall(PetscLogFlops(a->nz));
1004   }
1005   if (rr) {
1006     PetscCall(VecGetLocalSize(rr, &n));
1007     PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
1008     PetscCall(VecGetArrayRead(rr, &r));
1009     for (i = 0; i < a->totalslices; i++) {                            /* loop over slices */
1010       if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */
1011         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) % a->sliceheight)) {
1012           if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= r[a->colidx[j]];
1013         }
1014       } else {
1015         for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j++) a->val[j] *= r[a->colidx[j]];
1016       }
1017     }
1018     PetscCall(VecRestoreArrayRead(rr, &r));
1019     PetscCall(PetscLogFlops(a->nz));
1020   }
1021   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1022 #if defined(PETSC_HAVE_CUPM)
1023   if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU;
1024 #endif
1025   PetscFunctionReturn(PETSC_SUCCESS);
1026 }
1027 
1028 PetscErrorCode MatGetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[])
1029 {
1030   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1031   PetscInt    *cp, i, k, low, high, t, row, col, l;
1032   PetscInt     shift;
1033   MatScalar   *vp;
1034 
1035   PetscFunctionBegin;
1036   for (k = 0; k < m; k++) { /* loop over requested rows */
1037     row = im[k];
1038     if (row < 0) continue;
1039     PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
1040     shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight); /* starting index of the row */
1041     cp    = a->colidx + shift;                                        /* pointer to the row */
1042     vp    = a->val + shift;                                           /* pointer to the row */
1043     for (l = 0; l < n; l++) {                                         /* loop over requested columns */
1044       col = in[l];
1045       if (col < 0) continue;
1046       PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
1047       high = a->rlen[row];
1048       low  = 0; /* assume unsorted */
1049       while (high - low > 5) {
1050         t = (low + high) / 2;
1051         if (*(cp + a->sliceheight * t) > col) high = t;
1052         else low = t;
1053       }
1054       for (i = low; i < high; i++) {
1055         if (*(cp + a->sliceheight * i) > col) break;
1056         if (*(cp + a->sliceheight * i) == col) {
1057           *v++ = *(vp + a->sliceheight * i);
1058           goto finished;
1059         }
1060       }
1061       *v++ = 0.0;
1062     finished:;
1063     }
1064   }
1065   PetscFunctionReturn(PETSC_SUCCESS);
1066 }
1067 
1068 static PetscErrorCode MatView_SeqSELL_ASCII(Mat A, PetscViewer viewer)
1069 {
1070   Mat_SeqSELL      *a = (Mat_SeqSELL *)A->data;
1071   PetscInt          i, j, m = A->rmap->n, shift;
1072   const char       *name;
1073   PetscViewerFormat format;
1074 
1075   PetscFunctionBegin;
1076   PetscCall(PetscViewerGetFormat(viewer, &format));
1077   if (format == PETSC_VIEWER_ASCII_MATLAB) {
1078     PetscInt nofinalvalue = 0;
1079     /*
1080     if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) {
1081       nofinalvalue = 1;
1082     }
1083     */
1084     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1085     PetscCall(PetscViewerASCIIPrintf(viewer, "%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n", m, A->cmap->n));
1086     PetscCall(PetscViewerASCIIPrintf(viewer, "%% Nonzeros = %" PetscInt_FMT " \n", a->nz));
1087 #if defined(PETSC_USE_COMPLEX)
1088     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",4);\n", a->nz + nofinalvalue));
1089 #else
1090     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",3);\n", a->nz + nofinalvalue));
1091 #endif
1092     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = [\n"));
1093 
1094     for (i = 0; i < m; i++) {
1095       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1096       for (j = 0; j < a->rlen[i]; j++) {
1097 #if defined(PETSC_USE_COMPLEX)
1098         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1099 #else
1100         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)a->val[shift + a->sliceheight * j]));
1101 #endif
1102       }
1103     }
1104     /*
1105     if (nofinalvalue) {
1106 #if defined(PETSC_USE_COMPLEX)
1107       PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n",m,A->cmap->n,0.,0.));
1108 #else
1109       PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n",m,A->cmap->n,0.0));
1110 #endif
1111     }
1112     */
1113     PetscCall(PetscObjectGetName((PetscObject)A, &name));
1114     PetscCall(PetscViewerASCIIPrintf(viewer, "];\n %s = spconvert(zzz);\n", name));
1115     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1116   } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) {
1117     PetscFunctionReturn(PETSC_SUCCESS);
1118   } else if (format == PETSC_VIEWER_ASCII_COMMON) {
1119     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1120     for (i = 0; i < m; i++) {
1121       PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1122       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1123       for (j = 0; j < a->rlen[i]; j++) {
1124 #if defined(PETSC_USE_COMPLEX)
1125         if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) {
1126           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1127         } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) {
1128           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1129         } else if (PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) {
1130           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j])));
1131         }
1132 #else
1133         if (a->val[shift + a->sliceheight * j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j]));
1134 #endif
1135       }
1136       PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1137     }
1138     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1139   } else if (format == PETSC_VIEWER_ASCII_DENSE) {
1140     PetscInt    cnt = 0, jcnt;
1141     PetscScalar value;
1142 #if defined(PETSC_USE_COMPLEX)
1143     PetscBool realonly = PETSC_TRUE;
1144     for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1145       if (PetscImaginaryPart(a->val[i]) != 0.0) {
1146         realonly = PETSC_FALSE;
1147         break;
1148       }
1149     }
1150 #endif
1151 
1152     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1153     for (i = 0; i < m; i++) {
1154       jcnt  = 0;
1155       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1156       for (j = 0; j < A->cmap->n; j++) {
1157         if (jcnt < a->rlen[i] && j == a->colidx[shift + a->sliceheight * j]) {
1158           value = a->val[cnt++];
1159           jcnt++;
1160         } else {
1161           value = 0.0;
1162         }
1163 #if defined(PETSC_USE_COMPLEX)
1164         if (realonly) {
1165           PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)PetscRealPart(value)));
1166         } else {
1167           PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e+%7.5e i ", (double)PetscRealPart(value), (double)PetscImaginaryPart(value)));
1168         }
1169 #else
1170         PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)value));
1171 #endif
1172       }
1173       PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1174     }
1175     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1176   } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) {
1177     PetscInt fshift = 1;
1178     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1179 #if defined(PETSC_USE_COMPLEX)
1180     PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate complex general\n"));
1181 #else
1182     PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate real general\n"));
1183 #endif
1184     PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz));
1185     for (i = 0; i < m; i++) {
1186       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1187       for (j = 0; j < a->rlen[i]; j++) {
1188 #if defined(PETSC_USE_COMPLEX)
1189         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1190 #else
1191         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)a->val[shift + a->sliceheight * j]));
1192 #endif
1193       }
1194     }
1195     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1196   } else if (format == PETSC_VIEWER_NATIVE) {
1197     for (i = 0; i < a->totalslices; i++) { /* loop over slices */
1198       PetscInt row;
1199       PetscCall(PetscViewerASCIIPrintf(viewer, "slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n", i, a->sliidx[i], a->sliidx[i + 1]));
1200       for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) {
1201 #if defined(PETSC_USE_COMPLEX)
1202         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1203           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1204         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1205           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), -(double)PetscImaginaryPart(a->val[j])));
1206         } else {
1207           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j])));
1208         }
1209 #else
1210         PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)a->val[j]));
1211 #endif
1212       }
1213     }
1214   } else {
1215     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1216     if (A->factortype) {
1217       for (i = 0; i < m; i++) {
1218         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1219         PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1220         /* L part */
1221         for (j = shift; j < a->diag[i]; j += a->sliceheight) {
1222 #if defined(PETSC_USE_COMPLEX)
1223           if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0) {
1224             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1225           } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0) {
1226             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j]))));
1227           } else {
1228             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j])));
1229           }
1230 #else
1231           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]));
1232 #endif
1233         }
1234         /* diagonal */
1235         j = a->diag[i];
1236 #if defined(PETSC_USE_COMPLEX)
1237         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1238           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)PetscImaginaryPart(1.0 / a->val[j])));
1239         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1240           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)(-PetscImaginaryPart(1.0 / a->val[j]))));
1241         } else {
1242           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j])));
1243         }
1244 #else
1245         PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)(1 / a->val[j])));
1246 #endif
1247 
1248         /* U part */
1249         for (j = a->diag[i] + 1; j < shift + a->sliceheight * a->rlen[i]; j += a->sliceheight) {
1250 #if defined(PETSC_USE_COMPLEX)
1251           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1252             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1253           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1254             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j]))));
1255           } else {
1256             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j])));
1257           }
1258 #else
1259           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]));
1260 #endif
1261         }
1262         PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1263       }
1264     } else {
1265       for (i = 0; i < m; i++) {
1266         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1267         PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1268         for (j = 0; j < a->rlen[i]; j++) {
1269 #if defined(PETSC_USE_COMPLEX)
1270           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1271             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1272           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1273             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1274           } else {
1275             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j])));
1276           }
1277 #else
1278           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j]));
1279 #endif
1280         }
1281         PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1282       }
1283     }
1284     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1285   }
1286   PetscCall(PetscViewerFlush(viewer));
1287   PetscFunctionReturn(PETSC_SUCCESS);
1288 }
1289 
1290 #include <petscdraw.h>
1291 static PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw, void *Aa)
1292 {
1293   Mat               A = (Mat)Aa;
1294   Mat_SeqSELL      *a = (Mat_SeqSELL *)A->data;
1295   PetscInt          i, j, m = A->rmap->n, shift;
1296   int               color;
1297   PetscReal         xl, yl, xr, yr, x_l, x_r, y_l, y_r;
1298   PetscViewer       viewer;
1299   PetscViewerFormat format;
1300 
1301   PetscFunctionBegin;
1302   PetscCall(PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer));
1303   PetscCall(PetscViewerGetFormat(viewer, &format));
1304   PetscCall(PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr));
1305 
1306   /* loop over matrix elements drawing boxes */
1307 
1308   if (format != PETSC_VIEWER_DRAW_CONTOUR) {
1309     PetscDrawCollectiveBegin(draw);
1310     /* Blue for negative, Cyan for zero and  Red for positive */
1311     color = PETSC_DRAW_BLUE;
1312     for (i = 0; i < m; i++) {
1313       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1314       y_l   = m - i - 1.0;
1315       y_r   = y_l + 1.0;
1316       for (j = 0; j < a->rlen[i]; j++) {
1317         x_l = a->colidx[shift + a->sliceheight * j];
1318         x_r = x_l + 1.0;
1319         if (PetscRealPart(a->val[shift + a->sliceheight * j]) >= 0.) continue;
1320         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1321       }
1322     }
1323     color = PETSC_DRAW_CYAN;
1324     for (i = 0; i < m; i++) {
1325       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1326       y_l   = m - i - 1.0;
1327       y_r   = y_l + 1.0;
1328       for (j = 0; j < a->rlen[i]; j++) {
1329         x_l = a->colidx[shift + a->sliceheight * j];
1330         x_r = x_l + 1.0;
1331         if (a->val[shift + a->sliceheight * j] != 0.) continue;
1332         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1333       }
1334     }
1335     color = PETSC_DRAW_RED;
1336     for (i = 0; i < m; i++) {
1337       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1338       y_l   = m - i - 1.0;
1339       y_r   = y_l + 1.0;
1340       for (j = 0; j < a->rlen[i]; j++) {
1341         x_l = a->colidx[shift + a->sliceheight * j];
1342         x_r = x_l + 1.0;
1343         if (PetscRealPart(a->val[shift + a->sliceheight * j]) <= 0.) continue;
1344         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1345       }
1346     }
1347     PetscDrawCollectiveEnd(draw);
1348   } else {
1349     /* use contour shading to indicate magnitude of values */
1350     /* first determine max of all nonzero values */
1351     PetscReal minv = 0.0, maxv = 0.0;
1352     PetscInt  count = 0;
1353     PetscDraw popup;
1354     for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1355       if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]);
1356     }
1357     if (minv >= maxv) maxv = minv + PETSC_SMALL;
1358     PetscCall(PetscDrawGetPopup(draw, &popup));
1359     PetscCall(PetscDrawScalePopup(popup, minv, maxv));
1360 
1361     PetscDrawCollectiveBegin(draw);
1362     for (i = 0; i < m; i++) {
1363       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1364       y_l   = m - i - 1.0;
1365       y_r   = y_l + 1.0;
1366       for (j = 0; j < a->rlen[i]; j++) {
1367         x_l   = a->colidx[shift + a->sliceheight * j];
1368         x_r   = x_l + 1.0;
1369         color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]), minv, maxv);
1370         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1371         count++;
1372       }
1373     }
1374     PetscDrawCollectiveEnd(draw);
1375   }
1376   PetscFunctionReturn(PETSC_SUCCESS);
1377 }
1378 
1379 #include <petscdraw.h>
1380 static PetscErrorCode MatView_SeqSELL_Draw(Mat A, PetscViewer viewer)
1381 {
1382   PetscDraw draw;
1383   PetscReal xr, yr, xl, yl, h, w;
1384   PetscBool isnull;
1385 
1386   PetscFunctionBegin;
1387   PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw));
1388   PetscCall(PetscDrawIsNull(draw, &isnull));
1389   if (isnull) PetscFunctionReturn(PETSC_SUCCESS);
1390 
1391   xr = A->cmap->n;
1392   yr = A->rmap->n;
1393   h  = yr / 10.0;
1394   w  = xr / 10.0;
1395   xr += w;
1396   yr += h;
1397   xl = -w;
1398   yl = -h;
1399   PetscCall(PetscDrawSetCoordinates(draw, xl, yl, xr, yr));
1400   PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer));
1401   PetscCall(PetscDrawZoom(draw, MatView_SeqSELL_Draw_Zoom, A));
1402   PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL));
1403   PetscCall(PetscDrawSave(draw));
1404   PetscFunctionReturn(PETSC_SUCCESS);
1405 }
1406 
1407 PetscErrorCode MatView_SeqSELL(Mat A, PetscViewer viewer)
1408 {
1409   PetscBool iascii, isbinary, isdraw;
1410 
1411   PetscFunctionBegin;
1412   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
1413   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
1414   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
1415   if (iascii) {
1416     PetscCall(MatView_SeqSELL_ASCII(A, viewer));
1417   } else if (isbinary) {
1418     /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */
1419   } else if (isdraw) PetscCall(MatView_SeqSELL_Draw(A, viewer));
1420   PetscFunctionReturn(PETSC_SUCCESS);
1421 }
1422 
1423 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A, MatAssemblyType mode)
1424 {
1425   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1426   PetscInt     i, shift, row_in_slice, row, nrow, *cp, lastcol, j, k;
1427   MatScalar   *vp;
1428 #if defined(PETSC_HAVE_CUPM)
1429   PetscInt totalchunks = 0;
1430 #endif
1431 
1432   PetscFunctionBegin;
1433   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
1434   /* To do: compress out the unused elements */
1435   PetscCall(MatMarkDiagonal_SeqSELL(A));
1436   PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n", A->rmap->n, A->cmap->n, a->maxallocmat, a->sliidx[a->totalslices], a->nz, a->sliidx[a->totalslices] - a->nz));
1437   PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n", a->reallocs));
1438   PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rlenmax));
1439   a->nonzerorowcnt = 0;
1440   /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */
1441   for (i = 0; i < a->totalslices; ++i) {
1442     shift = a->sliidx[i];                                                   /* starting index of the slice */
1443     cp    = PetscSafePointerPlusOffset(a->colidx, shift);                   /* pointer to the column indices of the slice */
1444     vp    = PetscSafePointerPlusOffset(a->val, shift);                      /* pointer to the nonzero values of the slice */
1445     for (row_in_slice = 0; row_in_slice < a->sliceheight; ++row_in_slice) { /* loop over rows in the slice */
1446       row  = a->sliceheight * i + row_in_slice;
1447       nrow = a->rlen[row]; /* number of nonzeros in row */
1448       /*
1449         Search for the nearest nonzero. Normally setting the index to zero may cause extra communication.
1450         But if the entire slice are empty, it is fine to use 0 since the index will not be loaded.
1451       */
1452       lastcol = 0;
1453       if (nrow > 0) { /* nonempty row */
1454         a->nonzerorowcnt++;
1455         lastcol = cp[a->sliceheight * (nrow - 1) + row_in_slice]; /* use the index from the last nonzero at current row */
1456       } else if (!row_in_slice) {                                 /* first row of the correct slice is empty */
1457         for (j = 1; j < a->sliceheight; j++) {
1458           if (a->rlen[a->sliceheight * i + j]) {
1459             lastcol = cp[j];
1460             break;
1461           }
1462         }
1463       } else {
1464         if (a->sliidx[i + 1] != shift) lastcol = cp[row_in_slice - 1]; /* use the index from the previous row */
1465       }
1466 
1467       for (k = nrow; k < (a->sliidx[i + 1] - shift) / a->sliceheight; ++k) {
1468         cp[a->sliceheight * k + row_in_slice] = lastcol;
1469         vp[a->sliceheight * k + row_in_slice] = (MatScalar)0;
1470       }
1471     }
1472   }
1473 
1474   A->info.mallocs += a->reallocs;
1475   a->reallocs = 0;
1476 
1477   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1478 #if defined(PETSC_HAVE_CUPM)
1479   if (!a->chunksize && a->totalslices) {
1480     a->chunksize = 64;
1481     while (a->chunksize < 1024 && 2 * a->chunksize <= a->sliidx[a->totalslices] / a->totalslices) a->chunksize *= 2;
1482     totalchunks = 1 + (a->sliidx[a->totalslices] - 1) / a->chunksize;
1483   }
1484   if (totalchunks != a->totalchunks) {
1485     PetscCall(PetscFree(a->chunk_slice_map));
1486     PetscCall(PetscMalloc1(totalchunks, &a->chunk_slice_map));
1487     a->totalchunks = totalchunks;
1488   }
1489   j = 0;
1490   for (i = 0; i < totalchunks; i++) {
1491     while (a->sliidx[j + 1] <= i * a->chunksize && j < a->totalslices) j++;
1492     a->chunk_slice_map[i] = j;
1493   }
1494 #endif
1495   PetscFunctionReturn(PETSC_SUCCESS);
1496 }
1497 
1498 PetscErrorCode MatGetInfo_SeqSELL(Mat A, MatInfoType flag, MatInfo *info)
1499 {
1500   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1501 
1502   PetscFunctionBegin;
1503   info->block_size   = 1.0;
1504   info->nz_allocated = a->maxallocmat;
1505   info->nz_used      = a->sliidx[a->totalslices]; /* include padding zeros */
1506   info->nz_unneeded  = (a->maxallocmat - a->sliidx[a->totalslices]);
1507   info->assemblies   = A->num_ass;
1508   info->mallocs      = A->info.mallocs;
1509   info->memory       = 0; /* REVIEW ME */
1510   if (A->factortype) {
1511     info->fill_ratio_given  = A->info.fill_ratio_given;
1512     info->fill_ratio_needed = A->info.fill_ratio_needed;
1513     info->factor_mallocs    = A->info.factor_mallocs;
1514   } else {
1515     info->fill_ratio_given  = 0;
1516     info->fill_ratio_needed = 0;
1517     info->factor_mallocs    = 0;
1518   }
1519   PetscFunctionReturn(PETSC_SUCCESS);
1520 }
1521 
1522 PetscErrorCode MatSetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is)
1523 {
1524   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1525   PetscInt     shift, i, k, l, low, high, t, ii, row, col, nrow;
1526   PetscInt    *cp, nonew = a->nonew, lastcol = -1;
1527   MatScalar   *vp, value;
1528 #if defined(PETSC_HAVE_CUPM)
1529   PetscBool inserted = PETSC_FALSE;
1530   PetscInt  mul      = DEVICE_MEM_ALIGN / a->sliceheight;
1531 #endif
1532 
1533   PetscFunctionBegin;
1534   for (k = 0; k < m; k++) { /* loop over added rows */
1535     row = im[k];
1536     if (row < 0) continue;
1537     PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
1538     shift = a->sliidx[row / a->sliceheight] + row % a->sliceheight; /* starting index of the row */
1539     cp    = a->colidx + shift;                                      /* pointer to the row */
1540     vp    = a->val + shift;                                         /* pointer to the row */
1541     nrow  = a->rlen[row];
1542     low   = 0;
1543     high  = nrow;
1544 
1545     for (l = 0; l < n; l++) { /* loop over added columns */
1546       col = in[l];
1547       if (col < 0) continue;
1548       PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
1549       if (a->roworiented) {
1550         value = v[l + k * n];
1551       } else {
1552         value = v[k + l * m];
1553       }
1554       if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue;
1555 
1556       /* search in this row for the specified column, i indicates the column to be set */
1557       if (col <= lastcol) low = 0;
1558       else high = nrow;
1559       lastcol = col;
1560       while (high - low > 5) {
1561         t = (low + high) / 2;
1562         if (*(cp + a->sliceheight * t) > col) high = t;
1563         else low = t;
1564       }
1565       for (i = low; i < high; i++) {
1566         if (*(cp + a->sliceheight * i) > col) break;
1567         if (*(cp + a->sliceheight * i) == col) {
1568           if (is == ADD_VALUES) *(vp + a->sliceheight * i) += value;
1569           else *(vp + a->sliceheight * i) = value;
1570 #if defined(PETSC_HAVE_CUPM)
1571           inserted = PETSC_TRUE;
1572 #endif
1573           low = i + 1;
1574           goto noinsert;
1575         }
1576       }
1577       if (value == 0.0 && a->ignorezeroentries) goto noinsert;
1578       if (nonew == 1) goto noinsert;
1579       PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col);
1580 #if defined(PETSC_HAVE_CUPM)
1581       MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar, mul);
1582 #else
1583       /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */
1584       MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar, 1);
1585 #endif
1586       /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */
1587       for (ii = nrow - 1; ii >= i; ii--) {
1588         *(cp + a->sliceheight * (ii + 1)) = *(cp + a->sliceheight * ii);
1589         *(vp + a->sliceheight * (ii + 1)) = *(vp + a->sliceheight * ii);
1590       }
1591       a->rlen[row]++;
1592       *(cp + a->sliceheight * i) = col;
1593       *(vp + a->sliceheight * i) = value;
1594       a->nz++;
1595 #if defined(PETSC_HAVE_CUPM)
1596       inserted = PETSC_TRUE;
1597 #endif
1598       low = i + 1;
1599       high++;
1600       nrow++;
1601     noinsert:;
1602     }
1603     a->rlen[row] = nrow;
1604   }
1605 #if defined(PETSC_HAVE_CUPM)
1606   if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && inserted) A->offloadmask = PETSC_OFFLOAD_CPU;
1607 #endif
1608   PetscFunctionReturn(PETSC_SUCCESS);
1609 }
1610 
1611 PetscErrorCode MatCopy_SeqSELL(Mat A, Mat B, MatStructure str)
1612 {
1613   PetscFunctionBegin;
1614   /* If the two matrices have the same copy implementation, use fast copy. */
1615   if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
1616     Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1617     Mat_SeqSELL *b = (Mat_SeqSELL *)B->data;
1618 
1619     PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices], PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Number of nonzeros in two matrices are different");
1620     PetscCall(PetscArraycpy(b->val, a->val, a->sliidx[a->totalslices]));
1621   } else {
1622     PetscCall(MatCopy_Basic(A, B, str));
1623   }
1624   PetscFunctionReturn(PETSC_SUCCESS);
1625 }
1626 
1627 PetscErrorCode MatSetUp_SeqSELL(Mat A)
1628 {
1629   PetscFunctionBegin;
1630   PetscCall(MatSeqSELLSetPreallocation(A, PETSC_DEFAULT, NULL));
1631   PetscFunctionReturn(PETSC_SUCCESS);
1632 }
1633 
1634 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A, PetscScalar *array[])
1635 {
1636   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1637 
1638   PetscFunctionBegin;
1639   *array = a->val;
1640   PetscFunctionReturn(PETSC_SUCCESS);
1641 }
1642 
1643 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A, PetscScalar *array[])
1644 {
1645   PetscFunctionBegin;
1646   PetscFunctionReturn(PETSC_SUCCESS);
1647 }
1648 
1649 PetscErrorCode MatScale_SeqSELL(Mat inA, PetscScalar alpha)
1650 {
1651   Mat_SeqSELL *a      = (Mat_SeqSELL *)inA->data;
1652   MatScalar   *aval   = a->val;
1653   PetscScalar  oalpha = alpha;
1654   PetscBLASInt one    = 1, size;
1655 
1656   PetscFunctionBegin;
1657   PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices], &size));
1658   PetscCallBLAS("BLASscal", BLASscal_(&size, &oalpha, aval, &one));
1659   PetscCall(PetscLogFlops(a->nz));
1660   PetscCall(MatSeqSELLInvalidateDiagonal(inA));
1661 #if defined(PETSC_HAVE_CUPM)
1662   if (inA->offloadmask != PETSC_OFFLOAD_UNALLOCATED) inA->offloadmask = PETSC_OFFLOAD_CPU;
1663 #endif
1664   PetscFunctionReturn(PETSC_SUCCESS);
1665 }
1666 
1667 PetscErrorCode MatShift_SeqSELL(Mat Y, PetscScalar a)
1668 {
1669   Mat_SeqSELL *y = (Mat_SeqSELL *)Y->data;
1670 
1671   PetscFunctionBegin;
1672   if (!Y->preallocated || !y->nz) PetscCall(MatSeqSELLSetPreallocation(Y, 1, NULL));
1673   PetscCall(MatShift_Basic(Y, a));
1674   PetscFunctionReturn(PETSC_SUCCESS);
1675 }
1676 
1677 PetscErrorCode MatSOR_SeqSELL(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
1678 {
1679   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
1680   PetscScalar       *x, sum, *t;
1681   const MatScalar   *idiag = NULL, *mdiag;
1682   const PetscScalar *b, *xb;
1683   PetscInt           n, m = A->rmap->n, i, j, shift;
1684   const PetscInt    *diag;
1685 
1686   PetscFunctionBegin;
1687   its = its * lits;
1688 
1689   if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */
1690   if (!a->idiagvalid) PetscCall(MatInvertDiagonal_SeqSELL(A, omega, fshift));
1691   a->fshift = fshift;
1692   a->omega  = omega;
1693 
1694   diag  = a->diag;
1695   t     = a->ssor_work;
1696   idiag = a->idiag;
1697   mdiag = a->mdiag;
1698 
1699   PetscCall(VecGetArray(xx, &x));
1700   PetscCall(VecGetArrayRead(bb, &b));
1701   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1702   PetscCheck(flag != SOR_APPLY_UPPER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_UPPER is not implemented");
1703   PetscCheck(flag != SOR_APPLY_LOWER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_LOWER is not implemented");
1704   PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat");
1705 
1706   if (flag & SOR_ZERO_INITIAL_GUESS) {
1707     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1708       for (i = 0; i < m; i++) {
1709         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1710         sum   = b[i];
1711         n     = (diag[i] - shift) / a->sliceheight;
1712         for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]];
1713         t[i] = sum;
1714         x[i] = sum * idiag[i];
1715       }
1716       xb = t;
1717       PetscCall(PetscLogFlops(a->nz));
1718     } else xb = b;
1719     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1720       for (i = m - 1; i >= 0; i--) {
1721         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1722         sum   = xb[i];
1723         n     = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1;
1724         for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]];
1725         if (xb == b) {
1726           x[i] = sum * idiag[i];
1727         } else {
1728           x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1729         }
1730       }
1731       PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1732     }
1733     its--;
1734   }
1735   while (its--) {
1736     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1737       for (i = 0; i < m; i++) {
1738         /* lower */
1739         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1740         sum   = b[i];
1741         n     = (diag[i] - shift) / a->sliceheight;
1742         for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]];
1743         t[i] = sum; /* save application of the lower-triangular part */
1744         /* upper */
1745         n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1;
1746         for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]];
1747         x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1748       }
1749       xb = t;
1750       PetscCall(PetscLogFlops(2.0 * a->nz));
1751     } else xb = b;
1752     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1753       for (i = m - 1; i >= 0; i--) {
1754         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1755         sum   = xb[i];
1756         if (xb == b) {
1757           /* whole matrix (no checkpointing available) */
1758           n = a->rlen[i];
1759           for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]];
1760           x[i] = (1. - omega) * x[i] + (sum + mdiag[i] * x[i]) * idiag[i];
1761         } else { /* lower-triangular part has been saved, so only apply upper-triangular */
1762           n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1;
1763           for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]];
1764           x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1765         }
1766       }
1767       if (xb == b) {
1768         PetscCall(PetscLogFlops(2.0 * a->nz));
1769       } else {
1770         PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1771       }
1772     }
1773   }
1774   PetscCall(VecRestoreArray(xx, &x));
1775   PetscCall(VecRestoreArrayRead(bb, &b));
1776   PetscFunctionReturn(PETSC_SUCCESS);
1777 }
1778 
1779 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL,
1780                                        MatGetRow_SeqSELL,
1781                                        MatRestoreRow_SeqSELL,
1782                                        MatMult_SeqSELL,
1783                                        /* 4*/ MatMultAdd_SeqSELL,
1784                                        MatMultTranspose_SeqSELL,
1785                                        MatMultTransposeAdd_SeqSELL,
1786                                        NULL,
1787                                        NULL,
1788                                        NULL,
1789                                        /* 10*/ NULL,
1790                                        NULL,
1791                                        NULL,
1792                                        MatSOR_SeqSELL,
1793                                        NULL,
1794                                        /* 15*/ MatGetInfo_SeqSELL,
1795                                        MatEqual_SeqSELL,
1796                                        MatGetDiagonal_SeqSELL,
1797                                        MatDiagonalScale_SeqSELL,
1798                                        NULL,
1799                                        /* 20*/ NULL,
1800                                        MatAssemblyEnd_SeqSELL,
1801                                        MatSetOption_SeqSELL,
1802                                        MatZeroEntries_SeqSELL,
1803                                        /* 24*/ NULL,
1804                                        NULL,
1805                                        NULL,
1806                                        NULL,
1807                                        NULL,
1808                                        /* 29*/ MatSetUp_SeqSELL,
1809                                        NULL,
1810                                        NULL,
1811                                        NULL,
1812                                        NULL,
1813                                        /* 34*/ MatDuplicate_SeqSELL,
1814                                        NULL,
1815                                        NULL,
1816                                        NULL,
1817                                        NULL,
1818                                        /* 39*/ NULL,
1819                                        NULL,
1820                                        NULL,
1821                                        MatGetValues_SeqSELL,
1822                                        MatCopy_SeqSELL,
1823                                        /* 44*/ NULL,
1824                                        MatScale_SeqSELL,
1825                                        MatShift_SeqSELL,
1826                                        NULL,
1827                                        NULL,
1828                                        /* 49*/ NULL,
1829                                        NULL,
1830                                        NULL,
1831                                        NULL,
1832                                        NULL,
1833                                        /* 54*/ MatFDColoringCreate_SeqXAIJ,
1834                                        NULL,
1835                                        NULL,
1836                                        NULL,
1837                                        NULL,
1838                                        /* 59*/ NULL,
1839                                        MatDestroy_SeqSELL,
1840                                        MatView_SeqSELL,
1841                                        NULL,
1842                                        NULL,
1843                                        /* 64*/ NULL,
1844                                        NULL,
1845                                        NULL,
1846                                        NULL,
1847                                        NULL,
1848                                        /* 69*/ NULL,
1849                                        NULL,
1850                                        NULL,
1851                                        NULL,
1852                                        NULL,
1853                                        /* 74*/ NULL,
1854                                        MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */
1855                                        NULL,
1856                                        NULL,
1857                                        NULL,
1858                                        /* 79*/ NULL,
1859                                        NULL,
1860                                        NULL,
1861                                        NULL,
1862                                        NULL,
1863                                        /* 84*/ NULL,
1864                                        NULL,
1865                                        NULL,
1866                                        NULL,
1867                                        NULL,
1868                                        /* 89*/ NULL,
1869                                        NULL,
1870                                        NULL,
1871                                        NULL,
1872                                        NULL,
1873                                        /* 94*/ NULL,
1874                                        NULL,
1875                                        NULL,
1876                                        NULL,
1877                                        NULL,
1878                                        /* 99*/ NULL,
1879                                        NULL,
1880                                        NULL,
1881                                        MatConjugate_SeqSELL,
1882                                        NULL,
1883                                        /*104*/ NULL,
1884                                        NULL,
1885                                        NULL,
1886                                        NULL,
1887                                        NULL,
1888                                        /*109*/ NULL,
1889                                        NULL,
1890                                        NULL,
1891                                        NULL,
1892                                        MatMissingDiagonal_SeqSELL,
1893                                        /*114*/ NULL,
1894                                        NULL,
1895                                        NULL,
1896                                        NULL,
1897                                        NULL,
1898                                        /*119*/ NULL,
1899                                        NULL,
1900                                        NULL,
1901                                        NULL,
1902                                        NULL,
1903                                        /*124*/ NULL,
1904                                        NULL,
1905                                        NULL,
1906                                        NULL,
1907                                        NULL,
1908                                        /*129*/ NULL,
1909                                        NULL,
1910                                        NULL,
1911                                        NULL,
1912                                        NULL,
1913                                        /*134*/ NULL,
1914                                        NULL,
1915                                        NULL,
1916                                        NULL,
1917                                        NULL,
1918                                        /*139*/ NULL,
1919                                        NULL,
1920                                        NULL,
1921                                        MatFDColoringSetUp_SeqXAIJ,
1922                                        NULL,
1923                                        /*144*/ NULL,
1924                                        NULL,
1925                                        NULL,
1926                                        NULL,
1927                                        NULL,
1928                                        NULL,
1929                                        /*150*/ NULL,
1930                                        NULL,
1931                                        NULL,
1932                                        NULL,
1933                                        NULL,
1934                                        /*155*/ NULL,
1935                                        NULL};
1936 
1937 static PetscErrorCode MatStoreValues_SeqSELL(Mat mat)
1938 {
1939   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1940 
1941   PetscFunctionBegin;
1942   PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1943 
1944   /* allocate space for values if not already there */
1945   if (!a->saved_values) PetscCall(PetscMalloc1(a->sliidx[a->totalslices] + 1, &a->saved_values));
1946 
1947   /* copy values over */
1948   PetscCall(PetscArraycpy(a->saved_values, a->val, a->sliidx[a->totalslices]));
1949   PetscFunctionReturn(PETSC_SUCCESS);
1950 }
1951 
1952 static PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat)
1953 {
1954   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1955 
1956   PetscFunctionBegin;
1957   PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1958   PetscCheck(a->saved_values, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatStoreValues(A);first");
1959   PetscCall(PetscArraycpy(a->val, a->saved_values, a->sliidx[a->totalslices]));
1960   PetscFunctionReturn(PETSC_SUCCESS);
1961 }
1962 
1963 static PetscErrorCode MatSeqSELLGetFillRatio_SeqSELL(Mat mat, PetscReal *ratio)
1964 {
1965   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1966 
1967   PetscFunctionBegin;
1968   if (a->totalslices && a->sliidx[a->totalslices]) {
1969     *ratio = (PetscReal)(a->sliidx[a->totalslices] - a->nz) / a->sliidx[a->totalslices];
1970   } else {
1971     *ratio = 0.0;
1972   }
1973   PetscFunctionReturn(PETSC_SUCCESS);
1974 }
1975 
1976 static PetscErrorCode MatSeqSELLGetMaxSliceWidth_SeqSELL(Mat mat, PetscInt *slicewidth)
1977 {
1978   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1979   PetscInt     i, current_slicewidth;
1980 
1981   PetscFunctionBegin;
1982   *slicewidth = 0;
1983   for (i = 0; i < a->totalslices; i++) {
1984     current_slicewidth = (a->sliidx[i + 1] - a->sliidx[i]) / a->sliceheight;
1985     if (current_slicewidth > *slicewidth) *slicewidth = current_slicewidth;
1986   }
1987   PetscFunctionReturn(PETSC_SUCCESS);
1988 }
1989 
1990 static PetscErrorCode MatSeqSELLGetAvgSliceWidth_SeqSELL(Mat mat, PetscReal *slicewidth)
1991 {
1992   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1993 
1994   PetscFunctionBegin;
1995   *slicewidth = 0;
1996   if (a->totalslices) { *slicewidth = (PetscReal)a->sliidx[a->totalslices] / a->sliceheight / a->totalslices; }
1997   PetscFunctionReturn(PETSC_SUCCESS);
1998 }
1999 
2000 static PetscErrorCode MatSeqSELLGetVarSliceSize_SeqSELL(Mat mat, PetscReal *variance)
2001 {
2002   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
2003   PetscReal    mean;
2004   PetscInt     i, totalslices = a->totalslices, *sliidx = a->sliidx;
2005 
2006   PetscFunctionBegin;
2007   *variance = 0;
2008   if (totalslices) {
2009     mean = (PetscReal)sliidx[totalslices] / totalslices;
2010     for (i = 1; i <= totalslices; i++) { *variance += ((PetscReal)(sliidx[i] - sliidx[i - 1]) - mean) * ((PetscReal)(sliidx[i] - sliidx[i - 1]) - mean) / totalslices; }
2011   }
2012   PetscFunctionReturn(PETSC_SUCCESS);
2013 }
2014 
2015 static PetscErrorCode MatSeqSELLSetSliceHeight_SeqSELL(Mat A, PetscInt sliceheight)
2016 {
2017   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2018 
2019   PetscFunctionBegin;
2020   if (A->preallocated) PetscFunctionReturn(PETSC_SUCCESS);
2021   PetscCheck(a->sliceheight <= 0 || a->sliceheight == sliceheight, PETSC_COMM_SELF, PETSC_ERR_SUP, "Cannot change slice height %" PetscInt_FMT " to %" PetscInt_FMT, a->sliceheight, sliceheight);
2022   a->sliceheight = sliceheight;
2023 #if defined(PETSC_HAVE_CUPM)
2024   PetscCheck(PetscMax(DEVICE_MEM_ALIGN, sliceheight) % PetscMin(DEVICE_MEM_ALIGN, sliceheight) == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The slice height is not compatible with DEVICE_MEM_ALIGN (one must be divisible by the other) %" PetscInt_FMT, sliceheight);
2025 #endif
2026   PetscFunctionReturn(PETSC_SUCCESS);
2027 }
2028 
2029 /*@
2030   MatSeqSELLGetFillRatio - returns a ratio that indicates the irregularity of the matrix.
2031 
2032   Not Collective
2033 
2034   Input Parameter:
2035 . A - a MATSEQSELL matrix
2036 
2037   Output Parameter:
2038 . ratio - ratio of number of padded zeros to number of allocated elements
2039 
2040   Level: intermediate
2041 
2042 .seealso: `MATSEQSELL`, `MatSeqSELLGetAvgSliceWidth()`
2043 @*/
2044 PetscErrorCode MatSeqSELLGetFillRatio(Mat A, PetscReal *ratio)
2045 {
2046   PetscFunctionBegin;
2047   PetscUseMethod(A, "MatSeqSELLGetFillRatio_C", (Mat, PetscReal *), (A, ratio));
2048   PetscFunctionReturn(PETSC_SUCCESS);
2049 }
2050 
2051 /*@
2052   MatSeqSELLGetMaxSliceWidth - returns the maximum slice width.
2053 
2054   Not Collective
2055 
2056   Input Parameter:
2057 . A - a MATSEQSELL matrix
2058 
2059   Output Parameter:
2060 . slicewidth - maximum slice width
2061 
2062   Level: intermediate
2063 
2064 .seealso: `MATSEQSELL`, `MatSeqSELLGetAvgSliceWidth()`
2065 @*/
2066 PetscErrorCode MatSeqSELLGetMaxSliceWidth(Mat A, PetscInt *slicewidth)
2067 {
2068   PetscFunctionBegin;
2069   PetscUseMethod(A, "MatSeqSELLGetMaxSliceWidth_C", (Mat, PetscInt *), (A, slicewidth));
2070   PetscFunctionReturn(PETSC_SUCCESS);
2071 }
2072 
2073 /*@
2074   MatSeqSELLGetAvgSliceWidth - returns the average slice width.
2075 
2076   Not Collective
2077 
2078   Input Parameter:
2079 . A - a MATSEQSELL matrix
2080 
2081   Output Parameter:
2082 . slicewidth - average slice width
2083 
2084   Level: intermediate
2085 
2086 .seealso: `MATSEQSELL`, `MatSeqSELLGetMaxSliceWidth()`
2087 @*/
2088 PetscErrorCode MatSeqSELLGetAvgSliceWidth(Mat A, PetscReal *slicewidth)
2089 {
2090   PetscFunctionBegin;
2091   PetscUseMethod(A, "MatSeqSELLGetAvgSliceWidth_C", (Mat, PetscReal *), (A, slicewidth));
2092   PetscFunctionReturn(PETSC_SUCCESS);
2093 }
2094 
2095 /*@
2096   MatSeqSELLSetSliceHeight - sets the slice height.
2097 
2098   Not Collective
2099 
2100   Input Parameters:
2101 + A           - a MATSEQSELL matrix
2102 - sliceheight - slice height
2103 
2104   Notes:
2105   You cannot change the slice height once it have been set.
2106 
2107   The slice height must be set before MatSetUp() or MatXXXSetPreallocation() is called.
2108 
2109   Level: intermediate
2110 
2111 .seealso: `MATSEQSELL`, `MatSeqSELLGetVarSliceSize()`
2112 @*/
2113 PetscErrorCode MatSeqSELLSetSliceHeight(Mat A, PetscInt sliceheight)
2114 {
2115   PetscFunctionBegin;
2116   PetscUseMethod(A, "MatSeqSELLSetSliceHeight_C", (Mat, PetscInt), (A, sliceheight));
2117   PetscFunctionReturn(PETSC_SUCCESS);
2118 }
2119 
2120 /*@
2121   MatSeqSELLGetVarSliceSize - returns the variance of the slice size.
2122 
2123   Not Collective
2124 
2125   Input Parameter:
2126 . A - a MATSEQSELL matrix
2127 
2128   Output Parameter:
2129 . variance - variance of the slice size
2130 
2131   Level: intermediate
2132 
2133 .seealso: `MATSEQSELL`, `MatSeqSELLSetSliceHeight()`
2134 @*/
2135 PetscErrorCode MatSeqSELLGetVarSliceSize(Mat A, PetscReal *variance)
2136 {
2137   PetscFunctionBegin;
2138   PetscUseMethod(A, "MatSeqSELLGetVarSliceSize_C", (Mat, PetscReal *), (A, variance));
2139   PetscFunctionReturn(PETSC_SUCCESS);
2140 }
2141 
2142 #if defined(PETSC_HAVE_CUDA)
2143 PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat);
2144 #endif
2145 #if defined(PETSC_HAVE_HIP)
2146 PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLHIP(Mat);
2147 #endif
2148 
2149 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B)
2150 {
2151   Mat_SeqSELL *b;
2152   PetscMPIInt  size;
2153 
2154   PetscFunctionBegin;
2155   PetscCall(PetscCitationsRegister(citation, &cited));
2156   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
2157   PetscCheck(size <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Comm must be of size 1");
2158 
2159   PetscCall(PetscNew(&b));
2160 
2161   B->data   = (void *)b;
2162   B->ops[0] = MatOps_Values;
2163 
2164   b->row                = NULL;
2165   b->col                = NULL;
2166   b->icol               = NULL;
2167   b->reallocs           = 0;
2168   b->ignorezeroentries  = PETSC_FALSE;
2169   b->roworiented        = PETSC_TRUE;
2170   b->nonew              = 0;
2171   b->diag               = NULL;
2172   b->solve_work         = NULL;
2173   B->spptr              = NULL;
2174   b->saved_values       = NULL;
2175   b->idiag              = NULL;
2176   b->mdiag              = NULL;
2177   b->ssor_work          = NULL;
2178   b->omega              = 1.0;
2179   b->fshift             = 0.0;
2180   b->idiagvalid         = PETSC_FALSE;
2181   b->keepnonzeropattern = PETSC_FALSE;
2182   b->sliceheight        = 0;
2183 
2184   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELL));
2185   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetArray_C", MatSeqSELLGetArray_SeqSELL));
2186   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLRestoreArray_C", MatSeqSELLRestoreArray_SeqSELL));
2187   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqSELL));
2188   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqSELL));
2189   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetPreallocation_C", MatSeqSELLSetPreallocation_SeqSELL));
2190   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqaij_C", MatConvert_SeqSELL_SeqAIJ));
2191 #if defined(PETSC_HAVE_CUDA)
2192   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqsellcuda_C", MatConvert_SeqSELL_SeqSELLCUDA));
2193 #endif
2194 #if defined(PETSC_HAVE_HIP)
2195   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqsellhip_C", MatConvert_SeqSELL_SeqSELLHIP));
2196 #endif
2197   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetFillRatio_C", MatSeqSELLGetFillRatio_SeqSELL));
2198   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetMaxSliceWidth_C", MatSeqSELLGetMaxSliceWidth_SeqSELL));
2199   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetAvgSliceWidth_C", MatSeqSELLGetAvgSliceWidth_SeqSELL));
2200   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetVarSliceSize_C", MatSeqSELLGetVarSliceSize_SeqSELL));
2201   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetSliceHeight_C", MatSeqSELLSetSliceHeight_SeqSELL));
2202 
2203   PetscObjectOptionsBegin((PetscObject)B);
2204   {
2205     PetscInt  newsh = -1;
2206     PetscBool flg;
2207 #if defined(PETSC_HAVE_CUPM)
2208     PetscInt chunksize = 0;
2209 #endif
2210 
2211     PetscCall(PetscOptionsInt("-mat_sell_slice_height", "Set the slice height used to store SELL matrix", "MatSELLSetSliceHeight", newsh, &newsh, &flg));
2212     if (flg) { PetscCall(MatSeqSELLSetSliceHeight(B, newsh)); }
2213 #if defined(PETSC_HAVE_CUPM)
2214     PetscCall(PetscOptionsInt("-mat_sell_chunk_size", "Set the chunksize for load-balanced CUDA/HIP kernels. Choices include 64,128,256,512,1024", NULL, chunksize, &chunksize, &flg));
2215     if (flg) {
2216       PetscCheck(chunksize >= 64 && chunksize <= 1024 && chunksize % 64 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "chunksize must be a number in {64,128,256,512,1024}: value %" PetscInt_FMT, chunksize);
2217       b->chunksize = chunksize;
2218     }
2219 #endif
2220   }
2221   PetscOptionsEnd();
2222   PetscFunctionReturn(PETSC_SUCCESS);
2223 }
2224 
2225 /*
2226  Given a matrix generated with MatGetFactor() duplicates all the information in A into B
2227  */
2228 static PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace)
2229 {
2230   Mat_SeqSELL *c = (Mat_SeqSELL *)C->data, *a = (Mat_SeqSELL *)A->data;
2231   PetscInt     i, m                           = A->rmap->n;
2232   PetscInt     totalslices = a->totalslices;
2233 
2234   PetscFunctionBegin;
2235   C->factortype = A->factortype;
2236   c->row        = NULL;
2237   c->col        = NULL;
2238   c->icol       = NULL;
2239   c->reallocs   = 0;
2240   C->assembled  = PETSC_TRUE;
2241 
2242   PetscCall(PetscLayoutReference(A->rmap, &C->rmap));
2243   PetscCall(PetscLayoutReference(A->cmap, &C->cmap));
2244 
2245   c->sliceheight = a->sliceheight;
2246   PetscCall(PetscMalloc1(c->sliceheight * totalslices, &c->rlen));
2247   PetscCall(PetscMalloc1(totalslices + 1, &c->sliidx));
2248 
2249   for (i = 0; i < m; i++) c->rlen[i] = a->rlen[i];
2250   for (i = 0; i < totalslices + 1; i++) c->sliidx[i] = a->sliidx[i];
2251 
2252   /* allocate the matrix space */
2253   if (mallocmatspace) {
2254     PetscCall(PetscMalloc2(a->maxallocmat, &c->val, a->maxallocmat, &c->colidx));
2255 
2256     c->singlemalloc = PETSC_TRUE;
2257 
2258     if (m > 0) {
2259       PetscCall(PetscArraycpy(c->colidx, a->colidx, a->maxallocmat));
2260       if (cpvalues == MAT_COPY_VALUES) {
2261         PetscCall(PetscArraycpy(c->val, a->val, a->maxallocmat));
2262       } else {
2263         PetscCall(PetscArrayzero(c->val, a->maxallocmat));
2264       }
2265     }
2266   }
2267 
2268   c->ignorezeroentries = a->ignorezeroentries;
2269   c->roworiented       = a->roworiented;
2270   c->nonew             = a->nonew;
2271   if (a->diag) {
2272     PetscCall(PetscMalloc1(m, &c->diag));
2273     for (i = 0; i < m; i++) c->diag[i] = a->diag[i];
2274   } else c->diag = NULL;
2275 
2276   c->solve_work         = NULL;
2277   c->saved_values       = NULL;
2278   c->idiag              = NULL;
2279   c->ssor_work          = NULL;
2280   c->keepnonzeropattern = a->keepnonzeropattern;
2281   c->free_val           = PETSC_TRUE;
2282   c->free_colidx        = PETSC_TRUE;
2283 
2284   c->maxallocmat  = a->maxallocmat;
2285   c->maxallocrow  = a->maxallocrow;
2286   c->rlenmax      = a->rlenmax;
2287   c->nz           = a->nz;
2288   C->preallocated = PETSC_TRUE;
2289 
2290   c->nonzerorowcnt = a->nonzerorowcnt;
2291   C->nonzerostate  = A->nonzerostate;
2292 
2293   PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist));
2294   PetscFunctionReturn(PETSC_SUCCESS);
2295 }
2296 
2297 PetscErrorCode MatDuplicate_SeqSELL(Mat A, MatDuplicateOption cpvalues, Mat *B)
2298 {
2299   PetscFunctionBegin;
2300   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
2301   PetscCall(MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n));
2302   if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2303   PetscCall(MatSetType(*B, ((PetscObject)A)->type_name));
2304   PetscCall(MatDuplicateNoCreate_SeqSELL(*B, A, cpvalues, PETSC_TRUE));
2305   PetscFunctionReturn(PETSC_SUCCESS);
2306 }
2307 
2308 /*MC
2309    MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices,
2310    based on the sliced Ellpack format, {cite}`zhangellpack2018`
2311 
2312    Options Database Key:
2313 . -mat_type seqsell - sets the matrix type to "`MATSEQELL` during a call to `MatSetFromOptions()`
2314 
2315    Level: beginner
2316 
2317 .seealso: `Mat`, `MatCreateSeqSELL()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ`
2318 M*/
2319 
2320 /*MC
2321    MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices, {cite}`zhangellpack2018`
2322 
2323    This matrix type is identical to `MATSEQSELL` when constructed with a single process communicator,
2324    and `MATMPISELL` otherwise.  As a result, for single process communicators,
2325   `MatSeqSELLSetPreallocation()` is supported, and similarly `MatMPISELLSetPreallocation()` is supported
2326   for communicators controlling multiple processes.  It is recommended that you call both of
2327   the above preallocation routines for simplicity.
2328 
2329    Options Database Key:
2330 . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions()
2331 
2332   Level: beginner
2333 
2334   Notes:
2335   This format is only supported for real scalars, double precision, and 32-bit indices (the defaults).
2336 
2337   It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of
2338   non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement.
2339 
2340   Developer Notes:
2341   On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance.
2342 
2343   The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8
2344 .vb
2345                             (2 0  3 4)
2346    Consider the matrix A =  (5 0  6 0)
2347                             (0 0  7 8)
2348                             (0 0  9 9)
2349 
2350    symbolically the Ellpack format can be written as
2351 
2352         (2 3 4 |)           (0 2 3 |)
2353    v =  (5 6 0 |)  colidx = (0 2 2 |)
2354         --------            ---------
2355         (7 8 |)             (2 3 |)
2356         (9 9 |)             (2 3 |)
2357 
2358     The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case).
2359     Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and
2360     zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice.
2361 
2362     The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9)  and for colidx is (0 0 2 2 3 2 2 2 3 3)
2363 
2364 .ve
2365 
2366     See `MatMult_SeqSELL()` for how this format is used with the SIMD operations to achieve high performance.
2367 
2368 .seealso: `Mat`, `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSELL()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ`
2369 M*/
2370 
2371 /*@
2372   MatCreateSeqSELL - Creates a sparse matrix in `MATSEQSELL` format.
2373 
2374   Collective
2375 
2376   Input Parameters:
2377 + comm    - MPI communicator, set to `PETSC_COMM_SELF`
2378 . m       - number of rows
2379 . n       - number of columns
2380 . rlenmax - maximum number of nonzeros in a row, ignored if `rlen` is provided
2381 - rlen    - array containing the number of nonzeros in the various rows (possibly different for each row) or NULL
2382 
2383   Output Parameter:
2384 . A - the matrix
2385 
2386   Level: intermediate
2387 
2388   Notes:
2389   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
2390   MatXXXXSetPreallocation() paradigm instead of this routine directly.
2391   [MatXXXXSetPreallocation() is, for example, `MatSeqSELLSetPreallocation()`]
2392 
2393   Specify the preallocated storage with either `rlenmax` or `rlen` (not both).
2394   Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory
2395   allocation.
2396 
2397 .seealso: `Mat`, `MATSEQSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATMPISELL`
2398  @*/
2399 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt rlenmax, const PetscInt rlen[], Mat *A)
2400 {
2401   PetscFunctionBegin;
2402   PetscCall(MatCreate(comm, A));
2403   PetscCall(MatSetSizes(*A, m, n, m, n));
2404   PetscCall(MatSetType(*A, MATSEQSELL));
2405   PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A, rlenmax, rlen));
2406   PetscFunctionReturn(PETSC_SUCCESS);
2407 }
2408 
2409 PetscErrorCode MatEqual_SeqSELL(Mat A, Mat B, PetscBool *flg)
2410 {
2411   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data, *b = (Mat_SeqSELL *)B->data;
2412   PetscInt     totalslices = a->totalslices;
2413 
2414   PetscFunctionBegin;
2415   /* If the  matrix dimensions are not equal,or no of nonzeros */
2416   if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) || (a->nz != b->nz) || (a->rlenmax != b->rlenmax)) {
2417     *flg = PETSC_FALSE;
2418     PetscFunctionReturn(PETSC_SUCCESS);
2419   }
2420   /* if the a->colidx are the same */
2421   PetscCall(PetscArraycmp(a->colidx, b->colidx, a->sliidx[totalslices], flg));
2422   if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
2423   /* if a->val are the same */
2424   PetscCall(PetscArraycmp(a->val, b->val, a->sliidx[totalslices], flg));
2425   PetscFunctionReturn(PETSC_SUCCESS);
2426 }
2427 
2428 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A)
2429 {
2430   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2431 
2432   PetscFunctionBegin;
2433   a->idiagvalid = PETSC_FALSE;
2434   PetscFunctionReturn(PETSC_SUCCESS);
2435 }
2436 
2437 PetscErrorCode MatConjugate_SeqSELL(Mat A)
2438 {
2439 #if defined(PETSC_USE_COMPLEX)
2440   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2441   PetscInt     i;
2442   PetscScalar *val = a->val;
2443 
2444   PetscFunctionBegin;
2445   for (i = 0; i < a->sliidx[a->totalslices]; i++) { val[i] = PetscConj(val[i]); }
2446   #if defined(PETSC_HAVE_CUPM)
2447   if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU;
2448   #endif
2449 #else
2450   PetscFunctionBegin;
2451 #endif
2452   PetscFunctionReturn(PETSC_SUCCESS);
2453 }
2454