xref: /petsc/src/mat/impls/sell/seq/sell.c (revision e91c04dfc8a52dee1965211bb1cc8e5bf775178f)
1 /*
2   Defines the basic matrix operations for the SELL matrix storage format.
3 */
4 #include <../src/mat/impls/sell/seq/sell.h> /*I   "petscmat.h"  I*/
5 #include <petscblaslapack.h>
6 #include <petsc/private/kernels/blocktranspose.h>
7 
8 static PetscBool  cited      = PETSC_FALSE;
9 static const char citation[] = "@inproceedings{ZhangELLPACK2018,\n"
10                                " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n"
11                                " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n"
12                                " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n"
13                                " year = 2018\n"
14                                "}\n";
15 
16 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
17 
18   #include <immintrin.h>
19 
20   #if !defined(_MM_SCALE_8)
21     #define _MM_SCALE_8 8
22   #endif
23 
24   #if defined(__AVX512F__)
25     /* these do not work
26    vec_idx  = _mm512_loadunpackhi_epi32(vec_idx,acolidx);
27    vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval);
28   */
29     #define AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
30       /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \
31       vec_idx  = _mm256_loadu_si256((__m256i const *)acolidx); \
32       vec_vals = _mm512_loadu_pd(aval); \
33       vec_x    = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); \
34       vec_y    = _mm512_fmadd_pd(vec_x, vec_vals, vec_y)
35   #elif defined(__AVX2__) && defined(__FMA__)
36     #define AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
37       vec_vals = _mm256_loadu_pd(aval); \
38       vec_idx  = _mm_loadu_si128((__m128i const *)acolidx); /* SSE2 */ \
39       vec_x    = _mm256_i32gather_pd(x, vec_idx, _MM_SCALE_8); \
40       vec_y    = _mm256_fmadd_pd(vec_x, vec_vals, vec_y)
41   #endif
42 #endif /* PETSC_HAVE_IMMINTRIN_H */
43 
44 /*@
45   MatSeqSELLSetPreallocation - For good matrix assembly performance
46   the user should preallocate the matrix storage by setting the parameter `nz`
47   (or the array `nnz`).
48 
49   Collective
50 
51   Input Parameters:
52 + B       - The `MATSEQSELL` matrix
53 . rlenmax - number of nonzeros per row (same for all rows), ignored if `rlen` is provided
54 - rlen    - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
55 
56   Level: intermediate
57 
58   Notes:
59   Specify the preallocated storage with either `rlenmax` or `rlen` (not both).
60   Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory
61   allocation.
62 
63   You can call `MatGetInfo()` to get information on how effective the preallocation was;
64   for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
65   You can also run with the option `-info` and look for messages with the string
66   malloc in them to see if additional memory allocation was needed.
67 
68   Developer Notes:
69   Use `rlenmax` of `MAT_SKIP_ALLOCATION` to not allocate any space for the matrix
70   entries or columns indices.
71 
72   The maximum number of nonzeos in any row should be as accurate as possible.
73   If it is underestimated, you will get bad performance due to reallocation
74   (`MatSeqXSELLReallocateSELL()`).
75 
76 .seealso: `Mat`, `MATSEQSELL`, `MATSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()`
77  @*/
78 PetscErrorCode MatSeqSELLSetPreallocation(Mat B, PetscInt rlenmax, const PetscInt rlen[])
79 {
80   PetscFunctionBegin;
81   PetscValidHeaderSpecific(B, MAT_CLASSID, 1);
82   PetscValidType(B, 1);
83   PetscTryMethod(B, "MatSeqSELLSetPreallocation_C", (Mat, PetscInt, const PetscInt[]), (B, rlenmax, rlen));
84   PetscFunctionReturn(PETSC_SUCCESS);
85 }
86 
87 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B, PetscInt maxallocrow, const PetscInt rlen[])
88 {
89   Mat_SeqSELL *b;
90   PetscInt     i, j, totalslices;
91 #if defined(PETSC_HAVE_CUPM)
92   PetscInt rlenmax = 0;
93 #endif
94   PetscBool skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE;
95 
96   PetscFunctionBegin;
97   if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE;
98   if (maxallocrow == MAT_SKIP_ALLOCATION) {
99     skipallocation = PETSC_TRUE;
100     maxallocrow    = 0;
101   }
102 
103   PetscCall(PetscLayoutSetUp(B->rmap));
104   PetscCall(PetscLayoutSetUp(B->cmap));
105 
106   /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */
107   if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5;
108   PetscCheck(maxallocrow >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "maxallocrow cannot be less than 0: value %" PetscInt_FMT, maxallocrow);
109   if (rlen) {
110     for (i = 0; i < B->rmap->n; i++) {
111       PetscCheck(rlen[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, rlen[i]);
112       PetscCheck(rlen[i] <= B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT, i, rlen[i], B->cmap->n);
113     }
114   }
115 
116   B->preallocated = PETSC_TRUE;
117 
118   b = (Mat_SeqSELL *)B->data;
119 
120   if (!b->sliceheight) { /* not set yet */
121 #if defined(PETSC_HAVE_CUPM)
122     b->sliceheight = 16;
123 #else
124     b->sliceheight = 8;
125 #endif
126   }
127   totalslices    = PetscCeilInt(B->rmap->n, b->sliceheight);
128   b->totalslices = totalslices;
129   if (!skipallocation) {
130     if (B->rmap->n % b->sliceheight) PetscCall(PetscInfo(B, "Padding rows to the SEQSELL matrix because the number of rows is not the multiple of the slice height (value %" PetscInt_FMT ")\n", B->rmap->n));
131 
132     if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */
133       PetscCall(PetscMalloc1(totalslices + 1, &b->sliidx));
134     }
135     if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */
136       if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10;
137       else if (maxallocrow < 0) maxallocrow = 1;
138 #if defined(PETSC_HAVE_CUPM)
139       rlenmax = maxallocrow;
140       /* Pad the slice to DEVICE_MEM_ALIGN */
141       while (b->sliceheight * maxallocrow % DEVICE_MEM_ALIGN) maxallocrow++;
142 #endif
143       for (i = 0; i <= totalslices; i++) b->sliidx[i] = b->sliceheight * i * maxallocrow;
144     } else {
145 #if defined(PETSC_HAVE_CUPM)
146       PetscInt mul = DEVICE_MEM_ALIGN / b->sliceheight;
147 #endif
148       maxallocrow  = 0;
149       b->sliidx[0] = 0;
150       for (i = 1; i < totalslices; i++) {
151         b->sliidx[i] = 0;
152         for (j = 0; j < b->sliceheight; j++) { b->sliidx[i] = PetscMax(b->sliidx[i], rlen[b->sliceheight * (i - 1) + j]); }
153 #if defined(PETSC_HAVE_CUPM)
154         if (mul != 0) { /* Pad the slice to DEVICE_MEM_ALIGN if sliceheight < DEVICE_MEM_ALIGN */
155           rlenmax      = PetscMax(b->sliidx[i], rlenmax);
156           b->sliidx[i] = ((b->sliidx[i] - 1) / mul + 1) * mul;
157         }
158 #endif
159         maxallocrow = PetscMax(b->sliidx[i], maxallocrow);
160         PetscCall(PetscIntSumError(b->sliidx[i - 1], b->sliceheight * b->sliidx[i], &b->sliidx[i]));
161       }
162       /* last slice */
163       b->sliidx[totalslices] = 0;
164       for (j = b->sliceheight * (totalslices - 1); j < B->rmap->n; j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices], rlen[j]);
165 #if defined(PETSC_HAVE_CUPM)
166       if (mul != 0) {
167         rlenmax                = PetscMax(b->sliidx[i], rlenmax);
168         b->sliidx[totalslices] = ((b->sliidx[totalslices] - 1) / mul + 1) * mul;
169       }
170 #endif
171       maxallocrow            = PetscMax(b->sliidx[totalslices], maxallocrow);
172       b->sliidx[totalslices] = b->sliidx[totalslices - 1] + b->sliceheight * b->sliidx[totalslices];
173     }
174 
175     /* allocate space for val, colidx, rlen */
176     /* FIXME: should B's old memory be unlogged? */
177     PetscCall(MatSeqXSELLFreeSELL(B, &b->val, &b->colidx));
178     /* FIXME: assuming an element of the bit array takes 8 bits */
179     PetscCall(PetscMalloc2(b->sliidx[totalslices], &b->val, b->sliidx[totalslices], &b->colidx));
180     /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */
181     PetscCall(PetscCalloc1(b->sliceheight * totalslices, &b->rlen));
182 
183     b->singlemalloc = PETSC_TRUE;
184     b->free_val     = PETSC_TRUE;
185     b->free_colidx  = PETSC_TRUE;
186   } else {
187     b->free_val    = PETSC_FALSE;
188     b->free_colidx = PETSC_FALSE;
189   }
190 
191   b->nz          = 0;
192   b->maxallocrow = maxallocrow;
193 #if defined(PETSC_HAVE_CUPM)
194   b->rlenmax = rlenmax;
195 #else
196   b->rlenmax = maxallocrow;
197 #endif
198   b->maxallocmat      = b->sliidx[totalslices];
199   B->info.nz_unneeded = (double)b->maxallocmat;
200   if (realalloc) PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));
201   PetscFunctionReturn(PETSC_SUCCESS);
202 }
203 
204 static PetscErrorCode MatGetRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
205 {
206   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
207   PetscInt     shift;
208 
209   PetscFunctionBegin;
210   PetscCheck(row >= 0 && row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range", row);
211   if (nz) *nz = a->rlen[row];
212   shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight);
213   if (!a->getrowcols) { PetscCall(PetscMalloc2(a->rlenmax, &a->getrowcols, a->rlenmax, &a->getrowvals)); }
214   if (idx) {
215     PetscInt j;
216     for (j = 0; j < a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift + a->sliceheight * j];
217     *idx = a->getrowcols;
218   }
219   if (v) {
220     PetscInt j;
221     for (j = 0; j < a->rlen[row]; j++) a->getrowvals[j] = a->val[shift + a->sliceheight * j];
222     *v = a->getrowvals;
223   }
224   PetscFunctionReturn(PETSC_SUCCESS);
225 }
226 
227 static PetscErrorCode MatRestoreRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
228 {
229   PetscFunctionBegin;
230   PetscFunctionReturn(PETSC_SUCCESS);
231 }
232 
233 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
234 {
235   Mat          B;
236   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
237   PetscInt     i;
238 
239   PetscFunctionBegin;
240   if (reuse == MAT_REUSE_MATRIX) {
241     B = *newmat;
242     PetscCall(MatZeroEntries(B));
243   } else {
244     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
245     PetscCall(MatSetSizes(B, A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N));
246     PetscCall(MatSetType(B, MATSEQAIJ));
247     PetscCall(MatSeqAIJSetPreallocation(B, 0, a->rlen));
248   }
249 
250   for (i = 0; i < A->rmap->n; i++) {
251     PetscInt     nz = 0, *cols = NULL;
252     PetscScalar *vals = NULL;
253 
254     PetscCall(MatGetRow_SeqSELL(A, i, &nz, &cols, &vals));
255     PetscCall(MatSetValues(B, 1, &i, nz, cols, vals, INSERT_VALUES));
256     PetscCall(MatRestoreRow_SeqSELL(A, i, &nz, &cols, &vals));
257   }
258 
259   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
260   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
261   B->rmap->bs = A->rmap->bs;
262 
263   if (reuse == MAT_INPLACE_MATRIX) {
264     PetscCall(MatHeaderReplace(A, &B));
265   } else {
266     *newmat = B;
267   }
268   PetscFunctionReturn(PETSC_SUCCESS);
269 }
270 
271 #include <../src/mat/impls/aij/seq/aij.h>
272 
273 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
274 {
275   Mat                B;
276   Mat_SeqAIJ        *a  = (Mat_SeqAIJ *)A->data;
277   PetscInt          *ai = a->i, m = A->rmap->N, n = A->cmap->N, i, *rowlengths, row, ncols;
278   const PetscInt    *cols;
279   const PetscScalar *vals;
280 
281   PetscFunctionBegin;
282   if (reuse == MAT_REUSE_MATRIX) {
283     B = *newmat;
284   } else {
285     if (PetscDefined(USE_DEBUG) || !a->ilen) {
286       PetscCall(PetscMalloc1(m, &rowlengths));
287       for (i = 0; i < m; i++) rowlengths[i] = ai[i + 1] - ai[i];
288     }
289     if (PetscDefined(USE_DEBUG) && a->ilen) {
290       PetscBool eq;
291       PetscCall(PetscMemcmp(rowlengths, a->ilen, m * sizeof(PetscInt), &eq));
292       PetscCheck(eq, PETSC_COMM_SELF, PETSC_ERR_PLIB, "SeqAIJ ilen array incorrect");
293       PetscCall(PetscFree(rowlengths));
294       rowlengths = a->ilen;
295     } else if (a->ilen) rowlengths = a->ilen;
296     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
297     PetscCall(MatSetSizes(B, m, n, m, n));
298     PetscCall(MatSetType(B, MATSEQSELL));
299     PetscCall(MatSeqSELLSetPreallocation(B, 0, rowlengths));
300     if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths));
301   }
302 
303   for (row = 0; row < m; row++) {
304     PetscCall(MatGetRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
305     PetscCall(MatSetValues_SeqSELL(B, 1, &row, ncols, cols, vals, INSERT_VALUES));
306     PetscCall(MatRestoreRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
307   }
308   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
309   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
310   B->rmap->bs = A->rmap->bs;
311 
312   if (reuse == MAT_INPLACE_MATRIX) {
313     PetscCall(MatHeaderReplace(A, &B));
314   } else {
315     *newmat = B;
316   }
317   PetscFunctionReturn(PETSC_SUCCESS);
318 }
319 
320 PetscErrorCode MatMult_SeqSELL(Mat A, Vec xx, Vec yy)
321 {
322   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
323   PetscScalar       *y;
324   const PetscScalar *x;
325   const MatScalar   *aval        = a->val;
326   PetscInt           totalslices = a->totalslices;
327   const PetscInt    *acolidx     = a->colidx;
328   PetscInt           i, j;
329 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
330   __m512d  vec_x, vec_y, vec_vals;
331   __m256i  vec_idx;
332   __mmask8 mask;
333   __m512d  vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
334   __m256i  vec_idx2, vec_idx3, vec_idx4;
335 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
336   __m128i   vec_idx;
337   __m256d   vec_x, vec_y, vec_y2, vec_vals;
338   MatScalar yval;
339   PetscInt  r, rows_left, row, nnz_in_row;
340 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
341   __m128d   vec_x_tmp;
342   __m256d   vec_x, vec_y, vec_y2, vec_vals;
343   MatScalar yval;
344   PetscInt  r, rows_left, row, nnz_in_row;
345 #else
346   PetscInt     k, sliceheight = a->sliceheight;
347   PetscScalar *sum;
348 #endif
349 
350 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
351   #pragma disjoint(*x, *y, *aval)
352 #endif
353 
354   PetscFunctionBegin;
355   PetscCall(VecGetArrayRead(xx, &x));
356   PetscCall(VecGetArray(yy, &y));
357 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
358   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
359   for (i = 0; i < totalslices; i++) { /* loop over slices */
360     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
361     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
362 
363     vec_y  = _mm512_setzero_pd();
364     vec_y2 = _mm512_setzero_pd();
365     vec_y3 = _mm512_setzero_pd();
366     vec_y4 = _mm512_setzero_pd();
367 
368     j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */
369     switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
370     case 3:
371       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
372       acolidx += 8;
373       aval += 8;
374       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
375       acolidx += 8;
376       aval += 8;
377       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
378       acolidx += 8;
379       aval += 8;
380       j += 3;
381       break;
382     case 2:
383       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
384       acolidx += 8;
385       aval += 8;
386       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
387       acolidx += 8;
388       aval += 8;
389       j += 2;
390       break;
391     case 1:
392       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
393       acolidx += 8;
394       aval += 8;
395       j += 1;
396       break;
397     }
398   #pragma novector
399     for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
400       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
401       acolidx += 8;
402       aval += 8;
403       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
404       acolidx += 8;
405       aval += 8;
406       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
407       acolidx += 8;
408       aval += 8;
409       AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
410       acolidx += 8;
411       aval += 8;
412     }
413 
414     vec_y = _mm512_add_pd(vec_y, vec_y2);
415     vec_y = _mm512_add_pd(vec_y, vec_y3);
416     vec_y = _mm512_add_pd(vec_y, vec_y4);
417     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
418       mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
419       _mm512_mask_storeu_pd(&y[8 * i], mask, vec_y);
420     } else {
421       _mm512_storeu_pd(&y[8 * i], vec_y);
422     }
423   }
424 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
425   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
426   for (i = 0; i < totalslices; i++) { /* loop over full slices */
427     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
428     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
429 
430     /* last slice may have padding rows. Don't use vectorization. */
431     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
432       rows_left = A->rmap->n - 8 * i;
433       for (r = 0; r < rows_left; ++r) {
434         yval       = (MatScalar)0;
435         row        = 8 * i + r;
436         nnz_in_row = a->rlen[row];
437         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
438         y[row] = yval;
439       }
440       break;
441     }
442 
443     vec_y  = _mm256_setzero_pd();
444     vec_y2 = _mm256_setzero_pd();
445 
446   /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
447   #pragma novector
448   #pragma unroll(2)
449     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
450       AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
451       aval += 4;
452       acolidx += 4;
453       AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y2);
454       aval += 4;
455       acolidx += 4;
456     }
457 
458     _mm256_storeu_pd(y + i * 8, vec_y);
459     _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
460   }
461 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
462   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
463   for (i = 0; i < totalslices; i++) { /* loop over full slices */
464     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
465     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
466 
467     vec_y  = _mm256_setzero_pd();
468     vec_y2 = _mm256_setzero_pd();
469 
470     /* last slice may have padding rows. Don't use vectorization. */
471     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
472       rows_left = A->rmap->n - 8 * i;
473       for (r = 0; r < rows_left; ++r) {
474         yval       = (MatScalar)0;
475         row        = 8 * i + r;
476         nnz_in_row = a->rlen[row];
477         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
478         y[row] = yval;
479       }
480       break;
481     }
482 
483   /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
484   #pragma novector
485   #pragma unroll(2)
486     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
487       vec_vals  = _mm256_loadu_pd(aval);
488       vec_x_tmp = _mm_setzero_pd();
489       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
490       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
491       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
492       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
493       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
494       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
495       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
496       aval += 4;
497 
498       vec_vals  = _mm256_loadu_pd(aval);
499       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
500       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
501       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
502       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
503       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
504       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
505       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
506       aval += 4;
507     }
508 
509     _mm256_storeu_pd(y + i * 8, vec_y);
510     _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
511   }
512 #else
513   PetscCall(PetscMalloc1(sliceheight, &sum));
514   for (i = 0; i < totalslices; i++) { /* loop over slices */
515     for (j = 0; j < sliceheight; j++) {
516       sum[j] = 0.0;
517       for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]];
518     }
519     if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { /* if last slice has padding rows */
520       for (j = 0; j < (A->rmap->n % sliceheight); j++) y[sliceheight * i + j] = sum[j];
521     } else {
522       for (j = 0; j < sliceheight; j++) y[sliceheight * i + j] = sum[j];
523     }
524   }
525   PetscCall(PetscFree(sum));
526 #endif
527 
528   PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); /* theoretical minimal FLOPs */
529   PetscCall(VecRestoreArrayRead(xx, &x));
530   PetscCall(VecRestoreArray(yy, &y));
531   PetscFunctionReturn(PETSC_SUCCESS);
532 }
533 
534 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h>
535 PetscErrorCode MatMultAdd_SeqSELL(Mat A, Vec xx, Vec yy, Vec zz)
536 {
537   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
538   PetscScalar       *y, *z;
539   const PetscScalar *x;
540   const MatScalar   *aval        = a->val;
541   PetscInt           totalslices = a->totalslices;
542   const PetscInt    *acolidx     = a->colidx;
543   PetscInt           i, j;
544 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
545   __m512d  vec_x, vec_y, vec_vals;
546   __m256i  vec_idx;
547   __mmask8 mask = 0;
548   __m512d  vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
549   __m256i  vec_idx2, vec_idx3, vec_idx4;
550 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
551   __m128d   vec_x_tmp;
552   __m256d   vec_x, vec_y, vec_y2, vec_vals;
553   MatScalar yval;
554   PetscInt  r, row, nnz_in_row;
555 #else
556   PetscInt     k, sliceheight = a->sliceheight;
557   PetscScalar *sum;
558 #endif
559 
560 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
561   #pragma disjoint(*x, *y, *aval)
562 #endif
563 
564   PetscFunctionBegin;
565   if (!a->nz) {
566     PetscCall(VecCopy(yy, zz));
567     PetscFunctionReturn(PETSC_SUCCESS);
568   }
569   PetscCall(VecGetArrayRead(xx, &x));
570   PetscCall(VecGetArrayPair(yy, zz, &y, &z));
571 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
572   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
573   for (i = 0; i < totalslices; i++) { /* loop over slices */
574     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
575     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
576 
577     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
578       mask  = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
579       vec_y = _mm512_mask_loadu_pd(vec_y, mask, &y[8 * i]);
580     } else {
581       vec_y = _mm512_loadu_pd(&y[8 * i]);
582     }
583     vec_y2 = _mm512_setzero_pd();
584     vec_y3 = _mm512_setzero_pd();
585     vec_y4 = _mm512_setzero_pd();
586 
587     j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */
588     switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
589     case 3:
590       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
591       acolidx += 8;
592       aval += 8;
593       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
594       acolidx += 8;
595       aval += 8;
596       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
597       acolidx += 8;
598       aval += 8;
599       j += 3;
600       break;
601     case 2:
602       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
603       acolidx += 8;
604       aval += 8;
605       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
606       acolidx += 8;
607       aval += 8;
608       j += 2;
609       break;
610     case 1:
611       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
612       acolidx += 8;
613       aval += 8;
614       j += 1;
615       break;
616     }
617   #pragma novector
618     for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
619       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
620       acolidx += 8;
621       aval += 8;
622       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
623       acolidx += 8;
624       aval += 8;
625       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
626       acolidx += 8;
627       aval += 8;
628       AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
629       acolidx += 8;
630       aval += 8;
631     }
632 
633     vec_y = _mm512_add_pd(vec_y, vec_y2);
634     vec_y = _mm512_add_pd(vec_y, vec_y3);
635     vec_y = _mm512_add_pd(vec_y, vec_y4);
636     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
637       _mm512_mask_storeu_pd(&z[8 * i], mask, vec_y);
638     } else {
639       _mm512_storeu_pd(&z[8 * i], vec_y);
640     }
641   }
642 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
643   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
644   for (i = 0; i < totalslices; i++) { /* loop over full slices */
645     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
646     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
647 
648     /* last slice may have padding rows. Don't use vectorization. */
649     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
650       for (r = 0; r < (A->rmap->n & 0x07); ++r) {
651         row        = 8 * i + r;
652         yval       = (MatScalar)0.0;
653         nnz_in_row = a->rlen[row];
654         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
655         z[row] = y[row] + yval;
656       }
657       break;
658     }
659 
660     vec_y  = _mm256_loadu_pd(y + 8 * i);
661     vec_y2 = _mm256_loadu_pd(y + 8 * i + 4);
662 
663     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
664     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
665       vec_vals  = _mm256_loadu_pd(aval);
666       vec_x_tmp = _mm_setzero_pd();
667       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
668       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
669       vec_x     = _mm256_setzero_pd();
670       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
671       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
672       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
673       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
674       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
675       aval += 4;
676 
677       vec_vals  = _mm256_loadu_pd(aval);
678       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
679       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
680       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
681       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
682       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
683       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
684       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
685       aval += 4;
686     }
687 
688     _mm256_storeu_pd(z + i * 8, vec_y);
689     _mm256_storeu_pd(z + i * 8 + 4, vec_y2);
690   }
691 #else
692   PetscCall(PetscMalloc1(sliceheight, &sum));
693   for (i = 0; i < totalslices; i++) { /* loop over slices */
694     for (j = 0; j < sliceheight; j++) {
695       sum[j] = 0.0;
696       for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]];
697     }
698     if (i == totalslices - 1 && (A->rmap->n % sliceheight)) {
699       for (j = 0; j < (A->rmap->n % sliceheight); j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j];
700     } else {
701       for (j = 0; j < sliceheight; j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j];
702     }
703   }
704   PetscCall(PetscFree(sum));
705 #endif
706 
707   PetscCall(PetscLogFlops(2.0 * a->nz));
708   PetscCall(VecRestoreArrayRead(xx, &x));
709   PetscCall(VecRestoreArrayPair(yy, zz, &y, &z));
710   PetscFunctionReturn(PETSC_SUCCESS);
711 }
712 
713 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A, Vec xx, Vec zz, Vec yy)
714 {
715   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
716   PetscScalar       *y;
717   const PetscScalar *x;
718   const MatScalar   *aval    = a->val;
719   const PetscInt    *acolidx = a->colidx;
720   PetscInt           i, j, r, row, nnz_in_row, totalslices = a->totalslices, sliceheight = a->sliceheight;
721 
722 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
723   #pragma disjoint(*x, *y, *aval)
724 #endif
725 
726   PetscFunctionBegin;
727   if (A->symmetric == PETSC_BOOL3_TRUE) {
728     PetscCall(MatMultAdd_SeqSELL(A, xx, zz, yy));
729     PetscFunctionReturn(PETSC_SUCCESS);
730   }
731   if (zz != yy) PetscCall(VecCopy(zz, yy));
732 
733   if (a->nz) {
734     PetscCall(VecGetArrayRead(xx, &x));
735     PetscCall(VecGetArray(yy, &y));
736     for (i = 0; i < a->totalslices; i++) { /* loop over slices */
737       if (i == totalslices - 1 && (A->rmap->n % sliceheight)) {
738         for (r = 0; r < (A->rmap->n % sliceheight); ++r) {
739           row        = sliceheight * i + r;
740           nnz_in_row = a->rlen[row];
741           for (j = 0; j < nnz_in_row; ++j) y[acolidx[sliceheight * j + r]] += aval[sliceheight * j + r] * x[row];
742         }
743         break;
744       }
745       for (r = 0; r < sliceheight; ++r)
746         for (j = a->sliidx[i] + r; j < a->sliidx[i + 1]; j += sliceheight) y[acolidx[j]] += aval[j] * x[sliceheight * i + r];
747     }
748     PetscCall(PetscLogFlops(2.0 * a->nz));
749     PetscCall(VecRestoreArrayRead(xx, &x));
750     PetscCall(VecRestoreArray(yy, &y));
751   }
752   PetscFunctionReturn(PETSC_SUCCESS);
753 }
754 
755 PetscErrorCode MatMultTranspose_SeqSELL(Mat A, Vec xx, Vec yy)
756 {
757   PetscFunctionBegin;
758   if (A->symmetric == PETSC_BOOL3_TRUE) {
759     PetscCall(MatMult_SeqSELL(A, xx, yy));
760   } else {
761     PetscCall(VecSet(yy, 0.0));
762     PetscCall(MatMultTransposeAdd_SeqSELL(A, xx, yy, yy));
763   }
764   PetscFunctionReturn(PETSC_SUCCESS);
765 }
766 
767 /*
768      Checks for missing diagonals
769 */
770 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A, PetscBool *missing, PetscInt *d)
771 {
772   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
773   PetscInt    *diag, i;
774 
775   PetscFunctionBegin;
776   *missing = PETSC_FALSE;
777   if (A->rmap->n > 0 && !a->colidx) {
778     *missing = PETSC_TRUE;
779     if (d) *d = 0;
780     PetscCall(PetscInfo(A, "Matrix has no entries therefore is missing diagonal\n"));
781   } else {
782     diag = a->diag;
783     for (i = 0; i < A->rmap->n; i++) {
784       if (diag[i] == -1) {
785         *missing = PETSC_TRUE;
786         if (d) *d = i;
787         PetscCall(PetscInfo(A, "Matrix is missing diagonal number %" PetscInt_FMT "\n", i));
788         break;
789       }
790     }
791   }
792   PetscFunctionReturn(PETSC_SUCCESS);
793 }
794 
795 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A)
796 {
797   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
798   PetscInt     i, j, m = A->rmap->n, shift;
799 
800   PetscFunctionBegin;
801   if (!a->diag) {
802     PetscCall(PetscMalloc1(m, &a->diag));
803     a->free_diag = PETSC_TRUE;
804   }
805   for (i = 0; i < m; i++) {                                          /* loop over rows */
806     shift      = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
807     a->diag[i] = -1;
808     for (j = 0; j < a->rlen[i]; j++) {
809       if (a->colidx[shift + a->sliceheight * j] == i) {
810         a->diag[i] = shift + a->sliceheight * j;
811         break;
812       }
813     }
814   }
815   PetscFunctionReturn(PETSC_SUCCESS);
816 }
817 
818 /*
819   Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways
820 */
821 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A, PetscScalar omega, PetscScalar fshift)
822 {
823   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
824   PetscInt     i, *diag, m = A->rmap->n;
825   MatScalar   *val = a->val;
826   PetscScalar *idiag, *mdiag;
827 
828   PetscFunctionBegin;
829   if (a->idiagvalid) PetscFunctionReturn(PETSC_SUCCESS);
830   PetscCall(MatMarkDiagonal_SeqSELL(A));
831   diag = a->diag;
832   if (!a->idiag) {
833     PetscCall(PetscMalloc3(m, &a->idiag, m, &a->mdiag, m, &a->ssor_work));
834     val = a->val;
835   }
836   mdiag = a->mdiag;
837   idiag = a->idiag;
838 
839   if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) {
840     for (i = 0; i < m; i++) {
841       mdiag[i] = val[diag[i]];
842       if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */
843         PetscCheck(PetscRealPart(fshift), PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Zero diagonal on row %" PetscInt_FMT, i);
844         PetscCall(PetscInfo(A, "Zero diagonal on row %" PetscInt_FMT "\n", i));
845         A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
846         A->factorerror_zeropivot_value = 0.0;
847         A->factorerror_zeropivot_row   = i;
848       }
849       idiag[i] = 1.0 / val[diag[i]];
850     }
851     PetscCall(PetscLogFlops(m));
852   } else {
853     for (i = 0; i < m; i++) {
854       mdiag[i] = val[diag[i]];
855       idiag[i] = omega / (fshift + val[diag[i]]);
856     }
857     PetscCall(PetscLogFlops(2.0 * m));
858   }
859   a->idiagvalid = PETSC_TRUE;
860   PetscFunctionReturn(PETSC_SUCCESS);
861 }
862 
863 PetscErrorCode MatZeroEntries_SeqSELL(Mat A)
864 {
865   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
866 
867   PetscFunctionBegin;
868   PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices]));
869   PetscCall(MatSeqSELLInvalidateDiagonal(A));
870   PetscFunctionReturn(PETSC_SUCCESS);
871 }
872 
873 PetscErrorCode MatDestroy_SeqSELL(Mat A)
874 {
875   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
876 
877   PetscFunctionBegin;
878   PetscCall(PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->n, A->cmap->n, a->nz));
879   PetscCall(MatSeqXSELLFreeSELL(A, &a->val, &a->colidx));
880   PetscCall(ISDestroy(&a->row));
881   PetscCall(ISDestroy(&a->col));
882   PetscCall(PetscFree(a->diag));
883   PetscCall(PetscFree(a->rlen));
884   PetscCall(PetscFree(a->sliidx));
885   PetscCall(PetscFree3(a->idiag, a->mdiag, a->ssor_work));
886   PetscCall(PetscFree(a->solve_work));
887   PetscCall(ISDestroy(&a->icol));
888   PetscCall(PetscFree(a->saved_values));
889   PetscCall(PetscFree2(a->getrowcols, a->getrowvals));
890   PetscCall(PetscFree(A->data));
891 #if defined(PETSC_HAVE_CUPM)
892   PetscCall(PetscFree(a->chunk_slice_map));
893 #endif
894 
895   PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL));
896   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL));
897   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL));
898   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetPreallocation_C", NULL));
899   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetArray_C", NULL));
900   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLRestoreArray_C", NULL));
901   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqaij_C", NULL));
902 #if defined(PETSC_HAVE_CUDA)
903   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqsellcuda_C", NULL));
904 #endif
905 #if defined(PETSC_HAVE_HIP)
906   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqsellhip_C", NULL));
907 #endif
908   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetFillRatio_C", NULL));
909   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetMaxSliceWidth_C", NULL));
910   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetAvgSliceWidth_C", NULL));
911   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetVarSliceSize_C", NULL));
912   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetSliceHeight_C", NULL));
913   PetscFunctionReturn(PETSC_SUCCESS);
914 }
915 
916 PetscErrorCode MatSetOption_SeqSELL(Mat A, MatOption op, PetscBool flg)
917 {
918   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
919 
920   PetscFunctionBegin;
921   switch (op) {
922   case MAT_ROW_ORIENTED:
923     a->roworiented = flg;
924     break;
925   case MAT_KEEP_NONZERO_PATTERN:
926     a->keepnonzeropattern = flg;
927     break;
928   case MAT_NEW_NONZERO_LOCATIONS:
929     a->nonew = (flg ? 0 : 1);
930     break;
931   case MAT_NEW_NONZERO_LOCATION_ERR:
932     a->nonew = (flg ? -1 : 0);
933     break;
934   case MAT_NEW_NONZERO_ALLOCATION_ERR:
935     a->nonew = (flg ? -2 : 0);
936     break;
937   case MAT_UNUSED_NONZERO_LOCATION_ERR:
938     a->nounused = (flg ? -1 : 0);
939     break;
940   case MAT_FORCE_DIAGONAL_ENTRIES:
941   case MAT_IGNORE_OFF_PROC_ENTRIES:
942   case MAT_USE_HASH_TABLE:
943   case MAT_SORTED_FULL:
944     PetscCall(PetscInfo(A, "Option %s ignored\n", MatOptions[op]));
945     break;
946   case MAT_SPD:
947   case MAT_SYMMETRIC:
948   case MAT_STRUCTURALLY_SYMMETRIC:
949   case MAT_HERMITIAN:
950   case MAT_SYMMETRY_ETERNAL:
951   case MAT_STRUCTURAL_SYMMETRY_ETERNAL:
952   case MAT_SPD_ETERNAL:
953     /* These options are handled directly by MatSetOption() */
954     break;
955   default:
956     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unknown option %d", op);
957   }
958   PetscFunctionReturn(PETSC_SUCCESS);
959 }
960 
961 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A, Vec v)
962 {
963   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
964   PetscInt     i, j, n, shift;
965   PetscScalar *x, zero = 0.0;
966 
967   PetscFunctionBegin;
968   PetscCall(VecGetLocalSize(v, &n));
969   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
970 
971   if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) {
972     PetscInt *diag = a->diag;
973     PetscCall(VecGetArray(v, &x));
974     for (i = 0; i < n; i++) x[i] = 1.0 / a->val[diag[i]];
975     PetscCall(VecRestoreArray(v, &x));
976     PetscFunctionReturn(PETSC_SUCCESS);
977   }
978 
979   PetscCall(VecSet(v, zero));
980   PetscCall(VecGetArray(v, &x));
981   for (i = 0; i < n; i++) {                                     /* loop over rows */
982     shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
983     x[i]  = 0;
984     for (j = 0; j < a->rlen[i]; j++) {
985       if (a->colidx[shift + a->sliceheight * j] == i) {
986         x[i] = a->val[shift + a->sliceheight * j];
987         break;
988       }
989     }
990   }
991   PetscCall(VecRestoreArray(v, &x));
992   PetscFunctionReturn(PETSC_SUCCESS);
993 }
994 
995 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A, Vec ll, Vec rr)
996 {
997   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
998   const PetscScalar *l, *r;
999   PetscInt           i, j, m, n, row;
1000 
1001   PetscFunctionBegin;
1002   if (ll) {
1003     /* The local size is used so that VecMPI can be passed to this routine
1004        by MatDiagonalScale_MPISELL */
1005     PetscCall(VecGetLocalSize(ll, &m));
1006     PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1007     PetscCall(VecGetArrayRead(ll, &l));
1008     for (i = 0; i < a->totalslices; i++) {                            /* loop over slices */
1009       if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */
1010         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) {
1011           if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= l[a->sliceheight * i + row];
1012         }
1013       } else {
1014         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) { a->val[j] *= l[a->sliceheight * i + row]; }
1015       }
1016     }
1017     PetscCall(VecRestoreArrayRead(ll, &l));
1018     PetscCall(PetscLogFlops(a->nz));
1019   }
1020   if (rr) {
1021     PetscCall(VecGetLocalSize(rr, &n));
1022     PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
1023     PetscCall(VecGetArrayRead(rr, &r));
1024     for (i = 0; i < a->totalslices; i++) {                            /* loop over slices */
1025       if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */
1026         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) % a->sliceheight)) {
1027           if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= r[a->colidx[j]];
1028         }
1029       } else {
1030         for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j++) a->val[j] *= r[a->colidx[j]];
1031       }
1032     }
1033     PetscCall(VecRestoreArrayRead(rr, &r));
1034     PetscCall(PetscLogFlops(a->nz));
1035   }
1036   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1037 #if defined(PETSC_HAVE_CUPM)
1038   if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU;
1039 #endif
1040   PetscFunctionReturn(PETSC_SUCCESS);
1041 }
1042 
1043 PetscErrorCode MatGetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[])
1044 {
1045   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1046   PetscInt    *cp, i, k, low, high, t, row, col, l;
1047   PetscInt     shift;
1048   MatScalar   *vp;
1049 
1050   PetscFunctionBegin;
1051   for (k = 0; k < m; k++) { /* loop over requested rows */
1052     row = im[k];
1053     if (row < 0) continue;
1054     PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
1055     shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight); /* starting index of the row */
1056     cp    = a->colidx + shift;                                        /* pointer to the row */
1057     vp    = a->val + shift;                                           /* pointer to the row */
1058     for (l = 0; l < n; l++) {                                         /* loop over requested columns */
1059       col = in[l];
1060       if (col < 0) continue;
1061       PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
1062       high = a->rlen[row];
1063       low  = 0; /* assume unsorted */
1064       while (high - low > 5) {
1065         t = (low + high) / 2;
1066         if (*(cp + a->sliceheight * t) > col) high = t;
1067         else low = t;
1068       }
1069       for (i = low; i < high; i++) {
1070         if (*(cp + a->sliceheight * i) > col) break;
1071         if (*(cp + a->sliceheight * i) == col) {
1072           *v++ = *(vp + a->sliceheight * i);
1073           goto finished;
1074         }
1075       }
1076       *v++ = 0.0;
1077     finished:;
1078     }
1079   }
1080   PetscFunctionReturn(PETSC_SUCCESS);
1081 }
1082 
1083 static PetscErrorCode MatView_SeqSELL_ASCII(Mat A, PetscViewer viewer)
1084 {
1085   Mat_SeqSELL      *a = (Mat_SeqSELL *)A->data;
1086   PetscInt          i, j, m = A->rmap->n, shift;
1087   const char       *name;
1088   PetscViewerFormat format;
1089 
1090   PetscFunctionBegin;
1091   PetscCall(PetscViewerGetFormat(viewer, &format));
1092   if (format == PETSC_VIEWER_ASCII_MATLAB) {
1093     PetscInt nofinalvalue = 0;
1094     /*
1095     if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) {
1096       nofinalvalue = 1;
1097     }
1098     */
1099     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1100     PetscCall(PetscViewerASCIIPrintf(viewer, "%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n", m, A->cmap->n));
1101     PetscCall(PetscViewerASCIIPrintf(viewer, "%% Nonzeros = %" PetscInt_FMT " \n", a->nz));
1102 #if defined(PETSC_USE_COMPLEX)
1103     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",4);\n", a->nz + nofinalvalue));
1104 #else
1105     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",3);\n", a->nz + nofinalvalue));
1106 #endif
1107     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = [\n"));
1108 
1109     for (i = 0; i < m; i++) {
1110       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1111       for (j = 0; j < a->rlen[i]; j++) {
1112 #if defined(PETSC_USE_COMPLEX)
1113         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1114 #else
1115         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)a->val[shift + a->sliceheight * j]));
1116 #endif
1117       }
1118     }
1119     /*
1120     if (nofinalvalue) {
1121 #if defined(PETSC_USE_COMPLEX)
1122       PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n",m,A->cmap->n,0.,0.));
1123 #else
1124       PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n",m,A->cmap->n,0.0));
1125 #endif
1126     }
1127     */
1128     PetscCall(PetscObjectGetName((PetscObject)A, &name));
1129     PetscCall(PetscViewerASCIIPrintf(viewer, "];\n %s = spconvert(zzz);\n", name));
1130     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1131   } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) {
1132     PetscFunctionReturn(PETSC_SUCCESS);
1133   } else if (format == PETSC_VIEWER_ASCII_COMMON) {
1134     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1135     for (i = 0; i < m; i++) {
1136       PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1137       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1138       for (j = 0; j < a->rlen[i]; j++) {
1139 #if defined(PETSC_USE_COMPLEX)
1140         if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) {
1141           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1142         } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) {
1143           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1144         } else if (PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) {
1145           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j])));
1146         }
1147 #else
1148         if (a->val[shift + a->sliceheight * j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j]));
1149 #endif
1150       }
1151       PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1152     }
1153     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1154   } else if (format == PETSC_VIEWER_ASCII_DENSE) {
1155     PetscInt    cnt = 0, jcnt;
1156     PetscScalar value;
1157 #if defined(PETSC_USE_COMPLEX)
1158     PetscBool realonly = PETSC_TRUE;
1159     for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1160       if (PetscImaginaryPart(a->val[i]) != 0.0) {
1161         realonly = PETSC_FALSE;
1162         break;
1163       }
1164     }
1165 #endif
1166 
1167     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1168     for (i = 0; i < m; i++) {
1169       jcnt  = 0;
1170       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1171       for (j = 0; j < A->cmap->n; j++) {
1172         if (jcnt < a->rlen[i] && j == a->colidx[shift + a->sliceheight * j]) {
1173           value = a->val[cnt++];
1174           jcnt++;
1175         } else {
1176           value = 0.0;
1177         }
1178 #if defined(PETSC_USE_COMPLEX)
1179         if (realonly) {
1180           PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)PetscRealPart(value)));
1181         } else {
1182           PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e+%7.5e i ", (double)PetscRealPart(value), (double)PetscImaginaryPart(value)));
1183         }
1184 #else
1185         PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)value));
1186 #endif
1187       }
1188       PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1189     }
1190     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1191   } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) {
1192     PetscInt fshift = 1;
1193     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1194 #if defined(PETSC_USE_COMPLEX)
1195     PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate complex general\n"));
1196 #else
1197     PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate real general\n"));
1198 #endif
1199     PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz));
1200     for (i = 0; i < m; i++) {
1201       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1202       for (j = 0; j < a->rlen[i]; j++) {
1203 #if defined(PETSC_USE_COMPLEX)
1204         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1205 #else
1206         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)a->val[shift + a->sliceheight * j]));
1207 #endif
1208       }
1209     }
1210     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1211   } else if (format == PETSC_VIEWER_NATIVE) {
1212     for (i = 0; i < a->totalslices; i++) { /* loop over slices */
1213       PetscInt row;
1214       PetscCall(PetscViewerASCIIPrintf(viewer, "slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n", i, a->sliidx[i], a->sliidx[i + 1]));
1215       for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) {
1216 #if defined(PETSC_USE_COMPLEX)
1217         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1218           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1219         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1220           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), -(double)PetscImaginaryPart(a->val[j])));
1221         } else {
1222           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j])));
1223         }
1224 #else
1225         PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)a->val[j]));
1226 #endif
1227       }
1228     }
1229   } else {
1230     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1231     if (A->factortype) {
1232       for (i = 0; i < m; i++) {
1233         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1234         PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1235         /* L part */
1236         for (j = shift; j < a->diag[i]; j += a->sliceheight) {
1237 #if defined(PETSC_USE_COMPLEX)
1238           if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0) {
1239             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1240           } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0) {
1241             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j]))));
1242           } else {
1243             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j])));
1244           }
1245 #else
1246           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]));
1247 #endif
1248         }
1249         /* diagonal */
1250         j = a->diag[i];
1251 #if defined(PETSC_USE_COMPLEX)
1252         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1253           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)PetscImaginaryPart(1.0 / a->val[j])));
1254         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1255           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)(-PetscImaginaryPart(1.0 / a->val[j]))));
1256         } else {
1257           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j])));
1258         }
1259 #else
1260         PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)(1 / a->val[j])));
1261 #endif
1262 
1263         /* U part */
1264         for (j = a->diag[i] + 1; j < shift + a->sliceheight * a->rlen[i]; j += a->sliceheight) {
1265 #if defined(PETSC_USE_COMPLEX)
1266           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1267             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1268           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1269             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j]))));
1270           } else {
1271             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j])));
1272           }
1273 #else
1274           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]));
1275 #endif
1276         }
1277         PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1278       }
1279     } else {
1280       for (i = 0; i < m; i++) {
1281         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1282         PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1283         for (j = 0; j < a->rlen[i]; j++) {
1284 #if defined(PETSC_USE_COMPLEX)
1285           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1286             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1287           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1288             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1289           } else {
1290             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j])));
1291           }
1292 #else
1293           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j]));
1294 #endif
1295         }
1296         PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1297       }
1298     }
1299     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1300   }
1301   PetscCall(PetscViewerFlush(viewer));
1302   PetscFunctionReturn(PETSC_SUCCESS);
1303 }
1304 
1305 #include <petscdraw.h>
1306 static PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw, void *Aa)
1307 {
1308   Mat               A = (Mat)Aa;
1309   Mat_SeqSELL      *a = (Mat_SeqSELL *)A->data;
1310   PetscInt          i, j, m = A->rmap->n, shift;
1311   int               color;
1312   PetscReal         xl, yl, xr, yr, x_l, x_r, y_l, y_r;
1313   PetscViewer       viewer;
1314   PetscViewerFormat format;
1315 
1316   PetscFunctionBegin;
1317   PetscCall(PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer));
1318   PetscCall(PetscViewerGetFormat(viewer, &format));
1319   PetscCall(PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr));
1320 
1321   /* loop over matrix elements drawing boxes */
1322 
1323   if (format != PETSC_VIEWER_DRAW_CONTOUR) {
1324     PetscDrawCollectiveBegin(draw);
1325     /* Blue for negative, Cyan for zero and  Red for positive */
1326     color = PETSC_DRAW_BLUE;
1327     for (i = 0; i < m; i++) {
1328       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1329       y_l   = m - i - 1.0;
1330       y_r   = y_l + 1.0;
1331       for (j = 0; j < a->rlen[i]; j++) {
1332         x_l = a->colidx[shift + a->sliceheight * j];
1333         x_r = x_l + 1.0;
1334         if (PetscRealPart(a->val[shift + a->sliceheight * j]) >= 0.) continue;
1335         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1336       }
1337     }
1338     color = PETSC_DRAW_CYAN;
1339     for (i = 0; i < m; i++) {
1340       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1341       y_l   = m - i - 1.0;
1342       y_r   = y_l + 1.0;
1343       for (j = 0; j < a->rlen[i]; j++) {
1344         x_l = a->colidx[shift + a->sliceheight * j];
1345         x_r = x_l + 1.0;
1346         if (a->val[shift + a->sliceheight * j] != 0.) continue;
1347         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1348       }
1349     }
1350     color = PETSC_DRAW_RED;
1351     for (i = 0; i < m; i++) {
1352       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1353       y_l   = m - i - 1.0;
1354       y_r   = y_l + 1.0;
1355       for (j = 0; j < a->rlen[i]; j++) {
1356         x_l = a->colidx[shift + a->sliceheight * j];
1357         x_r = x_l + 1.0;
1358         if (PetscRealPart(a->val[shift + a->sliceheight * j]) <= 0.) continue;
1359         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1360       }
1361     }
1362     PetscDrawCollectiveEnd(draw);
1363   } else {
1364     /* use contour shading to indicate magnitude of values */
1365     /* first determine max of all nonzero values */
1366     PetscReal minv = 0.0, maxv = 0.0;
1367     PetscInt  count = 0;
1368     PetscDraw popup;
1369     for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1370       if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]);
1371     }
1372     if (minv >= maxv) maxv = minv + PETSC_SMALL;
1373     PetscCall(PetscDrawGetPopup(draw, &popup));
1374     PetscCall(PetscDrawScalePopup(popup, minv, maxv));
1375 
1376     PetscDrawCollectiveBegin(draw);
1377     for (i = 0; i < m; i++) {
1378       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1379       y_l   = m - i - 1.0;
1380       y_r   = y_l + 1.0;
1381       for (j = 0; j < a->rlen[i]; j++) {
1382         x_l   = a->colidx[shift + a->sliceheight * j];
1383         x_r   = x_l + 1.0;
1384         color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]), minv, maxv);
1385         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1386         count++;
1387       }
1388     }
1389     PetscDrawCollectiveEnd(draw);
1390   }
1391   PetscFunctionReturn(PETSC_SUCCESS);
1392 }
1393 
1394 #include <petscdraw.h>
1395 static PetscErrorCode MatView_SeqSELL_Draw(Mat A, PetscViewer viewer)
1396 {
1397   PetscDraw draw;
1398   PetscReal xr, yr, xl, yl, h, w;
1399   PetscBool isnull;
1400 
1401   PetscFunctionBegin;
1402   PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw));
1403   PetscCall(PetscDrawIsNull(draw, &isnull));
1404   if (isnull) PetscFunctionReturn(PETSC_SUCCESS);
1405 
1406   xr = A->cmap->n;
1407   yr = A->rmap->n;
1408   h  = yr / 10.0;
1409   w  = xr / 10.0;
1410   xr += w;
1411   yr += h;
1412   xl = -w;
1413   yl = -h;
1414   PetscCall(PetscDrawSetCoordinates(draw, xl, yl, xr, yr));
1415   PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer));
1416   PetscCall(PetscDrawZoom(draw, MatView_SeqSELL_Draw_Zoom, A));
1417   PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL));
1418   PetscCall(PetscDrawSave(draw));
1419   PetscFunctionReturn(PETSC_SUCCESS);
1420 }
1421 
1422 PetscErrorCode MatView_SeqSELL(Mat A, PetscViewer viewer)
1423 {
1424   PetscBool iascii, isbinary, isdraw;
1425 
1426   PetscFunctionBegin;
1427   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
1428   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
1429   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
1430   if (iascii) {
1431     PetscCall(MatView_SeqSELL_ASCII(A, viewer));
1432   } else if (isbinary) {
1433     /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */
1434   } else if (isdraw) PetscCall(MatView_SeqSELL_Draw(A, viewer));
1435   PetscFunctionReturn(PETSC_SUCCESS);
1436 }
1437 
1438 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A, MatAssemblyType mode)
1439 {
1440   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1441   PetscInt     i, shift, row_in_slice, row, nrow, *cp, lastcol, j, k;
1442   MatScalar   *vp;
1443 #if defined(PETSC_HAVE_CUPM)
1444   PetscInt totalchunks = 0;
1445 #endif
1446 
1447   PetscFunctionBegin;
1448   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
1449   /* To do: compress out the unused elements */
1450   PetscCall(MatMarkDiagonal_SeqSELL(A));
1451   PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n", A->rmap->n, A->cmap->n, a->maxallocmat, a->sliidx[a->totalslices], a->nz, a->sliidx[a->totalslices] - a->nz));
1452   PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n", a->reallocs));
1453   PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rlenmax));
1454   a->nonzerorowcnt = 0;
1455   /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */
1456   for (i = 0; i < a->totalslices; ++i) {
1457     shift = a->sliidx[i];                                                   /* starting index of the slice */
1458     cp    = PetscSafePointerPlusOffset(a->colidx, shift);                   /* pointer to the column indices of the slice */
1459     vp    = PetscSafePointerPlusOffset(a->val, shift);                      /* pointer to the nonzero values of the slice */
1460     for (row_in_slice = 0; row_in_slice < a->sliceheight; ++row_in_slice) { /* loop over rows in the slice */
1461       row  = a->sliceheight * i + row_in_slice;
1462       nrow = a->rlen[row]; /* number of nonzeros in row */
1463       /*
1464         Search for the nearest nonzero. Normally setting the index to zero may cause extra communication.
1465         But if the entire slice are empty, it is fine to use 0 since the index will not be loaded.
1466       */
1467       lastcol = 0;
1468       if (nrow > 0) { /* nonempty row */
1469         a->nonzerorowcnt++;
1470         lastcol = cp[a->sliceheight * (nrow - 1) + row_in_slice]; /* use the index from the last nonzero at current row */
1471       } else if (!row_in_slice) {                                 /* first row of the correct slice is empty */
1472         for (j = 1; j < a->sliceheight; j++) {
1473           if (a->rlen[a->sliceheight * i + j]) {
1474             lastcol = cp[j];
1475             break;
1476           }
1477         }
1478       } else {
1479         if (a->sliidx[i + 1] != shift) lastcol = cp[row_in_slice - 1]; /* use the index from the previous row */
1480       }
1481 
1482       for (k = nrow; k < (a->sliidx[i + 1] - shift) / a->sliceheight; ++k) {
1483         cp[a->sliceheight * k + row_in_slice] = lastcol;
1484         vp[a->sliceheight * k + row_in_slice] = (MatScalar)0;
1485       }
1486     }
1487   }
1488 
1489   A->info.mallocs += a->reallocs;
1490   a->reallocs = 0;
1491 
1492   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1493 #if defined(PETSC_HAVE_CUPM)
1494   if (!a->chunksize && a->totalslices) {
1495     a->chunksize = 64;
1496     while (a->chunksize < 1024 && 2 * a->chunksize <= a->sliidx[a->totalslices] / a->totalslices) a->chunksize *= 2;
1497     totalchunks = 1 + (a->sliidx[a->totalslices] - 1) / a->chunksize;
1498   }
1499   if (totalchunks != a->totalchunks) {
1500     PetscCall(PetscFree(a->chunk_slice_map));
1501     PetscCall(PetscMalloc1(totalchunks, &a->chunk_slice_map));
1502     a->totalchunks = totalchunks;
1503   }
1504   j = 0;
1505   for (i = 0; i < totalchunks; i++) {
1506     while (a->sliidx[j + 1] <= i * a->chunksize && j < a->totalslices) j++;
1507     a->chunk_slice_map[i] = j;
1508   }
1509 #endif
1510   PetscFunctionReturn(PETSC_SUCCESS);
1511 }
1512 
1513 PetscErrorCode MatGetInfo_SeqSELL(Mat A, MatInfoType flag, MatInfo *info)
1514 {
1515   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1516 
1517   PetscFunctionBegin;
1518   info->block_size   = 1.0;
1519   info->nz_allocated = a->maxallocmat;
1520   info->nz_used      = a->sliidx[a->totalslices]; /* include padding zeros */
1521   info->nz_unneeded  = (a->maxallocmat - a->sliidx[a->totalslices]);
1522   info->assemblies   = A->num_ass;
1523   info->mallocs      = A->info.mallocs;
1524   info->memory       = 0; /* REVIEW ME */
1525   if (A->factortype) {
1526     info->fill_ratio_given  = A->info.fill_ratio_given;
1527     info->fill_ratio_needed = A->info.fill_ratio_needed;
1528     info->factor_mallocs    = A->info.factor_mallocs;
1529   } else {
1530     info->fill_ratio_given  = 0;
1531     info->fill_ratio_needed = 0;
1532     info->factor_mallocs    = 0;
1533   }
1534   PetscFunctionReturn(PETSC_SUCCESS);
1535 }
1536 
1537 PetscErrorCode MatSetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is)
1538 {
1539   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1540   PetscInt     shift, i, k, l, low, high, t, ii, row, col, nrow;
1541   PetscInt    *cp, nonew = a->nonew, lastcol = -1;
1542   MatScalar   *vp, value;
1543 #if defined(PETSC_HAVE_CUPM)
1544   PetscBool inserted = PETSC_FALSE;
1545   PetscInt  mul      = DEVICE_MEM_ALIGN / a->sliceheight;
1546 #endif
1547 
1548   PetscFunctionBegin;
1549   for (k = 0; k < m; k++) { /* loop over added rows */
1550     row = im[k];
1551     if (row < 0) continue;
1552     PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
1553     shift = a->sliidx[row / a->sliceheight] + row % a->sliceheight; /* starting index of the row */
1554     cp    = a->colidx + shift;                                      /* pointer to the row */
1555     vp    = a->val + shift;                                         /* pointer to the row */
1556     nrow  = a->rlen[row];
1557     low   = 0;
1558     high  = nrow;
1559 
1560     for (l = 0; l < n; l++) { /* loop over added columns */
1561       col = in[l];
1562       if (col < 0) continue;
1563       PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
1564       if (a->roworiented) {
1565         value = v[l + k * n];
1566       } else {
1567         value = v[k + l * m];
1568       }
1569       if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue;
1570 
1571       /* search in this row for the specified column, i indicates the column to be set */
1572       if (col <= lastcol) low = 0;
1573       else high = nrow;
1574       lastcol = col;
1575       while (high - low > 5) {
1576         t = (low + high) / 2;
1577         if (*(cp + a->sliceheight * t) > col) high = t;
1578         else low = t;
1579       }
1580       for (i = low; i < high; i++) {
1581         if (*(cp + a->sliceheight * i) > col) break;
1582         if (*(cp + a->sliceheight * i) == col) {
1583           if (is == ADD_VALUES) *(vp + a->sliceheight * i) += value;
1584           else *(vp + a->sliceheight * i) = value;
1585 #if defined(PETSC_HAVE_CUPM)
1586           inserted = PETSC_TRUE;
1587 #endif
1588           low = i + 1;
1589           goto noinsert;
1590         }
1591       }
1592       if (value == 0.0 && a->ignorezeroentries) goto noinsert;
1593       if (nonew == 1) goto noinsert;
1594       PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col);
1595 #if defined(PETSC_HAVE_CUPM)
1596       MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar, mul);
1597 #else
1598       /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */
1599       MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar, 1);
1600 #endif
1601       /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */
1602       for (ii = nrow - 1; ii >= i; ii--) {
1603         *(cp + a->sliceheight * (ii + 1)) = *(cp + a->sliceheight * ii);
1604         *(vp + a->sliceheight * (ii + 1)) = *(vp + a->sliceheight * ii);
1605       }
1606       a->rlen[row]++;
1607       *(cp + a->sliceheight * i) = col;
1608       *(vp + a->sliceheight * i) = value;
1609       a->nz++;
1610 #if defined(PETSC_HAVE_CUPM)
1611       inserted = PETSC_TRUE;
1612 #endif
1613       low = i + 1;
1614       high++;
1615       nrow++;
1616     noinsert:;
1617     }
1618     a->rlen[row] = nrow;
1619   }
1620 #if defined(PETSC_HAVE_CUPM)
1621   if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && inserted) A->offloadmask = PETSC_OFFLOAD_CPU;
1622 #endif
1623   PetscFunctionReturn(PETSC_SUCCESS);
1624 }
1625 
1626 PetscErrorCode MatCopy_SeqSELL(Mat A, Mat B, MatStructure str)
1627 {
1628   PetscFunctionBegin;
1629   /* If the two matrices have the same copy implementation, use fast copy. */
1630   if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
1631     Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1632     Mat_SeqSELL *b = (Mat_SeqSELL *)B->data;
1633 
1634     PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices], PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Number of nonzeros in two matrices are different");
1635     PetscCall(PetscArraycpy(b->val, a->val, a->sliidx[a->totalslices]));
1636   } else {
1637     PetscCall(MatCopy_Basic(A, B, str));
1638   }
1639   PetscFunctionReturn(PETSC_SUCCESS);
1640 }
1641 
1642 PetscErrorCode MatSetUp_SeqSELL(Mat A)
1643 {
1644   PetscFunctionBegin;
1645   PetscCall(MatSeqSELLSetPreallocation(A, PETSC_DEFAULT, NULL));
1646   PetscFunctionReturn(PETSC_SUCCESS);
1647 }
1648 
1649 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A, PetscScalar *array[])
1650 {
1651   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1652 
1653   PetscFunctionBegin;
1654   *array = a->val;
1655   PetscFunctionReturn(PETSC_SUCCESS);
1656 }
1657 
1658 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A, PetscScalar *array[])
1659 {
1660   PetscFunctionBegin;
1661   PetscFunctionReturn(PETSC_SUCCESS);
1662 }
1663 
1664 PetscErrorCode MatScale_SeqSELL(Mat inA, PetscScalar alpha)
1665 {
1666   Mat_SeqSELL *a      = (Mat_SeqSELL *)inA->data;
1667   MatScalar   *aval   = a->val;
1668   PetscScalar  oalpha = alpha;
1669   PetscBLASInt one    = 1, size;
1670 
1671   PetscFunctionBegin;
1672   PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices], &size));
1673   PetscCallBLAS("BLASscal", BLASscal_(&size, &oalpha, aval, &one));
1674   PetscCall(PetscLogFlops(a->nz));
1675   PetscCall(MatSeqSELLInvalidateDiagonal(inA));
1676 #if defined(PETSC_HAVE_CUPM)
1677   if (inA->offloadmask != PETSC_OFFLOAD_UNALLOCATED) inA->offloadmask = PETSC_OFFLOAD_CPU;
1678 #endif
1679   PetscFunctionReturn(PETSC_SUCCESS);
1680 }
1681 
1682 PetscErrorCode MatShift_SeqSELL(Mat Y, PetscScalar a)
1683 {
1684   Mat_SeqSELL *y = (Mat_SeqSELL *)Y->data;
1685 
1686   PetscFunctionBegin;
1687   if (!Y->preallocated || !y->nz) PetscCall(MatSeqSELLSetPreallocation(Y, 1, NULL));
1688   PetscCall(MatShift_Basic(Y, a));
1689   PetscFunctionReturn(PETSC_SUCCESS);
1690 }
1691 
1692 PetscErrorCode MatSOR_SeqSELL(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
1693 {
1694   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
1695   PetscScalar       *x, sum, *t;
1696   const MatScalar   *idiag = NULL, *mdiag;
1697   const PetscScalar *b, *xb;
1698   PetscInt           n, m = A->rmap->n, i, j, shift;
1699   const PetscInt    *diag;
1700 
1701   PetscFunctionBegin;
1702   its = its * lits;
1703 
1704   if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */
1705   if (!a->idiagvalid) PetscCall(MatInvertDiagonal_SeqSELL(A, omega, fshift));
1706   a->fshift = fshift;
1707   a->omega  = omega;
1708 
1709   diag  = a->diag;
1710   t     = a->ssor_work;
1711   idiag = a->idiag;
1712   mdiag = a->mdiag;
1713 
1714   PetscCall(VecGetArray(xx, &x));
1715   PetscCall(VecGetArrayRead(bb, &b));
1716   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1717   PetscCheck(flag != SOR_APPLY_UPPER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_UPPER is not implemented");
1718   PetscCheck(flag != SOR_APPLY_LOWER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_LOWER is not implemented");
1719   PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat");
1720 
1721   if (flag & SOR_ZERO_INITIAL_GUESS) {
1722     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1723       for (i = 0; i < m; i++) {
1724         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1725         sum   = b[i];
1726         n     = (diag[i] - shift) / a->sliceheight;
1727         for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]];
1728         t[i] = sum;
1729         x[i] = sum * idiag[i];
1730       }
1731       xb = t;
1732       PetscCall(PetscLogFlops(a->nz));
1733     } else xb = b;
1734     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1735       for (i = m - 1; i >= 0; i--) {
1736         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1737         sum   = xb[i];
1738         n     = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1;
1739         for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]];
1740         if (xb == b) {
1741           x[i] = sum * idiag[i];
1742         } else {
1743           x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1744         }
1745       }
1746       PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1747     }
1748     its--;
1749   }
1750   while (its--) {
1751     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1752       for (i = 0; i < m; i++) {
1753         /* lower */
1754         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1755         sum   = b[i];
1756         n     = (diag[i] - shift) / a->sliceheight;
1757         for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]];
1758         t[i] = sum; /* save application of the lower-triangular part */
1759         /* upper */
1760         n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1;
1761         for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]];
1762         x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1763       }
1764       xb = t;
1765       PetscCall(PetscLogFlops(2.0 * a->nz));
1766     } else xb = b;
1767     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1768       for (i = m - 1; i >= 0; i--) {
1769         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1770         sum   = xb[i];
1771         if (xb == b) {
1772           /* whole matrix (no checkpointing available) */
1773           n = a->rlen[i];
1774           for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]];
1775           x[i] = (1. - omega) * x[i] + (sum + mdiag[i] * x[i]) * idiag[i];
1776         } else { /* lower-triangular part has been saved, so only apply upper-triangular */
1777           n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1;
1778           for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]];
1779           x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1780         }
1781       }
1782       if (xb == b) {
1783         PetscCall(PetscLogFlops(2.0 * a->nz));
1784       } else {
1785         PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1786       }
1787     }
1788   }
1789   PetscCall(VecRestoreArray(xx, &x));
1790   PetscCall(VecRestoreArrayRead(bb, &b));
1791   PetscFunctionReturn(PETSC_SUCCESS);
1792 }
1793 
1794 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL,
1795                                        MatGetRow_SeqSELL,
1796                                        MatRestoreRow_SeqSELL,
1797                                        MatMult_SeqSELL,
1798                                        /* 4*/ MatMultAdd_SeqSELL,
1799                                        MatMultTranspose_SeqSELL,
1800                                        MatMultTransposeAdd_SeqSELL,
1801                                        NULL,
1802                                        NULL,
1803                                        NULL,
1804                                        /* 10*/ NULL,
1805                                        NULL,
1806                                        NULL,
1807                                        MatSOR_SeqSELL,
1808                                        NULL,
1809                                        /* 15*/ MatGetInfo_SeqSELL,
1810                                        MatEqual_SeqSELL,
1811                                        MatGetDiagonal_SeqSELL,
1812                                        MatDiagonalScale_SeqSELL,
1813                                        NULL,
1814                                        /* 20*/ NULL,
1815                                        MatAssemblyEnd_SeqSELL,
1816                                        MatSetOption_SeqSELL,
1817                                        MatZeroEntries_SeqSELL,
1818                                        /* 24*/ NULL,
1819                                        NULL,
1820                                        NULL,
1821                                        NULL,
1822                                        NULL,
1823                                        /* 29*/ MatSetUp_SeqSELL,
1824                                        NULL,
1825                                        NULL,
1826                                        NULL,
1827                                        NULL,
1828                                        /* 34*/ MatDuplicate_SeqSELL,
1829                                        NULL,
1830                                        NULL,
1831                                        NULL,
1832                                        NULL,
1833                                        /* 39*/ NULL,
1834                                        NULL,
1835                                        NULL,
1836                                        MatGetValues_SeqSELL,
1837                                        MatCopy_SeqSELL,
1838                                        /* 44*/ NULL,
1839                                        MatScale_SeqSELL,
1840                                        MatShift_SeqSELL,
1841                                        NULL,
1842                                        NULL,
1843                                        /* 49*/ NULL,
1844                                        NULL,
1845                                        NULL,
1846                                        NULL,
1847                                        NULL,
1848                                        /* 54*/ MatFDColoringCreate_SeqXAIJ,
1849                                        NULL,
1850                                        NULL,
1851                                        NULL,
1852                                        NULL,
1853                                        /* 59*/ NULL,
1854                                        MatDestroy_SeqSELL,
1855                                        MatView_SeqSELL,
1856                                        NULL,
1857                                        NULL,
1858                                        /* 64*/ NULL,
1859                                        NULL,
1860                                        NULL,
1861                                        NULL,
1862                                        NULL,
1863                                        /* 69*/ NULL,
1864                                        NULL,
1865                                        NULL,
1866                                        NULL,
1867                                        NULL,
1868                                        /* 74*/ NULL,
1869                                        MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */
1870                                        NULL,
1871                                        NULL,
1872                                        NULL,
1873                                        /* 79*/ NULL,
1874                                        NULL,
1875                                        NULL,
1876                                        NULL,
1877                                        NULL,
1878                                        /* 84*/ NULL,
1879                                        NULL,
1880                                        NULL,
1881                                        NULL,
1882                                        NULL,
1883                                        /* 89*/ NULL,
1884                                        NULL,
1885                                        NULL,
1886                                        NULL,
1887                                        NULL,
1888                                        /* 94*/ NULL,
1889                                        NULL,
1890                                        NULL,
1891                                        NULL,
1892                                        NULL,
1893                                        /* 99*/ NULL,
1894                                        NULL,
1895                                        NULL,
1896                                        MatConjugate_SeqSELL,
1897                                        NULL,
1898                                        /*104*/ NULL,
1899                                        NULL,
1900                                        NULL,
1901                                        NULL,
1902                                        NULL,
1903                                        /*109*/ NULL,
1904                                        NULL,
1905                                        NULL,
1906                                        NULL,
1907                                        MatMissingDiagonal_SeqSELL,
1908                                        /*114*/ NULL,
1909                                        NULL,
1910                                        NULL,
1911                                        NULL,
1912                                        NULL,
1913                                        /*119*/ NULL,
1914                                        NULL,
1915                                        NULL,
1916                                        NULL,
1917                                        NULL,
1918                                        /*124*/ NULL,
1919                                        NULL,
1920                                        NULL,
1921                                        NULL,
1922                                        NULL,
1923                                        /*129*/ NULL,
1924                                        NULL,
1925                                        NULL,
1926                                        NULL,
1927                                        NULL,
1928                                        /*134*/ NULL,
1929                                        NULL,
1930                                        NULL,
1931                                        NULL,
1932                                        NULL,
1933                                        /*139*/ NULL,
1934                                        NULL,
1935                                        NULL,
1936                                        MatFDColoringSetUp_SeqXAIJ,
1937                                        NULL,
1938                                        /*144*/ NULL,
1939                                        NULL,
1940                                        NULL,
1941                                        NULL,
1942                                        NULL,
1943                                        NULL,
1944                                        /*150*/ NULL,
1945                                        NULL,
1946                                        NULL,
1947                                        NULL,
1948                                        NULL,
1949                                        /*155*/ NULL,
1950                                        NULL};
1951 
1952 static PetscErrorCode MatStoreValues_SeqSELL(Mat mat)
1953 {
1954   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1955 
1956   PetscFunctionBegin;
1957   PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1958 
1959   /* allocate space for values if not already there */
1960   if (!a->saved_values) PetscCall(PetscMalloc1(a->sliidx[a->totalslices] + 1, &a->saved_values));
1961 
1962   /* copy values over */
1963   PetscCall(PetscArraycpy(a->saved_values, a->val, a->sliidx[a->totalslices]));
1964   PetscFunctionReturn(PETSC_SUCCESS);
1965 }
1966 
1967 static PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat)
1968 {
1969   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1970 
1971   PetscFunctionBegin;
1972   PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1973   PetscCheck(a->saved_values, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatStoreValues(A);first");
1974   PetscCall(PetscArraycpy(a->val, a->saved_values, a->sliidx[a->totalslices]));
1975   PetscFunctionReturn(PETSC_SUCCESS);
1976 }
1977 
1978 static PetscErrorCode MatSeqSELLGetFillRatio_SeqSELL(Mat mat, PetscReal *ratio)
1979 {
1980   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1981 
1982   PetscFunctionBegin;
1983   if (a->totalslices && a->sliidx[a->totalslices]) {
1984     *ratio = (PetscReal)(a->sliidx[a->totalslices] - a->nz) / a->sliidx[a->totalslices];
1985   } else {
1986     *ratio = 0.0;
1987   }
1988   PetscFunctionReturn(PETSC_SUCCESS);
1989 }
1990 
1991 static PetscErrorCode MatSeqSELLGetMaxSliceWidth_SeqSELL(Mat mat, PetscInt *slicewidth)
1992 {
1993   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1994   PetscInt     i, current_slicewidth;
1995 
1996   PetscFunctionBegin;
1997   *slicewidth = 0;
1998   for (i = 0; i < a->totalslices; i++) {
1999     current_slicewidth = (a->sliidx[i + 1] - a->sliidx[i]) / a->sliceheight;
2000     if (current_slicewidth > *slicewidth) *slicewidth = current_slicewidth;
2001   }
2002   PetscFunctionReturn(PETSC_SUCCESS);
2003 }
2004 
2005 static PetscErrorCode MatSeqSELLGetAvgSliceWidth_SeqSELL(Mat mat, PetscReal *slicewidth)
2006 {
2007   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
2008 
2009   PetscFunctionBegin;
2010   *slicewidth = 0;
2011   if (a->totalslices) { *slicewidth = (PetscReal)a->sliidx[a->totalslices] / a->sliceheight / a->totalslices; }
2012   PetscFunctionReturn(PETSC_SUCCESS);
2013 }
2014 
2015 static PetscErrorCode MatSeqSELLGetVarSliceSize_SeqSELL(Mat mat, PetscReal *variance)
2016 {
2017   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
2018   PetscReal    mean;
2019   PetscInt     i, totalslices = a->totalslices, *sliidx = a->sliidx;
2020 
2021   PetscFunctionBegin;
2022   *variance = 0;
2023   if (totalslices) {
2024     mean = (PetscReal)sliidx[totalslices] / totalslices;
2025     for (i = 1; i <= totalslices; i++) { *variance += ((PetscReal)(sliidx[i] - sliidx[i - 1]) - mean) * ((PetscReal)(sliidx[i] - sliidx[i - 1]) - mean) / totalslices; }
2026   }
2027   PetscFunctionReturn(PETSC_SUCCESS);
2028 }
2029 
2030 static PetscErrorCode MatSeqSELLSetSliceHeight_SeqSELL(Mat A, PetscInt sliceheight)
2031 {
2032   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2033 
2034   PetscFunctionBegin;
2035   if (A->preallocated) PetscFunctionReturn(PETSC_SUCCESS);
2036   PetscCheck(a->sliceheight <= 0 || a->sliceheight == sliceheight, PETSC_COMM_SELF, PETSC_ERR_SUP, "Cannot change slice height %" PetscInt_FMT " to %" PetscInt_FMT, a->sliceheight, sliceheight);
2037   a->sliceheight = sliceheight;
2038 #if defined(PETSC_HAVE_CUPM)
2039   PetscCheck(PetscMax(DEVICE_MEM_ALIGN, sliceheight) % PetscMin(DEVICE_MEM_ALIGN, sliceheight) == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The slice height is not compatible with DEVICE_MEM_ALIGN (one must be divisible by the other) %" PetscInt_FMT, sliceheight);
2040 #endif
2041   PetscFunctionReturn(PETSC_SUCCESS);
2042 }
2043 
2044 /*@
2045   MatSeqSELLGetFillRatio - returns a ratio that indicates the irregularity of the matrix.
2046 
2047   Not Collective
2048 
2049   Input Parameter:
2050 . A - a MATSEQSELL matrix
2051 
2052   Output Parameter:
2053 . ratio - ratio of number of padded zeros to number of allocated elements
2054 
2055   Level: intermediate
2056 
2057 .seealso: `MATSEQSELL`, `MatSeqSELLGetAvgSliceWidth()`
2058 @*/
2059 PetscErrorCode MatSeqSELLGetFillRatio(Mat A, PetscReal *ratio)
2060 {
2061   PetscFunctionBegin;
2062   PetscUseMethod(A, "MatSeqSELLGetFillRatio_C", (Mat, PetscReal *), (A, ratio));
2063   PetscFunctionReturn(PETSC_SUCCESS);
2064 }
2065 
2066 /*@
2067   MatSeqSELLGetMaxSliceWidth - returns the maximum slice width.
2068 
2069   Not Collective
2070 
2071   Input Parameter:
2072 . A - a MATSEQSELL matrix
2073 
2074   Output Parameter:
2075 . slicewidth - maximum slice width
2076 
2077   Level: intermediate
2078 
2079 .seealso: `MATSEQSELL`, `MatSeqSELLGetAvgSliceWidth()`
2080 @*/
2081 PetscErrorCode MatSeqSELLGetMaxSliceWidth(Mat A, PetscInt *slicewidth)
2082 {
2083   PetscFunctionBegin;
2084   PetscUseMethod(A, "MatSeqSELLGetMaxSliceWidth_C", (Mat, PetscInt *), (A, slicewidth));
2085   PetscFunctionReturn(PETSC_SUCCESS);
2086 }
2087 
2088 /*@
2089   MatSeqSELLGetAvgSliceWidth - returns the average slice width.
2090 
2091   Not Collective
2092 
2093   Input Parameter:
2094 . A - a MATSEQSELL matrix
2095 
2096   Output Parameter:
2097 . slicewidth - average slice width
2098 
2099   Level: intermediate
2100 
2101 .seealso: `MATSEQSELL`, `MatSeqSELLGetMaxSliceWidth()`
2102 @*/
2103 PetscErrorCode MatSeqSELLGetAvgSliceWidth(Mat A, PetscReal *slicewidth)
2104 {
2105   PetscFunctionBegin;
2106   PetscUseMethod(A, "MatSeqSELLGetAvgSliceWidth_C", (Mat, PetscReal *), (A, slicewidth));
2107   PetscFunctionReturn(PETSC_SUCCESS);
2108 }
2109 
2110 /*@
2111   MatSeqSELLSetSliceHeight - sets the slice height.
2112 
2113   Not Collective
2114 
2115   Input Parameters:
2116 + A           - a MATSEQSELL matrix
2117 - sliceheight - slice height
2118 
2119   Notes:
2120   You cannot change the slice height once it have been set.
2121 
2122   The slice height must be set before MatSetUp() or MatXXXSetPreallocation() is called.
2123 
2124   Level: intermediate
2125 
2126 .seealso: `MATSEQSELL`, `MatSeqSELLGetVarSliceSize()`
2127 @*/
2128 PetscErrorCode MatSeqSELLSetSliceHeight(Mat A, PetscInt sliceheight)
2129 {
2130   PetscFunctionBegin;
2131   PetscUseMethod(A, "MatSeqSELLSetSliceHeight_C", (Mat, PetscInt), (A, sliceheight));
2132   PetscFunctionReturn(PETSC_SUCCESS);
2133 }
2134 
2135 /*@
2136   MatSeqSELLGetVarSliceSize - returns the variance of the slice size.
2137 
2138   Not Collective
2139 
2140   Input Parameter:
2141 . A - a MATSEQSELL matrix
2142 
2143   Output Parameter:
2144 . variance - variance of the slice size
2145 
2146   Level: intermediate
2147 
2148 .seealso: `MATSEQSELL`, `MatSeqSELLSetSliceHeight()`
2149 @*/
2150 PetscErrorCode MatSeqSELLGetVarSliceSize(Mat A, PetscReal *variance)
2151 {
2152   PetscFunctionBegin;
2153   PetscUseMethod(A, "MatSeqSELLGetVarSliceSize_C", (Mat, PetscReal *), (A, variance));
2154   PetscFunctionReturn(PETSC_SUCCESS);
2155 }
2156 
2157 #if defined(PETSC_HAVE_CUDA)
2158 PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat);
2159 #endif
2160 #if defined(PETSC_HAVE_HIP)
2161 PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLHIP(Mat);
2162 #endif
2163 
2164 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B)
2165 {
2166   Mat_SeqSELL *b;
2167   PetscMPIInt  size;
2168 
2169   PetscFunctionBegin;
2170   PetscCall(PetscCitationsRegister(citation, &cited));
2171   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
2172   PetscCheck(size <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Comm must be of size 1");
2173 
2174   PetscCall(PetscNew(&b));
2175 
2176   B->data   = (void *)b;
2177   B->ops[0] = MatOps_Values;
2178 
2179   b->row                = NULL;
2180   b->col                = NULL;
2181   b->icol               = NULL;
2182   b->reallocs           = 0;
2183   b->ignorezeroentries  = PETSC_FALSE;
2184   b->roworiented        = PETSC_TRUE;
2185   b->nonew              = 0;
2186   b->diag               = NULL;
2187   b->solve_work         = NULL;
2188   B->spptr              = NULL;
2189   b->saved_values       = NULL;
2190   b->idiag              = NULL;
2191   b->mdiag              = NULL;
2192   b->ssor_work          = NULL;
2193   b->omega              = 1.0;
2194   b->fshift             = 0.0;
2195   b->idiagvalid         = PETSC_FALSE;
2196   b->keepnonzeropattern = PETSC_FALSE;
2197   b->sliceheight        = 0;
2198 
2199   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELL));
2200   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetArray_C", MatSeqSELLGetArray_SeqSELL));
2201   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLRestoreArray_C", MatSeqSELLRestoreArray_SeqSELL));
2202   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqSELL));
2203   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqSELL));
2204   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetPreallocation_C", MatSeqSELLSetPreallocation_SeqSELL));
2205   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqaij_C", MatConvert_SeqSELL_SeqAIJ));
2206 #if defined(PETSC_HAVE_CUDA)
2207   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqsellcuda_C", MatConvert_SeqSELL_SeqSELLCUDA));
2208 #endif
2209 #if defined(PETSC_HAVE_HIP)
2210   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqsellhip_C", MatConvert_SeqSELL_SeqSELLHIP));
2211 #endif
2212   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetFillRatio_C", MatSeqSELLGetFillRatio_SeqSELL));
2213   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetMaxSliceWidth_C", MatSeqSELLGetMaxSliceWidth_SeqSELL));
2214   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetAvgSliceWidth_C", MatSeqSELLGetAvgSliceWidth_SeqSELL));
2215   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetVarSliceSize_C", MatSeqSELLGetVarSliceSize_SeqSELL));
2216   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetSliceHeight_C", MatSeqSELLSetSliceHeight_SeqSELL));
2217 
2218   PetscObjectOptionsBegin((PetscObject)B);
2219   {
2220     PetscInt  newsh = -1;
2221     PetscBool flg;
2222 #if defined(PETSC_HAVE_CUPM)
2223     PetscInt chunksize = 0;
2224 #endif
2225 
2226     PetscCall(PetscOptionsInt("-mat_sell_slice_height", "Set the slice height used to store SELL matrix", "MatSELLSetSliceHeight", newsh, &newsh, &flg));
2227     if (flg) { PetscCall(MatSeqSELLSetSliceHeight(B, newsh)); }
2228 #if defined(PETSC_HAVE_CUPM)
2229     PetscCall(PetscOptionsInt("-mat_sell_chunk_size", "Set the chunksize for load-balanced CUDA/HIP kernels. Choices include 64,128,256,512,1024", NULL, chunksize, &chunksize, &flg));
2230     if (flg) {
2231       PetscCheck(chunksize >= 64 && chunksize <= 1024 && chunksize % 64 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "chunksize must be a number in {64,128,256,512,1024}: value %" PetscInt_FMT, chunksize);
2232       b->chunksize = chunksize;
2233     }
2234 #endif
2235   }
2236   PetscOptionsEnd();
2237   PetscFunctionReturn(PETSC_SUCCESS);
2238 }
2239 
2240 /*
2241  Given a matrix generated with MatGetFactor() duplicates all the information in A into B
2242  */
2243 static PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace)
2244 {
2245   Mat_SeqSELL *c = (Mat_SeqSELL *)C->data, *a = (Mat_SeqSELL *)A->data;
2246   PetscInt     i, m                           = A->rmap->n;
2247   PetscInt     totalslices = a->totalslices;
2248 
2249   PetscFunctionBegin;
2250   C->factortype = A->factortype;
2251   c->row        = NULL;
2252   c->col        = NULL;
2253   c->icol       = NULL;
2254   c->reallocs   = 0;
2255   C->assembled  = PETSC_TRUE;
2256 
2257   PetscCall(PetscLayoutReference(A->rmap, &C->rmap));
2258   PetscCall(PetscLayoutReference(A->cmap, &C->cmap));
2259 
2260   c->sliceheight = a->sliceheight;
2261   PetscCall(PetscMalloc1(c->sliceheight * totalslices, &c->rlen));
2262   PetscCall(PetscMalloc1(totalslices + 1, &c->sliidx));
2263 
2264   for (i = 0; i < m; i++) c->rlen[i] = a->rlen[i];
2265   for (i = 0; i < totalslices + 1; i++) c->sliidx[i] = a->sliidx[i];
2266 
2267   /* allocate the matrix space */
2268   if (mallocmatspace) {
2269     PetscCall(PetscMalloc2(a->maxallocmat, &c->val, a->maxallocmat, &c->colidx));
2270 
2271     c->singlemalloc = PETSC_TRUE;
2272 
2273     if (m > 0) {
2274       PetscCall(PetscArraycpy(c->colidx, a->colidx, a->maxallocmat));
2275       if (cpvalues == MAT_COPY_VALUES) {
2276         PetscCall(PetscArraycpy(c->val, a->val, a->maxallocmat));
2277       } else {
2278         PetscCall(PetscArrayzero(c->val, a->maxallocmat));
2279       }
2280     }
2281   }
2282 
2283   c->ignorezeroentries = a->ignorezeroentries;
2284   c->roworiented       = a->roworiented;
2285   c->nonew             = a->nonew;
2286   if (a->diag) {
2287     PetscCall(PetscMalloc1(m, &c->diag));
2288     for (i = 0; i < m; i++) c->diag[i] = a->diag[i];
2289   } else c->diag = NULL;
2290 
2291   c->solve_work         = NULL;
2292   c->saved_values       = NULL;
2293   c->idiag              = NULL;
2294   c->ssor_work          = NULL;
2295   c->keepnonzeropattern = a->keepnonzeropattern;
2296   c->free_val           = PETSC_TRUE;
2297   c->free_colidx        = PETSC_TRUE;
2298 
2299   c->maxallocmat  = a->maxallocmat;
2300   c->maxallocrow  = a->maxallocrow;
2301   c->rlenmax      = a->rlenmax;
2302   c->nz           = a->nz;
2303   C->preallocated = PETSC_TRUE;
2304 
2305   c->nonzerorowcnt = a->nonzerorowcnt;
2306   C->nonzerostate  = A->nonzerostate;
2307 
2308   PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist));
2309   PetscFunctionReturn(PETSC_SUCCESS);
2310 }
2311 
2312 PetscErrorCode MatDuplicate_SeqSELL(Mat A, MatDuplicateOption cpvalues, Mat *B)
2313 {
2314   PetscFunctionBegin;
2315   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
2316   PetscCall(MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n));
2317   if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2318   PetscCall(MatSetType(*B, ((PetscObject)A)->type_name));
2319   PetscCall(MatDuplicateNoCreate_SeqSELL(*B, A, cpvalues, PETSC_TRUE));
2320   PetscFunctionReturn(PETSC_SUCCESS);
2321 }
2322 
2323 /*MC
2324    MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices,
2325    based on the sliced Ellpack format, {cite}`zhangellpack2018`
2326 
2327    Options Database Key:
2328 . -mat_type seqsell - sets the matrix type to "`MATSEQELL` during a call to `MatSetFromOptions()`
2329 
2330    Level: beginner
2331 
2332 .seealso: `Mat`, `MatCreateSeqSELL()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ`
2333 M*/
2334 
2335 /*MC
2336    MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices, {cite}`zhangellpack2018`
2337 
2338    This matrix type is identical to `MATSEQSELL` when constructed with a single process communicator,
2339    and `MATMPISELL` otherwise.  As a result, for single process communicators,
2340   `MatSeqSELLSetPreallocation()` is supported, and similarly `MatMPISELLSetPreallocation()` is supported
2341   for communicators controlling multiple processes.  It is recommended that you call both of
2342   the above preallocation routines for simplicity.
2343 
2344    Options Database Key:
2345 . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions()
2346 
2347   Level: beginner
2348 
2349   Notes:
2350   This format is only supported for real scalars, double precision, and 32-bit indices (the defaults).
2351 
2352   It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of
2353   non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement.
2354 
2355   Developer Notes:
2356   On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance.
2357 
2358   The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8
2359 .vb
2360                             (2 0  3 4)
2361    Consider the matrix A =  (5 0  6 0)
2362                             (0 0  7 8)
2363                             (0 0  9 9)
2364 
2365    symbolically the Ellpack format can be written as
2366 
2367         (2 3 4 |)           (0 2 3 |)
2368    v =  (5 6 0 |)  colidx = (0 2 2 |)
2369         --------            ---------
2370         (7 8 |)             (2 3 |)
2371         (9 9 |)             (2 3 |)
2372 
2373     The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case).
2374     Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and
2375     zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice.
2376 
2377     The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9)  and for colidx is (0 0 2 2 3 2 2 2 3 3)
2378 
2379 .ve
2380 
2381     See `MatMult_SeqSELL()` for how this format is used with the SIMD operations to achieve high performance.
2382 
2383 .seealso: `Mat`, `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSELL()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ`
2384 M*/
2385 
2386 /*@
2387   MatCreateSeqSELL - Creates a sparse matrix in `MATSEQSELL` format.
2388 
2389   Collective
2390 
2391   Input Parameters:
2392 + comm    - MPI communicator, set to `PETSC_COMM_SELF`
2393 . m       - number of rows
2394 . n       - number of columns
2395 . rlenmax - maximum number of nonzeros in a row, ignored if `rlen` is provided
2396 - rlen    - array containing the number of nonzeros in the various rows (possibly different for each row) or NULL
2397 
2398   Output Parameter:
2399 . A - the matrix
2400 
2401   Level: intermediate
2402 
2403   Notes:
2404   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
2405   MatXXXXSetPreallocation() paradigm instead of this routine directly.
2406   [MatXXXXSetPreallocation() is, for example, `MatSeqSELLSetPreallocation()`]
2407 
2408   Specify the preallocated storage with either `rlenmax` or `rlen` (not both).
2409   Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory
2410   allocation.
2411 
2412 .seealso: `Mat`, `MATSEQSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATMPISELL`
2413  @*/
2414 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt rlenmax, const PetscInt rlen[], Mat *A)
2415 {
2416   PetscFunctionBegin;
2417   PetscCall(MatCreate(comm, A));
2418   PetscCall(MatSetSizes(*A, m, n, m, n));
2419   PetscCall(MatSetType(*A, MATSEQSELL));
2420   PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A, rlenmax, rlen));
2421   PetscFunctionReturn(PETSC_SUCCESS);
2422 }
2423 
2424 PetscErrorCode MatEqual_SeqSELL(Mat A, Mat B, PetscBool *flg)
2425 {
2426   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data, *b = (Mat_SeqSELL *)B->data;
2427   PetscInt     totalslices = a->totalslices;
2428 
2429   PetscFunctionBegin;
2430   /* If the  matrix dimensions are not equal,or no of nonzeros */
2431   if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) || (a->nz != b->nz) || (a->rlenmax != b->rlenmax)) {
2432     *flg = PETSC_FALSE;
2433     PetscFunctionReturn(PETSC_SUCCESS);
2434   }
2435   /* if the a->colidx are the same */
2436   PetscCall(PetscArraycmp(a->colidx, b->colidx, a->sliidx[totalslices], flg));
2437   if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
2438   /* if a->val are the same */
2439   PetscCall(PetscArraycmp(a->val, b->val, a->sliidx[totalslices], flg));
2440   PetscFunctionReturn(PETSC_SUCCESS);
2441 }
2442 
2443 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A)
2444 {
2445   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2446 
2447   PetscFunctionBegin;
2448   a->idiagvalid = PETSC_FALSE;
2449   PetscFunctionReturn(PETSC_SUCCESS);
2450 }
2451 
2452 PetscErrorCode MatConjugate_SeqSELL(Mat A)
2453 {
2454 #if defined(PETSC_USE_COMPLEX)
2455   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2456   PetscInt     i;
2457   PetscScalar *val = a->val;
2458 
2459   PetscFunctionBegin;
2460   for (i = 0; i < a->sliidx[a->totalslices]; i++) { val[i] = PetscConj(val[i]); }
2461   #if defined(PETSC_HAVE_CUPM)
2462   if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU;
2463   #endif
2464 #else
2465   PetscFunctionBegin;
2466 #endif
2467   PetscFunctionReturn(PETSC_SUCCESS);
2468 }
2469