xref: /petsc/src/mat/impls/sell/seq/sell.c (revision 8e010fa18d00c25e58c23165efbc8aef665b8a2c)
1 
2 /*
3   Defines the basic matrix operations for the SELL matrix storage format.
4 */
5 #include <../src/mat/impls/sell/seq/sell.h> /*I   "petscmat.h"  I*/
6 #include <petscblaslapack.h>
7 #include <petsc/private/kernels/blocktranspose.h>
8 
9 static PetscBool  cited      = PETSC_FALSE;
10 static const char citation[] = "@inproceedings{ZhangELLPACK2018,\n"
11                                " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n"
12                                " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n"
13                                " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n"
14                                " year = 2018\n"
15                                "}\n";
16 
17 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
18 
19   #include <immintrin.h>
20 
21   #if !defined(_MM_SCALE_8)
22     #define _MM_SCALE_8 8
23   #endif
24 
25   #if defined(__AVX512F__)
26     /* these do not work
27    vec_idx  = _mm512_loadunpackhi_epi32(vec_idx,acolidx);
28    vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval);
29   */
30     #define AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
31       /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \
32       vec_idx  = _mm256_loadu_si256((__m256i const *)acolidx); \
33       vec_vals = _mm512_loadu_pd(aval); \
34       vec_x    = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); \
35       vec_y    = _mm512_fmadd_pd(vec_x, vec_vals, vec_y)
36   #elif defined(__AVX2__) && defined(__FMA__)
37     #define AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
38       vec_vals = _mm256_loadu_pd(aval); \
39       vec_idx  = _mm_loadu_si128((__m128i const *)acolidx); /* SSE2 */ \
40       vec_x    = _mm256_i32gather_pd(x, vec_idx, _MM_SCALE_8); \
41       vec_y    = _mm256_fmadd_pd(vec_x, vec_vals, vec_y)
42   #endif
43 #endif /* PETSC_HAVE_IMMINTRIN_H */
44 
45 /*@C
46   MatSeqSELLSetPreallocation - For good matrix assembly performance
47   the user should preallocate the matrix storage by setting the parameter `nz`
48   (or the array `nnz`).
49 
50   Collective
51 
52   Input Parameters:
53 + B       - The `MATSEQSELL` matrix
54 . rlenmax - number of nonzeros per row (same for all rows), ignored if `rlen` is provided
55 - rlen    - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
56 
57   Level: intermediate
58 
59   Notes:
60   Specify the preallocated storage with either `rlenmax` or `rlen` (not both).
61   Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory
62   allocation.
63 
64   You can call `MatGetInfo()` to get information on how effective the preallocation was;
65   for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
66   You can also run with the option `-info` and look for messages with the string
67   malloc in them to see if additional memory allocation was needed.
68 
69   Developer Notes:
70   Use `rlenmax` of `MAT_SKIP_ALLOCATION` to not allocate any space for the matrix
71   entries or columns indices.
72 
73   The maximum number of nonzeos in any row should be as accurate as possible.
74   If it is underestimated, you will get bad performance due to reallocation
75   (`MatSeqXSELLReallocateSELL()`).
76 
77 .seealso: `Mat`, `MATSEQSELL`, `MATSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()`
78  @*/
79 PetscErrorCode MatSeqSELLSetPreallocation(Mat B, PetscInt rlenmax, const PetscInt rlen[])
80 {
81   PetscFunctionBegin;
82   PetscValidHeaderSpecific(B, MAT_CLASSID, 1);
83   PetscValidType(B, 1);
84   PetscTryMethod(B, "MatSeqSELLSetPreallocation_C", (Mat, PetscInt, const PetscInt[]), (B, rlenmax, rlen));
85   PetscFunctionReturn(PETSC_SUCCESS);
86 }
87 
88 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B, PetscInt maxallocrow, const PetscInt rlen[])
89 {
90   Mat_SeqSELL *b;
91   PetscInt     i, j, totalslices;
92 #if defined(PETSC_HAVE_CUDA)
93   PetscInt rlenmax = 0;
94 #endif
95   PetscBool skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE;
96 
97   PetscFunctionBegin;
98   if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE;
99   if (maxallocrow == MAT_SKIP_ALLOCATION) {
100     skipallocation = PETSC_TRUE;
101     maxallocrow    = 0;
102   }
103 
104   PetscCall(PetscLayoutSetUp(B->rmap));
105   PetscCall(PetscLayoutSetUp(B->cmap));
106 
107   /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */
108   if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5;
109   PetscCheck(maxallocrow >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "maxallocrow cannot be less than 0: value %" PetscInt_FMT, maxallocrow);
110   if (rlen) {
111     for (i = 0; i < B->rmap->n; i++) {
112       PetscCheck(rlen[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, rlen[i]);
113       PetscCheck(rlen[i] <= B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT, i, rlen[i], B->cmap->n);
114     }
115   }
116 
117   B->preallocated = PETSC_TRUE;
118 
119   b = (Mat_SeqSELL *)B->data;
120 
121   if (!b->sliceheight) { /* not set yet */
122 #if defined(PETSC_HAVE_CUDA)
123     b->sliceheight = 16;
124 #else
125     b->sliceheight = 8;
126 #endif
127   }
128   totalslices    = PetscCeilInt(B->rmap->n, b->sliceheight);
129   b->totalslices = totalslices;
130   if (!skipallocation) {
131     if (B->rmap->n % b->sliceheight) PetscCall(PetscInfo(B, "Padding rows to the SEQSELL matrix because the number of rows is not the multiple of the slice height (value %" PetscInt_FMT ")\n", B->rmap->n));
132 
133     if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */
134       PetscCall(PetscMalloc1(totalslices + 1, &b->sliidx));
135     }
136     if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */
137       if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10;
138       else if (maxallocrow < 0) maxallocrow = 1;
139 #if defined(PETSC_HAVE_CUDA)
140       rlenmax = maxallocrow;
141       /* Pad the slice to DEVICE_MEM_ALIGN */
142       while (b->sliceheight * maxallocrow % DEVICE_MEM_ALIGN) maxallocrow++;
143 #endif
144       for (i = 0; i <= totalslices; i++) b->sliidx[i] = b->sliceheight * i * maxallocrow;
145     } else {
146 #if defined(PETSC_HAVE_CUDA)
147       PetscInt mul = DEVICE_MEM_ALIGN / b->sliceheight;
148 #endif
149       maxallocrow  = 0;
150       b->sliidx[0] = 0;
151       for (i = 1; i < totalslices; i++) {
152         b->sliidx[i] = 0;
153         for (j = 0; j < b->sliceheight; j++) { b->sliidx[i] = PetscMax(b->sliidx[i], rlen[b->sliceheight * (i - 1) + j]); }
154 #if defined(PETSC_HAVE_CUDA)
155         rlenmax = PetscMax(b->sliidx[i], rlenmax);
156         /* Pad the slice to DEVICE_MEM_ALIGN */
157         b->sliidx[i] = ((b->sliidx[i] - 1) / mul + 1) * mul;
158 #endif
159         maxallocrow = PetscMax(b->sliidx[i], maxallocrow);
160         PetscCall(PetscIntSumError(b->sliidx[i - 1], b->sliceheight * b->sliidx[i], &b->sliidx[i]));
161       }
162       /* last slice */
163       b->sliidx[totalslices] = 0;
164       for (j = b->sliceheight * (totalslices - 1); j < B->rmap->n; j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices], rlen[j]);
165 #if defined(PETSC_HAVE_CUDA)
166       rlenmax                = PetscMax(b->sliidx[i], rlenmax);
167       b->sliidx[totalslices] = ((b->sliidx[totalslices] - 1) / mul + 1) * mul;
168 #endif
169       maxallocrow            = PetscMax(b->sliidx[totalslices], maxallocrow);
170       b->sliidx[totalslices] = b->sliidx[totalslices - 1] + b->sliceheight * b->sliidx[totalslices];
171     }
172 
173     /* allocate space for val, colidx, rlen */
174     /* FIXME: should B's old memory be unlogged? */
175     PetscCall(MatSeqXSELLFreeSELL(B, &b->val, &b->colidx));
176     /* FIXME: assuming an element of the bit array takes 8 bits */
177     PetscCall(PetscMalloc2(b->sliidx[totalslices], &b->val, b->sliidx[totalslices], &b->colidx));
178     /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */
179     PetscCall(PetscCalloc1(b->sliceheight * totalslices, &b->rlen));
180 
181     b->singlemalloc = PETSC_TRUE;
182     b->free_val     = PETSC_TRUE;
183     b->free_colidx  = PETSC_TRUE;
184   } else {
185     b->free_val    = PETSC_FALSE;
186     b->free_colidx = PETSC_FALSE;
187   }
188 
189   b->nz          = 0;
190   b->maxallocrow = maxallocrow;
191 #if defined(PETSC_HAVE_CUDA)
192   b->rlenmax = rlenmax;
193 #else
194   b->rlenmax = maxallocrow;
195 #endif
196   b->maxallocmat      = b->sliidx[totalslices];
197   B->info.nz_unneeded = (double)b->maxallocmat;
198   if (realalloc) PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));
199   PetscFunctionReturn(PETSC_SUCCESS);
200 }
201 
202 static PetscErrorCode MatGetRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
203 {
204   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
205   PetscInt     shift;
206 
207   PetscFunctionBegin;
208   PetscCheck(row >= 0 && row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range", row);
209   if (nz) *nz = a->rlen[row];
210   shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight);
211   if (!a->getrowcols) { PetscCall(PetscMalloc2(a->rlenmax, &a->getrowcols, a->rlenmax, &a->getrowvals)); }
212   if (idx) {
213     PetscInt j;
214     for (j = 0; j < a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift + a->sliceheight * j];
215     *idx = a->getrowcols;
216   }
217   if (v) {
218     PetscInt j;
219     for (j = 0; j < a->rlen[row]; j++) a->getrowvals[j] = a->val[shift + a->sliceheight * j];
220     *v = a->getrowvals;
221   }
222   PetscFunctionReturn(PETSC_SUCCESS);
223 }
224 
225 static PetscErrorCode MatRestoreRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
226 {
227   PetscFunctionBegin;
228   PetscFunctionReturn(PETSC_SUCCESS);
229 }
230 
231 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
232 {
233   Mat          B;
234   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
235   PetscInt     i;
236 
237   PetscFunctionBegin;
238   if (reuse == MAT_REUSE_MATRIX) {
239     B = *newmat;
240     PetscCall(MatZeroEntries(B));
241   } else {
242     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
243     PetscCall(MatSetSizes(B, A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N));
244     PetscCall(MatSetType(B, MATSEQAIJ));
245     PetscCall(MatSeqAIJSetPreallocation(B, 0, a->rlen));
246   }
247 
248   for (i = 0; i < A->rmap->n; i++) {
249     PetscInt     nz = 0, *cols = NULL;
250     PetscScalar *vals = NULL;
251 
252     PetscCall(MatGetRow_SeqSELL(A, i, &nz, &cols, &vals));
253     PetscCall(MatSetValues(B, 1, &i, nz, cols, vals, INSERT_VALUES));
254     PetscCall(MatRestoreRow_SeqSELL(A, i, &nz, &cols, &vals));
255   }
256 
257   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
258   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
259   B->rmap->bs = A->rmap->bs;
260 
261   if (reuse == MAT_INPLACE_MATRIX) {
262     PetscCall(MatHeaderReplace(A, &B));
263   } else {
264     *newmat = B;
265   }
266   PetscFunctionReturn(PETSC_SUCCESS);
267 }
268 
269 #include <../src/mat/impls/aij/seq/aij.h>
270 
271 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
272 {
273   Mat                B;
274   Mat_SeqAIJ        *a  = (Mat_SeqAIJ *)A->data;
275   PetscInt          *ai = a->i, m = A->rmap->N, n = A->cmap->N, i, *rowlengths, row, ncols;
276   const PetscInt    *cols;
277   const PetscScalar *vals;
278 
279   PetscFunctionBegin;
280 
281   if (reuse == MAT_REUSE_MATRIX) {
282     B = *newmat;
283   } else {
284     if (PetscDefined(USE_DEBUG) || !a->ilen) {
285       PetscCall(PetscMalloc1(m, &rowlengths));
286       for (i = 0; i < m; i++) rowlengths[i] = ai[i + 1] - ai[i];
287     }
288     if (PetscDefined(USE_DEBUG) && a->ilen) {
289       PetscBool eq;
290       PetscCall(PetscMemcmp(rowlengths, a->ilen, m * sizeof(PetscInt), &eq));
291       PetscCheck(eq, PETSC_COMM_SELF, PETSC_ERR_PLIB, "SeqAIJ ilen array incorrect");
292       PetscCall(PetscFree(rowlengths));
293       rowlengths = a->ilen;
294     } else if (a->ilen) rowlengths = a->ilen;
295     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
296     PetscCall(MatSetSizes(B, m, n, m, n));
297     PetscCall(MatSetType(B, MATSEQSELL));
298     PetscCall(MatSeqSELLSetPreallocation(B, 0, rowlengths));
299     if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths));
300   }
301 
302   for (row = 0; row < m; row++) {
303     PetscCall(MatGetRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
304     PetscCall(MatSetValues_SeqSELL(B, 1, &row, ncols, cols, vals, INSERT_VALUES));
305     PetscCall(MatRestoreRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
306   }
307   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
308   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
309   B->rmap->bs = A->rmap->bs;
310 
311   if (reuse == MAT_INPLACE_MATRIX) {
312     PetscCall(MatHeaderReplace(A, &B));
313   } else {
314     *newmat = B;
315   }
316   PetscFunctionReturn(PETSC_SUCCESS);
317 }
318 
319 PetscErrorCode MatMult_SeqSELL(Mat A, Vec xx, Vec yy)
320 {
321   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
322   PetscScalar       *y;
323   const PetscScalar *x;
324   const MatScalar   *aval        = a->val;
325   PetscInt           totalslices = a->totalslices;
326   const PetscInt    *acolidx     = a->colidx;
327   PetscInt           i, j;
328 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
329   __m512d  vec_x, vec_y, vec_vals;
330   __m256i  vec_idx;
331   __mmask8 mask;
332   __m512d  vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
333   __m256i  vec_idx2, vec_idx3, vec_idx4;
334 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
335   __m128i   vec_idx;
336   __m256d   vec_x, vec_y, vec_y2, vec_vals;
337   MatScalar yval;
338   PetscInt  r, rows_left, row, nnz_in_row;
339 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
340   __m128d   vec_x_tmp;
341   __m256d   vec_x, vec_y, vec_y2, vec_vals;
342   MatScalar yval;
343   PetscInt  r, rows_left, row, nnz_in_row;
344 #else
345   PetscInt     k, sliceheight = a->sliceheight;
346   PetscScalar *sum;
347 #endif
348 
349 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
350   #pragma disjoint(*x, *y, *aval)
351 #endif
352 
353   PetscFunctionBegin;
354   PetscCall(VecGetArrayRead(xx, &x));
355   PetscCall(VecGetArray(yy, &y));
356 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
357   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
358   for (i = 0; i < totalslices; i++) { /* loop over slices */
359     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
360     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
361 
362     vec_y  = _mm512_setzero_pd();
363     vec_y2 = _mm512_setzero_pd();
364     vec_y3 = _mm512_setzero_pd();
365     vec_y4 = _mm512_setzero_pd();
366 
367     j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */
368     switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
369     case 3:
370       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
371       acolidx += 8;
372       aval += 8;
373       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
374       acolidx += 8;
375       aval += 8;
376       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
377       acolidx += 8;
378       aval += 8;
379       j += 3;
380       break;
381     case 2:
382       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
383       acolidx += 8;
384       aval += 8;
385       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
386       acolidx += 8;
387       aval += 8;
388       j += 2;
389       break;
390     case 1:
391       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
392       acolidx += 8;
393       aval += 8;
394       j += 1;
395       break;
396     }
397   #pragma novector
398     for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
399       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
400       acolidx += 8;
401       aval += 8;
402       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
403       acolidx += 8;
404       aval += 8;
405       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
406       acolidx += 8;
407       aval += 8;
408       AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
409       acolidx += 8;
410       aval += 8;
411     }
412 
413     vec_y = _mm512_add_pd(vec_y, vec_y2);
414     vec_y = _mm512_add_pd(vec_y, vec_y3);
415     vec_y = _mm512_add_pd(vec_y, vec_y4);
416     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
417       mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
418       _mm512_mask_storeu_pd(&y[8 * i], mask, vec_y);
419     } else {
420       _mm512_storeu_pd(&y[8 * i], vec_y);
421     }
422   }
423 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
424   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
425   for (i = 0; i < totalslices; i++) { /* loop over full slices */
426     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
427     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
428 
429     /* last slice may have padding rows. Don't use vectorization. */
430     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
431       rows_left = A->rmap->n - 8 * i;
432       for (r = 0; r < rows_left; ++r) {
433         yval       = (MatScalar)0;
434         row        = 8 * i + r;
435         nnz_in_row = a->rlen[row];
436         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
437         y[row] = yval;
438       }
439       break;
440     }
441 
442     vec_y  = _mm256_setzero_pd();
443     vec_y2 = _mm256_setzero_pd();
444 
445   /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
446   #pragma novector
447   #pragma unroll(2)
448     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
449       AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
450       aval += 4;
451       acolidx += 4;
452       AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y2);
453       aval += 4;
454       acolidx += 4;
455     }
456 
457     _mm256_storeu_pd(y + i * 8, vec_y);
458     _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
459   }
460 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
461   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
462   for (i = 0; i < totalslices; i++) { /* loop over full slices */
463     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
464     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
465 
466     vec_y  = _mm256_setzero_pd();
467     vec_y2 = _mm256_setzero_pd();
468 
469     /* last slice may have padding rows. Don't use vectorization. */
470     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
471       rows_left = A->rmap->n - 8 * i;
472       for (r = 0; r < rows_left; ++r) {
473         yval       = (MatScalar)0;
474         row        = 8 * i + r;
475         nnz_in_row = a->rlen[row];
476         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
477         y[row] = yval;
478       }
479       break;
480     }
481 
482   /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
483   #pragma novector
484   #pragma unroll(2)
485     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
486       vec_vals  = _mm256_loadu_pd(aval);
487       vec_x_tmp = _mm_setzero_pd();
488       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
489       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
490       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
491       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
492       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
493       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
494       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
495       aval += 4;
496 
497       vec_vals  = _mm256_loadu_pd(aval);
498       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
499       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
500       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
501       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
502       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
503       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
504       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
505       aval += 4;
506     }
507 
508     _mm256_storeu_pd(y + i * 8, vec_y);
509     _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
510   }
511 #else
512   PetscCall(PetscMalloc1(sliceheight, &sum));
513   for (i = 0; i < totalslices; i++) { /* loop over slices */
514     for (j = 0; j < sliceheight; j++) {
515       sum[j] = 0.0;
516       for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]];
517     }
518     if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { /* if last slice has padding rows */
519       for (j = 0; j < (A->rmap->n % sliceheight); j++) y[sliceheight * i + j] = sum[j];
520     } else {
521       for (j = 0; j < sliceheight; j++) y[sliceheight * i + j] = sum[j];
522     }
523   }
524   PetscCall(PetscFree(sum));
525 #endif
526 
527   PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); /* theoretical minimal FLOPs */
528   PetscCall(VecRestoreArrayRead(xx, &x));
529   PetscCall(VecRestoreArray(yy, &y));
530   PetscFunctionReturn(PETSC_SUCCESS);
531 }
532 
533 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h>
534 PetscErrorCode MatMultAdd_SeqSELL(Mat A, Vec xx, Vec yy, Vec zz)
535 {
536   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
537   PetscScalar       *y, *z;
538   const PetscScalar *x;
539   const MatScalar   *aval        = a->val;
540   PetscInt           totalslices = a->totalslices;
541   const PetscInt    *acolidx     = a->colidx;
542   PetscInt           i, j;
543 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
544   __m512d  vec_x, vec_y, vec_vals;
545   __m256i  vec_idx;
546   __mmask8 mask = 0;
547   __m512d  vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
548   __m256i  vec_idx2, vec_idx3, vec_idx4;
549 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
550   __m128d   vec_x_tmp;
551   __m256d   vec_x, vec_y, vec_y2, vec_vals;
552   MatScalar yval;
553   PetscInt  r, row, nnz_in_row;
554 #else
555   PetscInt     k, sliceheight = a->sliceheight;
556   PetscScalar *sum;
557 #endif
558 
559 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
560   #pragma disjoint(*x, *y, *aval)
561 #endif
562 
563   PetscFunctionBegin;
564   if (!a->nz) {
565     PetscCall(VecCopy(yy, zz));
566     PetscFunctionReturn(PETSC_SUCCESS);
567   }
568   PetscCall(VecGetArrayRead(xx, &x));
569   PetscCall(VecGetArrayPair(yy, zz, &y, &z));
570 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
571   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
572   for (i = 0; i < totalslices; i++) { /* loop over slices */
573     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
574     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
575 
576     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
577       mask  = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
578       vec_y = _mm512_mask_loadu_pd(vec_y, mask, &y[8 * i]);
579     } else {
580       vec_y = _mm512_loadu_pd(&y[8 * i]);
581     }
582     vec_y2 = _mm512_setzero_pd();
583     vec_y3 = _mm512_setzero_pd();
584     vec_y4 = _mm512_setzero_pd();
585 
586     j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */
587     switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
588     case 3:
589       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
590       acolidx += 8;
591       aval += 8;
592       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
593       acolidx += 8;
594       aval += 8;
595       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
596       acolidx += 8;
597       aval += 8;
598       j += 3;
599       break;
600     case 2:
601       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
602       acolidx += 8;
603       aval += 8;
604       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
605       acolidx += 8;
606       aval += 8;
607       j += 2;
608       break;
609     case 1:
610       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
611       acolidx += 8;
612       aval += 8;
613       j += 1;
614       break;
615     }
616   #pragma novector
617     for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
618       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
619       acolidx += 8;
620       aval += 8;
621       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
622       acolidx += 8;
623       aval += 8;
624       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
625       acolidx += 8;
626       aval += 8;
627       AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
628       acolidx += 8;
629       aval += 8;
630     }
631 
632     vec_y = _mm512_add_pd(vec_y, vec_y2);
633     vec_y = _mm512_add_pd(vec_y, vec_y3);
634     vec_y = _mm512_add_pd(vec_y, vec_y4);
635     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
636       _mm512_mask_storeu_pd(&z[8 * i], mask, vec_y);
637     } else {
638       _mm512_storeu_pd(&z[8 * i], vec_y);
639     }
640   }
641 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
642   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
643   for (i = 0; i < totalslices; i++) { /* loop over full slices */
644     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
645     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
646 
647     /* last slice may have padding rows. Don't use vectorization. */
648     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
649       for (r = 0; r < (A->rmap->n & 0x07); ++r) {
650         row        = 8 * i + r;
651         yval       = (MatScalar)0.0;
652         nnz_in_row = a->rlen[row];
653         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
654         z[row] = y[row] + yval;
655       }
656       break;
657     }
658 
659     vec_y  = _mm256_loadu_pd(y + 8 * i);
660     vec_y2 = _mm256_loadu_pd(y + 8 * i + 4);
661 
662     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
663     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
664       vec_vals  = _mm256_loadu_pd(aval);
665       vec_x_tmp = _mm_setzero_pd();
666       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
667       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
668       vec_x     = _mm256_setzero_pd();
669       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
670       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
671       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
672       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
673       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
674       aval += 4;
675 
676       vec_vals  = _mm256_loadu_pd(aval);
677       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
678       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
679       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
680       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
681       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
682       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
683       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
684       aval += 4;
685     }
686 
687     _mm256_storeu_pd(z + i * 8, vec_y);
688     _mm256_storeu_pd(z + i * 8 + 4, vec_y2);
689   }
690 #else
691   PetscCall(PetscMalloc1(sliceheight, &sum));
692   for (i = 0; i < totalslices; i++) { /* loop over slices */
693     for (j = 0; j < sliceheight; j++) {
694       sum[j] = 0.0;
695       for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]];
696     }
697     if (i == totalslices - 1 && (A->rmap->n % sliceheight)) {
698       for (j = 0; j < (A->rmap->n % sliceheight); j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j];
699     } else {
700       for (j = 0; j < sliceheight; j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j];
701     }
702   }
703   PetscCall(PetscFree(sum));
704 #endif
705 
706   PetscCall(PetscLogFlops(2.0 * a->nz));
707   PetscCall(VecRestoreArrayRead(xx, &x));
708   PetscCall(VecRestoreArrayPair(yy, zz, &y, &z));
709   PetscFunctionReturn(PETSC_SUCCESS);
710 }
711 
712 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A, Vec xx, Vec zz, Vec yy)
713 {
714   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
715   PetscScalar       *y;
716   const PetscScalar *x;
717   const MatScalar   *aval    = a->val;
718   const PetscInt    *acolidx = a->colidx;
719   PetscInt           i, j, r, row, nnz_in_row, totalslices = a->totalslices, sliceheight = a->sliceheight;
720 
721 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
722   #pragma disjoint(*x, *y, *aval)
723 #endif
724 
725   PetscFunctionBegin;
726   if (A->symmetric == PETSC_BOOL3_TRUE) {
727     PetscCall(MatMultAdd_SeqSELL(A, xx, zz, yy));
728     PetscFunctionReturn(PETSC_SUCCESS);
729   }
730   if (zz != yy) PetscCall(VecCopy(zz, yy));
731 
732   if (a->nz) {
733     PetscCall(VecGetArrayRead(xx, &x));
734     PetscCall(VecGetArray(yy, &y));
735     for (i = 0; i < a->totalslices; i++) { /* loop over slices */
736       if (i == totalslices - 1 && (A->rmap->n % sliceheight)) {
737         for (r = 0; r < (A->rmap->n % sliceheight); ++r) {
738           row        = sliceheight * i + r;
739           nnz_in_row = a->rlen[row];
740           for (j = 0; j < nnz_in_row; ++j) y[acolidx[sliceheight * j + r]] += aval[sliceheight * j + r] * x[row];
741         }
742         break;
743       }
744       for (r = 0; r < sliceheight; ++r)
745         for (j = a->sliidx[i] + r; j < a->sliidx[i + 1]; j += sliceheight) y[acolidx[j]] += aval[j] * x[sliceheight * i + r];
746     }
747     PetscCall(PetscLogFlops(2.0 * a->nz));
748     PetscCall(VecRestoreArrayRead(xx, &x));
749     PetscCall(VecRestoreArray(yy, &y));
750   }
751   PetscFunctionReturn(PETSC_SUCCESS);
752 }
753 
754 PetscErrorCode MatMultTranspose_SeqSELL(Mat A, Vec xx, Vec yy)
755 {
756   PetscFunctionBegin;
757   if (A->symmetric == PETSC_BOOL3_TRUE) {
758     PetscCall(MatMult_SeqSELL(A, xx, yy));
759   } else {
760     PetscCall(VecSet(yy, 0.0));
761     PetscCall(MatMultTransposeAdd_SeqSELL(A, xx, yy, yy));
762   }
763   PetscFunctionReturn(PETSC_SUCCESS);
764 }
765 
766 /*
767      Checks for missing diagonals
768 */
769 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A, PetscBool *missing, PetscInt *d)
770 {
771   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
772   PetscInt    *diag, i;
773 
774   PetscFunctionBegin;
775   *missing = PETSC_FALSE;
776   if (A->rmap->n > 0 && !(a->colidx)) {
777     *missing = PETSC_TRUE;
778     if (d) *d = 0;
779     PetscCall(PetscInfo(A, "Matrix has no entries therefore is missing diagonal\n"));
780   } else {
781     diag = a->diag;
782     for (i = 0; i < A->rmap->n; i++) {
783       if (diag[i] == -1) {
784         *missing = PETSC_TRUE;
785         if (d) *d = i;
786         PetscCall(PetscInfo(A, "Matrix is missing diagonal number %" PetscInt_FMT "\n", i));
787         break;
788       }
789     }
790   }
791   PetscFunctionReturn(PETSC_SUCCESS);
792 }
793 
794 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A)
795 {
796   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
797   PetscInt     i, j, m = A->rmap->n, shift;
798 
799   PetscFunctionBegin;
800   if (!a->diag) {
801     PetscCall(PetscMalloc1(m, &a->diag));
802     a->free_diag = PETSC_TRUE;
803   }
804   for (i = 0; i < m; i++) {                                          /* loop over rows */
805     shift      = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
806     a->diag[i] = -1;
807     for (j = 0; j < a->rlen[i]; j++) {
808       if (a->colidx[shift + a->sliceheight * j] == i) {
809         a->diag[i] = shift + a->sliceheight * j;
810         break;
811       }
812     }
813   }
814   PetscFunctionReturn(PETSC_SUCCESS);
815 }
816 
817 /*
818   Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways
819 */
820 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A, PetscScalar omega, PetscScalar fshift)
821 {
822   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
823   PetscInt     i, *diag, m = A->rmap->n;
824   MatScalar   *val = a->val;
825   PetscScalar *idiag, *mdiag;
826 
827   PetscFunctionBegin;
828   if (a->idiagvalid) PetscFunctionReturn(PETSC_SUCCESS);
829   PetscCall(MatMarkDiagonal_SeqSELL(A));
830   diag = a->diag;
831   if (!a->idiag) {
832     PetscCall(PetscMalloc3(m, &a->idiag, m, &a->mdiag, m, &a->ssor_work));
833     val = a->val;
834   }
835   mdiag = a->mdiag;
836   idiag = a->idiag;
837 
838   if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) {
839     for (i = 0; i < m; i++) {
840       mdiag[i] = val[diag[i]];
841       if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */
842         PetscCheck(PetscRealPart(fshift), PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Zero diagonal on row %" PetscInt_FMT, i);
843         PetscCall(PetscInfo(A, "Zero diagonal on row %" PetscInt_FMT "\n", i));
844         A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
845         A->factorerror_zeropivot_value = 0.0;
846         A->factorerror_zeropivot_row   = i;
847       }
848       idiag[i] = 1.0 / val[diag[i]];
849     }
850     PetscCall(PetscLogFlops(m));
851   } else {
852     for (i = 0; i < m; i++) {
853       mdiag[i] = val[diag[i]];
854       idiag[i] = omega / (fshift + val[diag[i]]);
855     }
856     PetscCall(PetscLogFlops(2.0 * m));
857   }
858   a->idiagvalid = PETSC_TRUE;
859   PetscFunctionReturn(PETSC_SUCCESS);
860 }
861 
862 PetscErrorCode MatZeroEntries_SeqSELL(Mat A)
863 {
864   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
865 
866   PetscFunctionBegin;
867   PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices]));
868   PetscCall(MatSeqSELLInvalidateDiagonal(A));
869   PetscFunctionReturn(PETSC_SUCCESS);
870 }
871 
872 PetscErrorCode MatDestroy_SeqSELL(Mat A)
873 {
874   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
875 
876   PetscFunctionBegin;
877   PetscCall(PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->n, A->cmap->n, a->nz));
878   PetscCall(MatSeqXSELLFreeSELL(A, &a->val, &a->colidx));
879   PetscCall(ISDestroy(&a->row));
880   PetscCall(ISDestroy(&a->col));
881   PetscCall(PetscFree(a->diag));
882   PetscCall(PetscFree(a->rlen));
883   PetscCall(PetscFree(a->sliidx));
884   PetscCall(PetscFree3(a->idiag, a->mdiag, a->ssor_work));
885   PetscCall(PetscFree(a->solve_work));
886   PetscCall(ISDestroy(&a->icol));
887   PetscCall(PetscFree(a->saved_values));
888   PetscCall(PetscFree2(a->getrowcols, a->getrowvals));
889   PetscCall(PetscFree(A->data));
890 #if defined(PETSC_HAVE_CUDA)
891   PetscCall(PetscFree(a->chunk_slice_map));
892 #endif
893 
894   PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL));
895   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL));
896   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL));
897   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetPreallocation_C", NULL));
898   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetArray_C", NULL));
899   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLRestoreArray_C", NULL));
900   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqaij_C", NULL));
901 #if defined(PETSC_HAVE_CUDA)
902   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqsellcuda_C", NULL));
903 #endif
904   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetFillRatio_C", NULL));
905   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetMaxSliceWidth_C", NULL));
906   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetAvgSliceWidth_C", NULL));
907   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetVarSliceSize_C", NULL));
908   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetSliceHeight_C", NULL));
909   PetscFunctionReturn(PETSC_SUCCESS);
910 }
911 
912 PetscErrorCode MatSetOption_SeqSELL(Mat A, MatOption op, PetscBool flg)
913 {
914   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
915 
916   PetscFunctionBegin;
917   switch (op) {
918   case MAT_ROW_ORIENTED:
919     a->roworiented = flg;
920     break;
921   case MAT_KEEP_NONZERO_PATTERN:
922     a->keepnonzeropattern = flg;
923     break;
924   case MAT_NEW_NONZERO_LOCATIONS:
925     a->nonew = (flg ? 0 : 1);
926     break;
927   case MAT_NEW_NONZERO_LOCATION_ERR:
928     a->nonew = (flg ? -1 : 0);
929     break;
930   case MAT_NEW_NONZERO_ALLOCATION_ERR:
931     a->nonew = (flg ? -2 : 0);
932     break;
933   case MAT_UNUSED_NONZERO_LOCATION_ERR:
934     a->nounused = (flg ? -1 : 0);
935     break;
936   case MAT_FORCE_DIAGONAL_ENTRIES:
937   case MAT_IGNORE_OFF_PROC_ENTRIES:
938   case MAT_USE_HASH_TABLE:
939   case MAT_SORTED_FULL:
940     PetscCall(PetscInfo(A, "Option %s ignored\n", MatOptions[op]));
941     break;
942   case MAT_SPD:
943   case MAT_SYMMETRIC:
944   case MAT_STRUCTURALLY_SYMMETRIC:
945   case MAT_HERMITIAN:
946   case MAT_SYMMETRY_ETERNAL:
947   case MAT_STRUCTURAL_SYMMETRY_ETERNAL:
948   case MAT_SPD_ETERNAL:
949     /* These options are handled directly by MatSetOption() */
950     break;
951   default:
952     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unknown option %d", op);
953   }
954   PetscFunctionReturn(PETSC_SUCCESS);
955 }
956 
957 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A, Vec v)
958 {
959   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
960   PetscInt     i, j, n, shift;
961   PetscScalar *x, zero = 0.0;
962 
963   PetscFunctionBegin;
964   PetscCall(VecGetLocalSize(v, &n));
965   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
966 
967   if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) {
968     PetscInt *diag = a->diag;
969     PetscCall(VecGetArray(v, &x));
970     for (i = 0; i < n; i++) x[i] = 1.0 / a->val[diag[i]];
971     PetscCall(VecRestoreArray(v, &x));
972     PetscFunctionReturn(PETSC_SUCCESS);
973   }
974 
975   PetscCall(VecSet(v, zero));
976   PetscCall(VecGetArray(v, &x));
977   for (i = 0; i < n; i++) {                                     /* loop over rows */
978     shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
979     x[i]  = 0;
980     for (j = 0; j < a->rlen[i]; j++) {
981       if (a->colidx[shift + a->sliceheight * j] == i) {
982         x[i] = a->val[shift + a->sliceheight * j];
983         break;
984       }
985     }
986   }
987   PetscCall(VecRestoreArray(v, &x));
988   PetscFunctionReturn(PETSC_SUCCESS);
989 }
990 
991 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A, Vec ll, Vec rr)
992 {
993   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
994   const PetscScalar *l, *r;
995   PetscInt           i, j, m, n, row;
996 
997   PetscFunctionBegin;
998   if (ll) {
999     /* The local size is used so that VecMPI can be passed to this routine
1000        by MatDiagonalScale_MPISELL */
1001     PetscCall(VecGetLocalSize(ll, &m));
1002     PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1003     PetscCall(VecGetArrayRead(ll, &l));
1004     for (i = 0; i < a->totalslices; i++) {                            /* loop over slices */
1005       if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */
1006         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) {
1007           if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= l[a->sliceheight * i + row];
1008         }
1009       } else {
1010         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) { a->val[j] *= l[a->sliceheight * i + row]; }
1011       }
1012     }
1013     PetscCall(VecRestoreArrayRead(ll, &l));
1014     PetscCall(PetscLogFlops(a->nz));
1015   }
1016   if (rr) {
1017     PetscCall(VecGetLocalSize(rr, &n));
1018     PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
1019     PetscCall(VecGetArrayRead(rr, &r));
1020     for (i = 0; i < a->totalslices; i++) {                            /* loop over slices */
1021       if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */
1022         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) % a->sliceheight)) {
1023           if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= r[a->colidx[j]];
1024         }
1025       } else {
1026         for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j++) a->val[j] *= r[a->colidx[j]];
1027       }
1028     }
1029     PetscCall(VecRestoreArrayRead(rr, &r));
1030     PetscCall(PetscLogFlops(a->nz));
1031   }
1032   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1033 #if defined(PETSC_HAVE_CUDA)
1034   if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU;
1035 #endif
1036   PetscFunctionReturn(PETSC_SUCCESS);
1037 }
1038 
1039 PetscErrorCode MatGetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[])
1040 {
1041   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1042   PetscInt    *cp, i, k, low, high, t, row, col, l;
1043   PetscInt     shift;
1044   MatScalar   *vp;
1045 
1046   PetscFunctionBegin;
1047   for (k = 0; k < m; k++) { /* loop over requested rows */
1048     row = im[k];
1049     if (row < 0) continue;
1050     PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
1051     shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight); /* starting index of the row */
1052     cp    = a->colidx + shift;                                        /* pointer to the row */
1053     vp    = a->val + shift;                                           /* pointer to the row */
1054     for (l = 0; l < n; l++) {                                         /* loop over requested columns */
1055       col = in[l];
1056       if (col < 0) continue;
1057       PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
1058       high = a->rlen[row];
1059       low  = 0; /* assume unsorted */
1060       while (high - low > 5) {
1061         t = (low + high) / 2;
1062         if (*(cp + a->sliceheight * t) > col) high = t;
1063         else low = t;
1064       }
1065       for (i = low; i < high; i++) {
1066         if (*(cp + a->sliceheight * i) > col) break;
1067         if (*(cp + a->sliceheight * i) == col) {
1068           *v++ = *(vp + a->sliceheight * i);
1069           goto finished;
1070         }
1071       }
1072       *v++ = 0.0;
1073     finished:;
1074     }
1075   }
1076   PetscFunctionReturn(PETSC_SUCCESS);
1077 }
1078 
1079 static PetscErrorCode MatView_SeqSELL_ASCII(Mat A, PetscViewer viewer)
1080 {
1081   Mat_SeqSELL      *a = (Mat_SeqSELL *)A->data;
1082   PetscInt          i, j, m = A->rmap->n, shift;
1083   const char       *name;
1084   PetscViewerFormat format;
1085 
1086   PetscFunctionBegin;
1087   PetscCall(PetscViewerGetFormat(viewer, &format));
1088   if (format == PETSC_VIEWER_ASCII_MATLAB) {
1089     PetscInt nofinalvalue = 0;
1090     /*
1091     if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) {
1092       nofinalvalue = 1;
1093     }
1094     */
1095     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1096     PetscCall(PetscViewerASCIIPrintf(viewer, "%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n", m, A->cmap->n));
1097     PetscCall(PetscViewerASCIIPrintf(viewer, "%% Nonzeros = %" PetscInt_FMT " \n", a->nz));
1098 #if defined(PETSC_USE_COMPLEX)
1099     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",4);\n", a->nz + nofinalvalue));
1100 #else
1101     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",3);\n", a->nz + nofinalvalue));
1102 #endif
1103     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = [\n"));
1104 
1105     for (i = 0; i < m; i++) {
1106       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1107       for (j = 0; j < a->rlen[i]; j++) {
1108 #if defined(PETSC_USE_COMPLEX)
1109         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1110 #else
1111         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)a->val[shift + a->sliceheight * j]));
1112 #endif
1113       }
1114     }
1115     /*
1116     if (nofinalvalue) {
1117 #if defined(PETSC_USE_COMPLEX)
1118       PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n",m,A->cmap->n,0.,0.));
1119 #else
1120       PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n",m,A->cmap->n,0.0));
1121 #endif
1122     }
1123     */
1124     PetscCall(PetscObjectGetName((PetscObject)A, &name));
1125     PetscCall(PetscViewerASCIIPrintf(viewer, "];\n %s = spconvert(zzz);\n", name));
1126     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1127   } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) {
1128     PetscFunctionReturn(PETSC_SUCCESS);
1129   } else if (format == PETSC_VIEWER_ASCII_COMMON) {
1130     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1131     for (i = 0; i < m; i++) {
1132       PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1133       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1134       for (j = 0; j < a->rlen[i]; j++) {
1135 #if defined(PETSC_USE_COMPLEX)
1136         if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) {
1137           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1138         } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) {
1139           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1140         } else if (PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) {
1141           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j])));
1142         }
1143 #else
1144         if (a->val[shift + a->sliceheight * j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j]));
1145 #endif
1146       }
1147       PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1148     }
1149     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1150   } else if (format == PETSC_VIEWER_ASCII_DENSE) {
1151     PetscInt    cnt = 0, jcnt;
1152     PetscScalar value;
1153 #if defined(PETSC_USE_COMPLEX)
1154     PetscBool realonly = PETSC_TRUE;
1155     for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1156       if (PetscImaginaryPart(a->val[i]) != 0.0) {
1157         realonly = PETSC_FALSE;
1158         break;
1159       }
1160     }
1161 #endif
1162 
1163     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1164     for (i = 0; i < m; i++) {
1165       jcnt  = 0;
1166       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1167       for (j = 0; j < A->cmap->n; j++) {
1168         if (jcnt < a->rlen[i] && j == a->colidx[shift + a->sliceheight * j]) {
1169           value = a->val[cnt++];
1170           jcnt++;
1171         } else {
1172           value = 0.0;
1173         }
1174 #if defined(PETSC_USE_COMPLEX)
1175         if (realonly) {
1176           PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)PetscRealPart(value)));
1177         } else {
1178           PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e+%7.5e i ", (double)PetscRealPart(value), (double)PetscImaginaryPart(value)));
1179         }
1180 #else
1181         PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)value));
1182 #endif
1183       }
1184       PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1185     }
1186     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1187   } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) {
1188     PetscInt fshift = 1;
1189     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1190 #if defined(PETSC_USE_COMPLEX)
1191     PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate complex general\n"));
1192 #else
1193     PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate real general\n"));
1194 #endif
1195     PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz));
1196     for (i = 0; i < m; i++) {
1197       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1198       for (j = 0; j < a->rlen[i]; j++) {
1199 #if defined(PETSC_USE_COMPLEX)
1200         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1201 #else
1202         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)a->val[shift + a->sliceheight * j]));
1203 #endif
1204       }
1205     }
1206     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1207   } else if (format == PETSC_VIEWER_NATIVE) {
1208     for (i = 0; i < a->totalslices; i++) { /* loop over slices */
1209       PetscInt row;
1210       PetscCall(PetscViewerASCIIPrintf(viewer, "slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n", i, a->sliidx[i], a->sliidx[i + 1]));
1211       for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) {
1212 #if defined(PETSC_USE_COMPLEX)
1213         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1214           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1215         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1216           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), -(double)PetscImaginaryPart(a->val[j])));
1217         } else {
1218           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j])));
1219         }
1220 #else
1221         PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)a->val[j]));
1222 #endif
1223       }
1224     }
1225   } else {
1226     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1227     if (A->factortype) {
1228       for (i = 0; i < m; i++) {
1229         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1230         PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1231         /* L part */
1232         for (j = shift; j < a->diag[i]; j += a->sliceheight) {
1233 #if defined(PETSC_USE_COMPLEX)
1234           if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0) {
1235             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1236           } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0) {
1237             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j]))));
1238           } else {
1239             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j])));
1240           }
1241 #else
1242           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]));
1243 #endif
1244         }
1245         /* diagonal */
1246         j = a->diag[i];
1247 #if defined(PETSC_USE_COMPLEX)
1248         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1249           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)PetscImaginaryPart(1.0 / a->val[j])));
1250         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1251           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)(-PetscImaginaryPart(1.0 / a->val[j]))));
1252         } else {
1253           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j])));
1254         }
1255 #else
1256         PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)(1.0 / a->val[j])));
1257 #endif
1258 
1259         /* U part */
1260         for (j = a->diag[i] + 1; j < shift + a->sliceheight * a->rlen[i]; j += a->sliceheight) {
1261 #if defined(PETSC_USE_COMPLEX)
1262           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1263             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1264           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1265             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j]))));
1266           } else {
1267             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j])));
1268           }
1269 #else
1270           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]));
1271 #endif
1272         }
1273         PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1274       }
1275     } else {
1276       for (i = 0; i < m; i++) {
1277         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1278         PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1279         for (j = 0; j < a->rlen[i]; j++) {
1280 #if defined(PETSC_USE_COMPLEX)
1281           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1282             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1283           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1284             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1285           } else {
1286             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j])));
1287           }
1288 #else
1289           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j]));
1290 #endif
1291         }
1292         PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1293       }
1294     }
1295     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1296   }
1297   PetscCall(PetscViewerFlush(viewer));
1298   PetscFunctionReturn(PETSC_SUCCESS);
1299 }
1300 
1301 #include <petscdraw.h>
1302 static PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw, void *Aa)
1303 {
1304   Mat               A = (Mat)Aa;
1305   Mat_SeqSELL      *a = (Mat_SeqSELL *)A->data;
1306   PetscInt          i, j, m = A->rmap->n, shift;
1307   int               color;
1308   PetscReal         xl, yl, xr, yr, x_l, x_r, y_l, y_r;
1309   PetscViewer       viewer;
1310   PetscViewerFormat format;
1311 
1312   PetscFunctionBegin;
1313   PetscCall(PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer));
1314   PetscCall(PetscViewerGetFormat(viewer, &format));
1315   PetscCall(PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr));
1316 
1317   /* loop over matrix elements drawing boxes */
1318 
1319   if (format != PETSC_VIEWER_DRAW_CONTOUR) {
1320     PetscDrawCollectiveBegin(draw);
1321     /* Blue for negative, Cyan for zero and  Red for positive */
1322     color = PETSC_DRAW_BLUE;
1323     for (i = 0; i < m; i++) {
1324       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1325       y_l   = m - i - 1.0;
1326       y_r   = y_l + 1.0;
1327       for (j = 0; j < a->rlen[i]; j++) {
1328         x_l = a->colidx[shift + a->sliceheight * j];
1329         x_r = x_l + 1.0;
1330         if (PetscRealPart(a->val[shift + a->sliceheight * j]) >= 0.) continue;
1331         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1332       }
1333     }
1334     color = PETSC_DRAW_CYAN;
1335     for (i = 0; i < m; i++) {
1336       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1337       y_l   = m - i - 1.0;
1338       y_r   = y_l + 1.0;
1339       for (j = 0; j < a->rlen[i]; j++) {
1340         x_l = a->colidx[shift + a->sliceheight * j];
1341         x_r = x_l + 1.0;
1342         if (a->val[shift + a->sliceheight * j] != 0.) continue;
1343         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1344       }
1345     }
1346     color = PETSC_DRAW_RED;
1347     for (i = 0; i < m; i++) {
1348       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1349       y_l   = m - i - 1.0;
1350       y_r   = y_l + 1.0;
1351       for (j = 0; j < a->rlen[i]; j++) {
1352         x_l = a->colidx[shift + a->sliceheight * j];
1353         x_r = x_l + 1.0;
1354         if (PetscRealPart(a->val[shift + a->sliceheight * j]) <= 0.) continue;
1355         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1356       }
1357     }
1358     PetscDrawCollectiveEnd(draw);
1359   } else {
1360     /* use contour shading to indicate magnitude of values */
1361     /* first determine max of all nonzero values */
1362     PetscReal minv = 0.0, maxv = 0.0;
1363     PetscInt  count = 0;
1364     PetscDraw popup;
1365     for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1366       if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]);
1367     }
1368     if (minv >= maxv) maxv = minv + PETSC_SMALL;
1369     PetscCall(PetscDrawGetPopup(draw, &popup));
1370     PetscCall(PetscDrawScalePopup(popup, minv, maxv));
1371 
1372     PetscDrawCollectiveBegin(draw);
1373     for (i = 0; i < m; i++) {
1374       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1375       y_l   = m - i - 1.0;
1376       y_r   = y_l + 1.0;
1377       for (j = 0; j < a->rlen[i]; j++) {
1378         x_l   = a->colidx[shift + a->sliceheight * j];
1379         x_r   = x_l + 1.0;
1380         color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]), minv, maxv);
1381         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1382         count++;
1383       }
1384     }
1385     PetscDrawCollectiveEnd(draw);
1386   }
1387   PetscFunctionReturn(PETSC_SUCCESS);
1388 }
1389 
1390 #include <petscdraw.h>
1391 static PetscErrorCode MatView_SeqSELL_Draw(Mat A, PetscViewer viewer)
1392 {
1393   PetscDraw draw;
1394   PetscReal xr, yr, xl, yl, h, w;
1395   PetscBool isnull;
1396 
1397   PetscFunctionBegin;
1398   PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw));
1399   PetscCall(PetscDrawIsNull(draw, &isnull));
1400   if (isnull) PetscFunctionReturn(PETSC_SUCCESS);
1401 
1402   xr = A->cmap->n;
1403   yr = A->rmap->n;
1404   h  = yr / 10.0;
1405   w  = xr / 10.0;
1406   xr += w;
1407   yr += h;
1408   xl = -w;
1409   yl = -h;
1410   PetscCall(PetscDrawSetCoordinates(draw, xl, yl, xr, yr));
1411   PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer));
1412   PetscCall(PetscDrawZoom(draw, MatView_SeqSELL_Draw_Zoom, A));
1413   PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL));
1414   PetscCall(PetscDrawSave(draw));
1415   PetscFunctionReturn(PETSC_SUCCESS);
1416 }
1417 
1418 PetscErrorCode MatView_SeqSELL(Mat A, PetscViewer viewer)
1419 {
1420   PetscBool iascii, isbinary, isdraw;
1421 
1422   PetscFunctionBegin;
1423   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
1424   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
1425   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
1426   if (iascii) {
1427     PetscCall(MatView_SeqSELL_ASCII(A, viewer));
1428   } else if (isbinary) {
1429     /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */
1430   } else if (isdraw) PetscCall(MatView_SeqSELL_Draw(A, viewer));
1431   PetscFunctionReturn(PETSC_SUCCESS);
1432 }
1433 
1434 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A, MatAssemblyType mode)
1435 {
1436   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1437   PetscInt     i, shift, row_in_slice, row, nrow, *cp, lastcol, j, k;
1438   MatScalar   *vp;
1439 #if defined(PETSC_HAVE_CUDA)
1440   PetscInt totalchunks = 0;
1441 #endif
1442 
1443   PetscFunctionBegin;
1444   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
1445   /* To do: compress out the unused elements */
1446   PetscCall(MatMarkDiagonal_SeqSELL(A));
1447   PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n", A->rmap->n, A->cmap->n, a->maxallocmat, a->sliidx[a->totalslices], a->nz, a->sliidx[a->totalslices] - a->nz));
1448   PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n", a->reallocs));
1449   PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rlenmax));
1450   a->nonzerorowcnt = 0;
1451   /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */
1452   for (i = 0; i < a->totalslices; ++i) {
1453     shift = a->sliidx[i];                                                   /* starting index of the slice */
1454     cp    = a->colidx + shift;                                              /* pointer to the column indices of the slice */
1455     vp    = a->val + shift;                                                 /* pointer to the nonzero values of the slice */
1456     for (row_in_slice = 0; row_in_slice < a->sliceheight; ++row_in_slice) { /* loop over rows in the slice */
1457       row  = a->sliceheight * i + row_in_slice;
1458       nrow = a->rlen[row]; /* number of nonzeros in row */
1459       /*
1460         Search for the nearest nonzero. Normally setting the index to zero may cause extra communication.
1461         But if the entire slice are empty, it is fine to use 0 since the index will not be loaded.
1462       */
1463       lastcol = 0;
1464       if (nrow > 0) { /* nonempty row */
1465         a->nonzerorowcnt++;
1466         lastcol = cp[a->sliceheight * (nrow - 1) + row_in_slice]; /* use the index from the last nonzero at current row */
1467       } else if (!row_in_slice) {                                 /* first row of the correct slice is empty */
1468         for (j = 1; j < a->sliceheight; j++) {
1469           if (a->rlen[a->sliceheight * i + j]) {
1470             lastcol = cp[j];
1471             break;
1472           }
1473         }
1474       } else {
1475         if (a->sliidx[i + 1] != shift) lastcol = cp[row_in_slice - 1]; /* use the index from the previous row */
1476       }
1477 
1478       for (k = nrow; k < (a->sliidx[i + 1] - shift) / a->sliceheight; ++k) {
1479         cp[a->sliceheight * k + row_in_slice] = lastcol;
1480         vp[a->sliceheight * k + row_in_slice] = (MatScalar)0;
1481       }
1482     }
1483   }
1484 
1485   A->info.mallocs += a->reallocs;
1486   a->reallocs = 0;
1487 
1488   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1489 #if defined(PETSC_HAVE_CUDA)
1490   if (!a->chunksize && a->totalslices) {
1491     a->chunksize = 64;
1492     while (a->chunksize < 1024 && 2 * a->chunksize <= a->sliidx[a->totalslices] / a->totalslices) a->chunksize *= 2;
1493     totalchunks = 1 + (a->sliidx[a->totalslices] - 1) / a->chunksize;
1494   }
1495   if (totalchunks != a->totalchunks) {
1496     PetscCall(PetscFree(a->chunk_slice_map));
1497     PetscCall(PetscMalloc1(totalchunks, &a->chunk_slice_map));
1498     a->totalchunks = totalchunks;
1499   }
1500   j = 0;
1501   for (i = 0; i < totalchunks; i++) {
1502     while (a->sliidx[j + 1] <= i * a->chunksize && j < a->totalslices) j++;
1503     a->chunk_slice_map[i] = j;
1504   }
1505 #endif
1506   PetscFunctionReturn(PETSC_SUCCESS);
1507 }
1508 
1509 PetscErrorCode MatGetInfo_SeqSELL(Mat A, MatInfoType flag, MatInfo *info)
1510 {
1511   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1512 
1513   PetscFunctionBegin;
1514   info->block_size   = 1.0;
1515   info->nz_allocated = a->maxallocmat;
1516   info->nz_used      = a->sliidx[a->totalslices]; /* include padding zeros */
1517   info->nz_unneeded  = (a->maxallocmat - a->sliidx[a->totalslices]);
1518   info->assemblies   = A->num_ass;
1519   info->mallocs      = A->info.mallocs;
1520   info->memory       = 0; /* REVIEW ME */
1521   if (A->factortype) {
1522     info->fill_ratio_given  = A->info.fill_ratio_given;
1523     info->fill_ratio_needed = A->info.fill_ratio_needed;
1524     info->factor_mallocs    = A->info.factor_mallocs;
1525   } else {
1526     info->fill_ratio_given  = 0;
1527     info->fill_ratio_needed = 0;
1528     info->factor_mallocs    = 0;
1529   }
1530   PetscFunctionReturn(PETSC_SUCCESS);
1531 }
1532 
1533 PetscErrorCode MatSetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is)
1534 {
1535   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1536   PetscInt     shift, i, k, l, low, high, t, ii, row, col, nrow;
1537   PetscInt    *cp, nonew = a->nonew, lastcol = -1;
1538   MatScalar   *vp, value;
1539 #if defined(PETSC_HAVE_CUDA)
1540   PetscBool inserted = PETSC_FALSE;
1541   PetscInt  mul      = DEVICE_MEM_ALIGN / a->sliceheight;
1542 #endif
1543 
1544   PetscFunctionBegin;
1545   for (k = 0; k < m; k++) { /* loop over added rows */
1546     row = im[k];
1547     if (row < 0) continue;
1548     PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
1549     shift = a->sliidx[row / a->sliceheight] + row % a->sliceheight; /* starting index of the row */
1550     cp    = a->colidx + shift;                                      /* pointer to the row */
1551     vp    = a->val + shift;                                         /* pointer to the row */
1552     nrow  = a->rlen[row];
1553     low   = 0;
1554     high  = nrow;
1555 
1556     for (l = 0; l < n; l++) { /* loop over added columns */
1557       col = in[l];
1558       if (col < 0) continue;
1559       PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
1560       if (a->roworiented) {
1561         value = v[l + k * n];
1562       } else {
1563         value = v[k + l * m];
1564       }
1565       if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue;
1566 
1567       /* search in this row for the specified column, i indicates the column to be set */
1568       if (col <= lastcol) low = 0;
1569       else high = nrow;
1570       lastcol = col;
1571       while (high - low > 5) {
1572         t = (low + high) / 2;
1573         if (*(cp + a->sliceheight * t) > col) high = t;
1574         else low = t;
1575       }
1576       for (i = low; i < high; i++) {
1577         if (*(cp + a->sliceheight * i) > col) break;
1578         if (*(cp + a->sliceheight * i) == col) {
1579           if (is == ADD_VALUES) *(vp + a->sliceheight * i) += value;
1580           else *(vp + a->sliceheight * i) = value;
1581 #if defined(PETSC_HAVE_CUDA)
1582           inserted = PETSC_TRUE;
1583 #endif
1584           low = i + 1;
1585           goto noinsert;
1586         }
1587       }
1588       if (value == 0.0 && a->ignorezeroentries) goto noinsert;
1589       if (nonew == 1) goto noinsert;
1590       PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col);
1591 #if defined(PETSC_HAVE_CUDA)
1592       MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar, mul);
1593 #else
1594       /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */
1595       MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar, 1);
1596 #endif
1597       /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */
1598       for (ii = nrow - 1; ii >= i; ii--) {
1599         *(cp + a->sliceheight * (ii + 1)) = *(cp + a->sliceheight * ii);
1600         *(vp + a->sliceheight * (ii + 1)) = *(vp + a->sliceheight * ii);
1601       }
1602       a->rlen[row]++;
1603       *(cp + a->sliceheight * i) = col;
1604       *(vp + a->sliceheight * i) = value;
1605       a->nz++;
1606       A->nonzerostate++;
1607 #if defined(PETSC_HAVE_CUDA)
1608       inserted = PETSC_TRUE;
1609 #endif
1610       low = i + 1;
1611       high++;
1612       nrow++;
1613     noinsert:;
1614     }
1615     a->rlen[row] = nrow;
1616   }
1617 #if defined(PETSC_HAVE_CUDA)
1618   if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && inserted) A->offloadmask = PETSC_OFFLOAD_CPU;
1619 #endif
1620   PetscFunctionReturn(PETSC_SUCCESS);
1621 }
1622 
1623 PetscErrorCode MatCopy_SeqSELL(Mat A, Mat B, MatStructure str)
1624 {
1625   PetscFunctionBegin;
1626   /* If the two matrices have the same copy implementation, use fast copy. */
1627   if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
1628     Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1629     Mat_SeqSELL *b = (Mat_SeqSELL *)B->data;
1630 
1631     PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices], PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Number of nonzeros in two matrices are different");
1632     PetscCall(PetscArraycpy(b->val, a->val, a->sliidx[a->totalslices]));
1633   } else {
1634     PetscCall(MatCopy_Basic(A, B, str));
1635   }
1636   PetscFunctionReturn(PETSC_SUCCESS);
1637 }
1638 
1639 PetscErrorCode MatSetUp_SeqSELL(Mat A)
1640 {
1641   PetscFunctionBegin;
1642   PetscCall(MatSeqSELLSetPreallocation(A, PETSC_DEFAULT, NULL));
1643   PetscFunctionReturn(PETSC_SUCCESS);
1644 }
1645 
1646 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A, PetscScalar *array[])
1647 {
1648   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1649 
1650   PetscFunctionBegin;
1651   *array = a->val;
1652   PetscFunctionReturn(PETSC_SUCCESS);
1653 }
1654 
1655 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A, PetscScalar *array[])
1656 {
1657   PetscFunctionBegin;
1658   PetscFunctionReturn(PETSC_SUCCESS);
1659 }
1660 
1661 PetscErrorCode MatScale_SeqSELL(Mat inA, PetscScalar alpha)
1662 {
1663   Mat_SeqSELL *a      = (Mat_SeqSELL *)inA->data;
1664   MatScalar   *aval   = a->val;
1665   PetscScalar  oalpha = alpha;
1666   PetscBLASInt one    = 1, size;
1667 
1668   PetscFunctionBegin;
1669   PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices], &size));
1670   PetscCallBLAS("BLASscal", BLASscal_(&size, &oalpha, aval, &one));
1671   PetscCall(PetscLogFlops(a->nz));
1672   PetscCall(MatSeqSELLInvalidateDiagonal(inA));
1673 #if defined(PETSC_HAVE_CUDA)
1674   if (inA->offloadmask != PETSC_OFFLOAD_UNALLOCATED) inA->offloadmask = PETSC_OFFLOAD_CPU;
1675 #endif
1676   PetscFunctionReturn(PETSC_SUCCESS);
1677 }
1678 
1679 PetscErrorCode MatShift_SeqSELL(Mat Y, PetscScalar a)
1680 {
1681   Mat_SeqSELL *y = (Mat_SeqSELL *)Y->data;
1682 
1683   PetscFunctionBegin;
1684   if (!Y->preallocated || !y->nz) PetscCall(MatSeqSELLSetPreallocation(Y, 1, NULL));
1685   PetscCall(MatShift_Basic(Y, a));
1686   PetscFunctionReturn(PETSC_SUCCESS);
1687 }
1688 
1689 PetscErrorCode MatSOR_SeqSELL(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
1690 {
1691   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
1692   PetscScalar       *x, sum, *t;
1693   const MatScalar   *idiag = NULL, *mdiag;
1694   const PetscScalar *b, *xb;
1695   PetscInt           n, m = A->rmap->n, i, j, shift;
1696   const PetscInt    *diag;
1697 
1698   PetscFunctionBegin;
1699   its = its * lits;
1700 
1701   if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */
1702   if (!a->idiagvalid) PetscCall(MatInvertDiagonal_SeqSELL(A, omega, fshift));
1703   a->fshift = fshift;
1704   a->omega  = omega;
1705 
1706   diag  = a->diag;
1707   t     = a->ssor_work;
1708   idiag = a->idiag;
1709   mdiag = a->mdiag;
1710 
1711   PetscCall(VecGetArray(xx, &x));
1712   PetscCall(VecGetArrayRead(bb, &b));
1713   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1714   PetscCheck(flag != SOR_APPLY_UPPER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_UPPER is not implemented");
1715   PetscCheck(flag != SOR_APPLY_LOWER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_LOWER is not implemented");
1716   PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat");
1717 
1718   if (flag & SOR_ZERO_INITIAL_GUESS) {
1719     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1720       for (i = 0; i < m; i++) {
1721         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1722         sum   = b[i];
1723         n     = (diag[i] - shift) / a->sliceheight;
1724         for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]];
1725         t[i] = sum;
1726         x[i] = sum * idiag[i];
1727       }
1728       xb = t;
1729       PetscCall(PetscLogFlops(a->nz));
1730     } else xb = b;
1731     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1732       for (i = m - 1; i >= 0; i--) {
1733         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1734         sum   = xb[i];
1735         n     = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1;
1736         for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]];
1737         if (xb == b) {
1738           x[i] = sum * idiag[i];
1739         } else {
1740           x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1741         }
1742       }
1743       PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1744     }
1745     its--;
1746   }
1747   while (its--) {
1748     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1749       for (i = 0; i < m; i++) {
1750         /* lower */
1751         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1752         sum   = b[i];
1753         n     = (diag[i] - shift) / a->sliceheight;
1754         for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]];
1755         t[i] = sum; /* save application of the lower-triangular part */
1756         /* upper */
1757         n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1;
1758         for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]];
1759         x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1760       }
1761       xb = t;
1762       PetscCall(PetscLogFlops(2.0 * a->nz));
1763     } else xb = b;
1764     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1765       for (i = m - 1; i >= 0; i--) {
1766         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1767         sum   = xb[i];
1768         if (xb == b) {
1769           /* whole matrix (no checkpointing available) */
1770           n = a->rlen[i];
1771           for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]];
1772           x[i] = (1. - omega) * x[i] + (sum + mdiag[i] * x[i]) * idiag[i];
1773         } else { /* lower-triangular part has been saved, so only apply upper-triangular */
1774           n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1;
1775           for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]];
1776           x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1777         }
1778       }
1779       if (xb == b) {
1780         PetscCall(PetscLogFlops(2.0 * a->nz));
1781       } else {
1782         PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1783       }
1784     }
1785   }
1786   PetscCall(VecRestoreArray(xx, &x));
1787   PetscCall(VecRestoreArrayRead(bb, &b));
1788   PetscFunctionReturn(PETSC_SUCCESS);
1789 }
1790 
1791 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL,
1792                                        MatGetRow_SeqSELL,
1793                                        MatRestoreRow_SeqSELL,
1794                                        MatMult_SeqSELL,
1795                                        /* 4*/ MatMultAdd_SeqSELL,
1796                                        MatMultTranspose_SeqSELL,
1797                                        MatMultTransposeAdd_SeqSELL,
1798                                        NULL,
1799                                        NULL,
1800                                        NULL,
1801                                        /* 10*/ NULL,
1802                                        NULL,
1803                                        NULL,
1804                                        MatSOR_SeqSELL,
1805                                        NULL,
1806                                        /* 15*/ MatGetInfo_SeqSELL,
1807                                        MatEqual_SeqSELL,
1808                                        MatGetDiagonal_SeqSELL,
1809                                        MatDiagonalScale_SeqSELL,
1810                                        NULL,
1811                                        /* 20*/ NULL,
1812                                        MatAssemblyEnd_SeqSELL,
1813                                        MatSetOption_SeqSELL,
1814                                        MatZeroEntries_SeqSELL,
1815                                        /* 24*/ NULL,
1816                                        NULL,
1817                                        NULL,
1818                                        NULL,
1819                                        NULL,
1820                                        /* 29*/ MatSetUp_SeqSELL,
1821                                        NULL,
1822                                        NULL,
1823                                        NULL,
1824                                        NULL,
1825                                        /* 34*/ MatDuplicate_SeqSELL,
1826                                        NULL,
1827                                        NULL,
1828                                        NULL,
1829                                        NULL,
1830                                        /* 39*/ NULL,
1831                                        NULL,
1832                                        NULL,
1833                                        MatGetValues_SeqSELL,
1834                                        MatCopy_SeqSELL,
1835                                        /* 44*/ NULL,
1836                                        MatScale_SeqSELL,
1837                                        MatShift_SeqSELL,
1838                                        NULL,
1839                                        NULL,
1840                                        /* 49*/ NULL,
1841                                        NULL,
1842                                        NULL,
1843                                        NULL,
1844                                        NULL,
1845                                        /* 54*/ MatFDColoringCreate_SeqXAIJ,
1846                                        NULL,
1847                                        NULL,
1848                                        NULL,
1849                                        NULL,
1850                                        /* 59*/ NULL,
1851                                        MatDestroy_SeqSELL,
1852                                        MatView_SeqSELL,
1853                                        NULL,
1854                                        NULL,
1855                                        /* 64*/ NULL,
1856                                        NULL,
1857                                        NULL,
1858                                        NULL,
1859                                        NULL,
1860                                        /* 69*/ NULL,
1861                                        NULL,
1862                                        NULL,
1863                                        NULL,
1864                                        NULL,
1865                                        /* 74*/ NULL,
1866                                        MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */
1867                                        NULL,
1868                                        NULL,
1869                                        NULL,
1870                                        /* 79*/ NULL,
1871                                        NULL,
1872                                        NULL,
1873                                        NULL,
1874                                        NULL,
1875                                        /* 84*/ NULL,
1876                                        NULL,
1877                                        NULL,
1878                                        NULL,
1879                                        NULL,
1880                                        /* 89*/ NULL,
1881                                        NULL,
1882                                        NULL,
1883                                        NULL,
1884                                        NULL,
1885                                        /* 94*/ NULL,
1886                                        NULL,
1887                                        NULL,
1888                                        NULL,
1889                                        NULL,
1890                                        /* 99*/ NULL,
1891                                        NULL,
1892                                        NULL,
1893                                        MatConjugate_SeqSELL,
1894                                        NULL,
1895                                        /*104*/ NULL,
1896                                        NULL,
1897                                        NULL,
1898                                        NULL,
1899                                        NULL,
1900                                        /*109*/ NULL,
1901                                        NULL,
1902                                        NULL,
1903                                        NULL,
1904                                        MatMissingDiagonal_SeqSELL,
1905                                        /*114*/ NULL,
1906                                        NULL,
1907                                        NULL,
1908                                        NULL,
1909                                        NULL,
1910                                        /*119*/ NULL,
1911                                        NULL,
1912                                        NULL,
1913                                        NULL,
1914                                        NULL,
1915                                        /*124*/ NULL,
1916                                        NULL,
1917                                        NULL,
1918                                        NULL,
1919                                        NULL,
1920                                        /*129*/ NULL,
1921                                        NULL,
1922                                        NULL,
1923                                        NULL,
1924                                        NULL,
1925                                        /*134*/ NULL,
1926                                        NULL,
1927                                        NULL,
1928                                        NULL,
1929                                        NULL,
1930                                        /*139*/ NULL,
1931                                        NULL,
1932                                        NULL,
1933                                        MatFDColoringSetUp_SeqXAIJ,
1934                                        NULL,
1935                                        /*144*/ NULL,
1936                                        NULL,
1937                                        NULL,
1938                                        NULL,
1939                                        NULL,
1940                                        NULL,
1941                                        /*150*/ NULL,
1942                                        NULL};
1943 
1944 static PetscErrorCode MatStoreValues_SeqSELL(Mat mat)
1945 {
1946   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1947 
1948   PetscFunctionBegin;
1949   PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1950 
1951   /* allocate space for values if not already there */
1952   if (!a->saved_values) PetscCall(PetscMalloc1(a->sliidx[a->totalslices] + 1, &a->saved_values));
1953 
1954   /* copy values over */
1955   PetscCall(PetscArraycpy(a->saved_values, a->val, a->sliidx[a->totalslices]));
1956   PetscFunctionReturn(PETSC_SUCCESS);
1957 }
1958 
1959 static PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat)
1960 {
1961   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1962 
1963   PetscFunctionBegin;
1964   PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1965   PetscCheck(a->saved_values, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatStoreValues(A);first");
1966   PetscCall(PetscArraycpy(a->val, a->saved_values, a->sliidx[a->totalslices]));
1967   PetscFunctionReturn(PETSC_SUCCESS);
1968 }
1969 
1970 static PetscErrorCode MatSeqSELLGetFillRatio_SeqSELL(Mat mat, PetscReal *ratio)
1971 {
1972   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1973 
1974   PetscFunctionBegin;
1975   if (a->totalslices && a->sliidx[a->totalslices]) {
1976     *ratio = (PetscReal)(a->sliidx[a->totalslices] - a->nz) / a->sliidx[a->totalslices];
1977   } else {
1978     *ratio = 0.0;
1979   }
1980   PetscFunctionReturn(PETSC_SUCCESS);
1981 }
1982 
1983 static PetscErrorCode MatSeqSELLGetMaxSliceWidth_SeqSELL(Mat mat, PetscInt *slicewidth)
1984 {
1985   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1986   PetscInt     i, current_slicewidth;
1987 
1988   PetscFunctionBegin;
1989   *slicewidth = 0;
1990   for (i = 0; i < a->totalslices; i++) {
1991     current_slicewidth = (a->sliidx[i + 1] - a->sliidx[i]) / a->sliceheight;
1992     if (current_slicewidth > *slicewidth) *slicewidth = current_slicewidth;
1993   }
1994   PetscFunctionReturn(PETSC_SUCCESS);
1995 }
1996 
1997 static PetscErrorCode MatSeqSELLGetAvgSliceWidth_SeqSELL(Mat mat, PetscReal *slicewidth)
1998 {
1999   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
2000 
2001   PetscFunctionBegin;
2002   *slicewidth = 0;
2003   if (a->totalslices) { *slicewidth = (PetscReal)a->sliidx[a->totalslices] / a->sliceheight / a->totalslices; }
2004   PetscFunctionReturn(PETSC_SUCCESS);
2005 }
2006 
2007 static PetscErrorCode MatSeqSELLGetVarSliceSize_SeqSELL(Mat mat, PetscReal *variance)
2008 {
2009   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
2010   PetscReal    mean;
2011   PetscInt     i, totalslices = a->totalslices, *sliidx = a->sliidx;
2012 
2013   PetscFunctionBegin;
2014   *variance = 0;
2015   if (totalslices) {
2016     mean = (PetscReal)sliidx[totalslices] / totalslices;
2017     for (i = 1; i <= totalslices; i++) { *variance += ((PetscReal)(sliidx[i] - sliidx[i - 1]) - mean) * ((PetscReal)(sliidx[i] - sliidx[i - 1]) - mean) / totalslices; }
2018   }
2019   PetscFunctionReturn(PETSC_SUCCESS);
2020 }
2021 
2022 static PetscErrorCode MatSeqSELLSetSliceHeight_SeqSELL(Mat A, PetscInt sliceheight)
2023 {
2024   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2025 
2026   PetscFunctionBegin;
2027   if (A->preallocated) PetscFunctionReturn(PETSC_SUCCESS);
2028   PetscCheck(a->sliceheight <= 0 || a->sliceheight == sliceheight, PETSC_COMM_SELF, PETSC_ERR_SUP, "Cannot change slice height %" PetscInt_FMT " to %" PetscInt_FMT, a->sliceheight, sliceheight);
2029   a->sliceheight = sliceheight;
2030 #if defined(PETSC_HAVE_CUDA)
2031   PetscCheck(DEVICE_MEM_ALIGN % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "DEVICE_MEM_ALIGN is not divisible by the slice height %" PetscInt_FMT, sliceheight);
2032 #endif
2033   PetscFunctionReturn(PETSC_SUCCESS);
2034 }
2035 
2036 /*@C
2037   MatSeqSELLGetFillRatio - returns a ratio that indicates the irregularity of the matrix.
2038 
2039   Not Collective
2040 
2041   Input Parameter:
2042 . A - a MATSEQSELL matrix
2043 
2044   Output Parameter:
2045 . ratio - ratio of number of padded zeros to number of allocated elements
2046 
2047   Level: intermediate
2048 
2049 .seealso: `MATSEQSELL`, `MatSeqSELLGetAvgSliceWidth()`
2050 @*/
2051 PetscErrorCode MatSeqSELLGetFillRatio(Mat A, PetscReal *ratio)
2052 {
2053   PetscFunctionBegin;
2054   PetscUseMethod(A, "MatSeqSELLGetFillRatio_C", (Mat, PetscReal *), (A, ratio));
2055   PetscFunctionReturn(PETSC_SUCCESS);
2056 }
2057 
2058 /*@C
2059   MatSeqSELLGetMaxSliceWidth - returns the maximum slice width.
2060 
2061   Not Collective
2062 
2063   Input Parameter:
2064 . A - a MATSEQSELL matrix
2065 
2066   Output Parameter:
2067 . slicewidth - maximum slice width
2068 
2069   Level: intermediate
2070 
2071 .seealso: `MATSEQSELL`, `MatSeqSELLGetAvgSliceWidth()`
2072 @*/
2073 PetscErrorCode MatSeqSELLGetMaxSliceWidth(Mat A, PetscInt *slicewidth)
2074 {
2075   PetscFunctionBegin;
2076   PetscUseMethod(A, "MatSeqSELLGetMaxSliceWidth_C", (Mat, PetscInt *), (A, slicewidth));
2077   PetscFunctionReturn(PETSC_SUCCESS);
2078 }
2079 
2080 /*@C
2081   MatSeqSELLGetAvgSliceWidth - returns the average slice width.
2082 
2083   Not Collective
2084 
2085   Input Parameter:
2086 . A - a MATSEQSELL matrix
2087 
2088   Output Parameter:
2089 . slicewidth - average slice width
2090 
2091   Level: intermediate
2092 
2093 .seealso: `MATSEQSELL`, `MatSeqSELLGetMaxSliceWidth()`
2094 @*/
2095 PetscErrorCode MatSeqSELLGetAvgSliceWidth(Mat A, PetscReal *slicewidth)
2096 {
2097   PetscFunctionBegin;
2098   PetscUseMethod(A, "MatSeqSELLGetAvgSliceWidth_C", (Mat, PetscReal *), (A, slicewidth));
2099   PetscFunctionReturn(PETSC_SUCCESS);
2100 }
2101 
2102 /*@C
2103   MatSeqSELLSetSliceHeight - sets the slice height.
2104 
2105   Not Collective
2106 
2107   Input Parameters:
2108 + A           - a MATSEQSELL matrix
2109 - sliceheight - slice height
2110 
2111   Notes:
2112   You cannot change the slice height once it have been set.
2113 
2114   The slice height must be set before MatSetUp() or MatXXXSetPreallocation() is called.
2115 
2116   Level: intermediate
2117 
2118 .seealso: `MATSEQSELL`, `MatSeqSELLGetVarSliceSize()`
2119 @*/
2120 PetscErrorCode MatSeqSELLSetSliceHeight(Mat A, PetscInt sliceheight)
2121 {
2122   PetscFunctionBegin;
2123   PetscUseMethod(A, "MatSeqSELLSetSliceHeight_C", (Mat, PetscInt), (A, sliceheight));
2124   PetscFunctionReturn(PETSC_SUCCESS);
2125 }
2126 
2127 /*@C
2128   MatSeqSELLGetVarSliceSize - returns the variance of the slice size.
2129 
2130   Not Collective
2131 
2132   Input Parameter:
2133 . A - a MATSEQSELL matrix
2134 
2135   Output Parameter:
2136 . variance - variance of the slice size
2137 
2138   Level: intermediate
2139 
2140 .seealso: `MATSEQSELL`, `MatSeqSELLSetSliceHeight()`
2141 @*/
2142 PetscErrorCode MatSeqSELLGetVarSliceSize(Mat A, PetscReal *variance)
2143 {
2144   PetscFunctionBegin;
2145   PetscUseMethod(A, "MatSeqSELLGetVarSliceSize_C", (Mat, PetscReal *), (A, variance));
2146   PetscFunctionReturn(PETSC_SUCCESS);
2147 }
2148 
2149 #if defined(PETSC_HAVE_CUDA)
2150 PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat);
2151 #endif
2152 
2153 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B)
2154 {
2155   Mat_SeqSELL *b;
2156   PetscMPIInt  size;
2157 
2158   PetscFunctionBegin;
2159   PetscCall(PetscCitationsRegister(citation, &cited));
2160   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
2161   PetscCheck(size <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Comm must be of size 1");
2162 
2163   PetscCall(PetscNew(&b));
2164 
2165   B->data   = (void *)b;
2166   B->ops[0] = MatOps_Values;
2167 
2168   b->row                = NULL;
2169   b->col                = NULL;
2170   b->icol               = NULL;
2171   b->reallocs           = 0;
2172   b->ignorezeroentries  = PETSC_FALSE;
2173   b->roworiented        = PETSC_TRUE;
2174   b->nonew              = 0;
2175   b->diag               = NULL;
2176   b->solve_work         = NULL;
2177   B->spptr              = NULL;
2178   b->saved_values       = NULL;
2179   b->idiag              = NULL;
2180   b->mdiag              = NULL;
2181   b->ssor_work          = NULL;
2182   b->omega              = 1.0;
2183   b->fshift             = 0.0;
2184   b->idiagvalid         = PETSC_FALSE;
2185   b->keepnonzeropattern = PETSC_FALSE;
2186   b->sliceheight        = 0;
2187 
2188   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELL));
2189   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetArray_C", MatSeqSELLGetArray_SeqSELL));
2190   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLRestoreArray_C", MatSeqSELLRestoreArray_SeqSELL));
2191   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqSELL));
2192   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqSELL));
2193   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetPreallocation_C", MatSeqSELLSetPreallocation_SeqSELL));
2194   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqaij_C", MatConvert_SeqSELL_SeqAIJ));
2195 #if defined(PETSC_HAVE_CUDA)
2196   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqsellcuda_C", MatConvert_SeqSELL_SeqSELLCUDA));
2197 #endif
2198   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetFillRatio_C", MatSeqSELLGetFillRatio_SeqSELL));
2199   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetMaxSliceWidth_C", MatSeqSELLGetMaxSliceWidth_SeqSELL));
2200   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetAvgSliceWidth_C", MatSeqSELLGetAvgSliceWidth_SeqSELL));
2201   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetVarSliceSize_C", MatSeqSELLGetVarSliceSize_SeqSELL));
2202   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetSliceHeight_C", MatSeqSELLSetSliceHeight_SeqSELL));
2203 
2204   PetscObjectOptionsBegin((PetscObject)B);
2205   {
2206     PetscInt  newsh = -1;
2207     PetscBool flg;
2208 #if defined(PETSC_HAVE_CUDA)
2209     PetscInt chunksize = 0;
2210 #endif
2211 
2212     PetscCall(PetscOptionsInt("-mat_sell_slice_height", "Set the slice height used to store SELL matrix", "MatSELLSetSliceHeight", newsh, &newsh, &flg));
2213     if (flg) { PetscCall(MatSeqSELLSetSliceHeight(B, newsh)); }
2214 #if defined(PETSC_HAVE_CUDA)
2215     PetscCall(PetscOptionsInt("-mat_sell_chunk_size", "Set the chunksize for load-balanced CUDA kernels. Choices include 64,128,256,512,1024", NULL, chunksize, &chunksize, &flg));
2216     if (flg) {
2217       PetscCheck(chunksize >= 64 && chunksize <= 1024 && chunksize % 64 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "chunksize must be a number in {64,128,256,512,1024}: value %" PetscInt_FMT, chunksize);
2218       b->chunksize = chunksize;
2219     }
2220 #endif
2221   }
2222   PetscOptionsEnd();
2223   PetscFunctionReturn(PETSC_SUCCESS);
2224 }
2225 
2226 /*
2227  Given a matrix generated with MatGetFactor() duplicates all the information in A into B
2228  */
2229 static PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace)
2230 {
2231   Mat_SeqSELL *c = (Mat_SeqSELL *)C->data, *a = (Mat_SeqSELL *)A->data;
2232   PetscInt     i, m                           = A->rmap->n;
2233   PetscInt     totalslices = a->totalslices;
2234 
2235   PetscFunctionBegin;
2236   C->factortype = A->factortype;
2237   c->row        = NULL;
2238   c->col        = NULL;
2239   c->icol       = NULL;
2240   c->reallocs   = 0;
2241   C->assembled  = PETSC_TRUE;
2242 
2243   PetscCall(PetscLayoutReference(A->rmap, &C->rmap));
2244   PetscCall(PetscLayoutReference(A->cmap, &C->cmap));
2245 
2246   PetscCall(PetscMalloc1(a->sliceheight * totalslices, &c->rlen));
2247   PetscCall(PetscMalloc1(totalslices + 1, &c->sliidx));
2248 
2249   for (i = 0; i < m; i++) c->rlen[i] = a->rlen[i];
2250   for (i = 0; i < totalslices + 1; i++) c->sliidx[i] = a->sliidx[i];
2251 
2252   /* allocate the matrix space */
2253   if (mallocmatspace) {
2254     PetscCall(PetscMalloc2(a->maxallocmat, &c->val, a->maxallocmat, &c->colidx));
2255 
2256     c->singlemalloc = PETSC_TRUE;
2257 
2258     if (m > 0) {
2259       PetscCall(PetscArraycpy(c->colidx, a->colidx, a->maxallocmat));
2260       if (cpvalues == MAT_COPY_VALUES) {
2261         PetscCall(PetscArraycpy(c->val, a->val, a->maxallocmat));
2262       } else {
2263         PetscCall(PetscArrayzero(c->val, a->maxallocmat));
2264       }
2265     }
2266   }
2267 
2268   c->ignorezeroentries = a->ignorezeroentries;
2269   c->roworiented       = a->roworiented;
2270   c->nonew             = a->nonew;
2271   if (a->diag) {
2272     PetscCall(PetscMalloc1(m, &c->diag));
2273     for (i = 0; i < m; i++) c->diag[i] = a->diag[i];
2274   } else c->diag = NULL;
2275 
2276   c->solve_work         = NULL;
2277   c->saved_values       = NULL;
2278   c->idiag              = NULL;
2279   c->ssor_work          = NULL;
2280   c->keepnonzeropattern = a->keepnonzeropattern;
2281   c->free_val           = PETSC_TRUE;
2282   c->free_colidx        = PETSC_TRUE;
2283 
2284   c->maxallocmat  = a->maxallocmat;
2285   c->maxallocrow  = a->maxallocrow;
2286   c->rlenmax      = a->rlenmax;
2287   c->nz           = a->nz;
2288   C->preallocated = PETSC_TRUE;
2289 
2290   c->nonzerorowcnt = a->nonzerorowcnt;
2291   C->nonzerostate  = A->nonzerostate;
2292 
2293   PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist));
2294   PetscFunctionReturn(PETSC_SUCCESS);
2295 }
2296 
2297 PetscErrorCode MatDuplicate_SeqSELL(Mat A, MatDuplicateOption cpvalues, Mat *B)
2298 {
2299   PetscFunctionBegin;
2300   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
2301   PetscCall(MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n));
2302   if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2303   PetscCall(MatSetType(*B, ((PetscObject)A)->type_name));
2304   PetscCall(MatDuplicateNoCreate_SeqSELL(*B, A, cpvalues, PETSC_TRUE));
2305   PetscFunctionReturn(PETSC_SUCCESS);
2306 }
2307 
2308 /*MC
2309    MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices,
2310    based on the sliced Ellpack format
2311 
2312    Options Database Key:
2313 . -mat_type seqsell - sets the matrix type to "`MATSEQELL` during a call to `MatSetFromOptions()`
2314 
2315    Level: beginner
2316 
2317 .seealso: `Mat`, `MatCreateSeqSell()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ`
2318 M*/
2319 
2320 /*MC
2321    MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices.
2322 
2323    This matrix type is identical to `MATSEQSELL` when constructed with a single process communicator,
2324    and `MATMPISELL` otherwise.  As a result, for single process communicators,
2325   `MatSeqSELLSetPreallocation()` is supported, and similarly `MatMPISELLSetPreallocation()` is supported
2326   for communicators controlling multiple processes.  It is recommended that you call both of
2327   the above preallocation routines for simplicity.
2328 
2329    Options Database Key:
2330 . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions()
2331 
2332   Level: beginner
2333 
2334   Notes:
2335    This format is only supported for real scalars, double precision, and 32-bit indices (the defaults).
2336 
2337    It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of
2338    non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement.
2339 
2340   Developer Notes:
2341    On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance.
2342 
2343    The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8
2344 .vb
2345                             (2 0  3 4)
2346    Consider the matrix A =  (5 0  6 0)
2347                             (0 0  7 8)
2348                             (0 0  9 9)
2349 
2350    symbolically the Ellpack format can be written as
2351 
2352         (2 3 4 |)           (0 2 3 |)
2353    v =  (5 6 0 |)  colidx = (0 2 2 |)
2354         --------            ---------
2355         (7 8 |)             (2 3 |)
2356         (9 9 |)             (2 3 |)
2357 
2358     The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case).
2359     Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and
2360     zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice.
2361 
2362     The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9)  and for colidx is (0 0 2 2 3 2 2 2 3 3)
2363 
2364 .ve
2365 
2366       See MatMult_SeqSELL() for how this format is used with the SIMD operations to achieve high performance.
2367 
2368  References:
2369 . * - Hong Zhang, Richard T. Mills, Karl Rupp, and Barry F. Smith, Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512},
2370    Proceedings of the 47th International Conference on Parallel Processing, 2018.
2371 
2372 .seealso: `Mat`, `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSell()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ`
2373 M*/
2374 
2375 /*@C
2376   MatCreateSeqSELL - Creates a sparse matrix in `MATSEQSELL` format.
2377 
2378   Collective
2379 
2380   Input Parameters:
2381 + comm    - MPI communicator, set to `PETSC_COMM_SELF`
2382 . m       - number of rows
2383 . n       - number of columns
2384 . rlenmax - maximum number of nonzeros in a row, ignored if `rlen` is provided
2385 - rlen    - array containing the number of nonzeros in the various rows (possibly different for each row) or NULL
2386 
2387   Output Parameter:
2388 . A - the matrix
2389 
2390   Level: intermediate
2391 
2392   Notes:
2393   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
2394   MatXXXXSetPreallocation() paradigm instead of this routine directly.
2395   [MatXXXXSetPreallocation() is, for example, `MatSeqSELLSetPreallocation()`]
2396 
2397   Specify the preallocated storage with either `rlenmax` or `rlen` (not both).
2398   Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory
2399   allocation.
2400 
2401 .seealso: `Mat`, `MATSEQSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATMPISELL`
2402  @*/
2403 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt rlenmax, const PetscInt rlen[], Mat *A)
2404 {
2405   PetscFunctionBegin;
2406   PetscCall(MatCreate(comm, A));
2407   PetscCall(MatSetSizes(*A, m, n, m, n));
2408   PetscCall(MatSetType(*A, MATSEQSELL));
2409   PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A, rlenmax, rlen));
2410   PetscFunctionReturn(PETSC_SUCCESS);
2411 }
2412 
2413 PetscErrorCode MatEqual_SeqSELL(Mat A, Mat B, PetscBool *flg)
2414 {
2415   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data, *b = (Mat_SeqSELL *)B->data;
2416   PetscInt     totalslices = a->totalslices;
2417 
2418   PetscFunctionBegin;
2419   /* If the  matrix dimensions are not equal,or no of nonzeros */
2420   if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) || (a->nz != b->nz) || (a->rlenmax != b->rlenmax)) {
2421     *flg = PETSC_FALSE;
2422     PetscFunctionReturn(PETSC_SUCCESS);
2423   }
2424   /* if the a->colidx are the same */
2425   PetscCall(PetscArraycmp(a->colidx, b->colidx, a->sliidx[totalslices], flg));
2426   if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
2427   /* if a->val are the same */
2428   PetscCall(PetscArraycmp(a->val, b->val, a->sliidx[totalslices], flg));
2429   PetscFunctionReturn(PETSC_SUCCESS);
2430 }
2431 
2432 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A)
2433 {
2434   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2435 
2436   PetscFunctionBegin;
2437   a->idiagvalid = PETSC_FALSE;
2438   PetscFunctionReturn(PETSC_SUCCESS);
2439 }
2440 
2441 PetscErrorCode MatConjugate_SeqSELL(Mat A)
2442 {
2443 #if defined(PETSC_USE_COMPLEX)
2444   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2445   PetscInt     i;
2446   PetscScalar *val = a->val;
2447 
2448   PetscFunctionBegin;
2449   for (i = 0; i < a->sliidx[a->totalslices]; i++) { val[i] = PetscConj(val[i]); }
2450   #if defined(PETSC_HAVE_CUDA)
2451   if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU;
2452   #endif
2453 #else
2454   PetscFunctionBegin;
2455 #endif
2456   PetscFunctionReturn(PETSC_SUCCESS);
2457 }
2458