xref: /petsc/src/mat/impls/sell/seq/sell.c (revision 2d30e087755efd99e28fdfe792ffbeb2ee1ea928)
1 
2 /*
3   Defines the basic matrix operations for the SELL matrix storage format.
4 */
5 #include <../src/mat/impls/sell/seq/sell.h> /*I   "petscmat.h"  I*/
6 #include <petscblaslapack.h>
7 #include <petsc/private/kernels/blocktranspose.h>
8 
9 static PetscBool  cited      = PETSC_FALSE;
10 static const char citation[] = "@inproceedings{ZhangELLPACK2018,\n"
11                                " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n"
12                                " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n"
13                                " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n"
14                                " year = 2018\n"
15                                "}\n";
16 
17 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
18 
19 #include <immintrin.h>
20 
21 #if !defined(_MM_SCALE_8)
22 #define _MM_SCALE_8 8
23 #endif
24 
25 #if defined(__AVX512F__)
26 /* these do not work
27    vec_idx  = _mm512_loadunpackhi_epi32(vec_idx,acolidx);
28    vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval);
29   */
30 #define AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
31   /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \
32   vec_idx  = _mm256_loadu_si256((__m256i const *)acolidx); \
33   vec_vals = _mm512_loadu_pd(aval); \
34   vec_x    = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); \
35   vec_y    = _mm512_fmadd_pd(vec_x, vec_vals, vec_y)
36 #elif defined(__AVX2__) && defined(__FMA__)
37 #define AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
38   vec_vals = _mm256_loadu_pd(aval); \
39   vec_idx  = _mm_loadu_si128((__m128i const *)acolidx); /* SSE2 */ \
40   vec_x    = _mm256_i32gather_pd(x, vec_idx, _MM_SCALE_8); \
41   vec_y    = _mm256_fmadd_pd(vec_x, vec_vals, vec_y)
42 #endif
43 #endif /* PETSC_HAVE_IMMINTRIN_H */
44 
45 /*@C
46  MatSeqSELLSetPreallocation - For good matrix assembly performance
47  the user should preallocate the matrix storage by setting the parameter nz
48  (or the array nnz).  By setting these parameters accurately, performance
49  during matrix assembly can be increased significantly.
50 
51  Collective
52 
53  Input Parameters:
54  +  B - The `MATSEQSELL` matrix
55  .  nz - number of nonzeros per row (same for all rows)
56  -  nnz - array containing the number of nonzeros in the various rows
57  (possibly different for each row) or NULL
58 
59  Notes:
60  If nnz is given then nz is ignored.
61 
62  Specify the preallocated storage with either nz or nnz (not both).
63  Set nz = `PETSC_DEFAULT` and nnz = NULL for PETSc to control dynamic memory
64  allocation.  For large problems you MUST preallocate memory or you
65  will get TERRIBLE performance, see the users' manual chapter on matrices.
66 
67  You can call `MatGetInfo()` to get information on how effective the preallocation was;
68  for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
69  You can also run with the option -info and look for messages with the string
70  malloc in them to see if additional memory allocation was needed.
71 
72  Developers: Use nz of `MAT_SKIP_ALLOCATION` to not allocate any space for the matrix
73  entries or columns indices.
74 
75  The maximum number of nonzeos in any row should be as accurate as possible.
76  If it is underestimated, you will get bad performance due to reallocation
77  (MatSeqXSELLReallocateSELL).
78 
79  Level: intermediate
80 
81  .seealso: `MATSEQSELL`, `MATSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()`
82  @*/
83 PetscErrorCode MatSeqSELLSetPreallocation(Mat B, PetscInt rlenmax, const PetscInt rlen[]) {
84   PetscFunctionBegin;
85   PetscValidHeaderSpecific(B, MAT_CLASSID, 1);
86   PetscValidType(B, 1);
87   PetscTryMethod(B, "MatSeqSELLSetPreallocation_C", (Mat, PetscInt, const PetscInt[]), (B, rlenmax, rlen));
88   PetscFunctionReturn(0);
89 }
90 
91 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B, PetscInt maxallocrow, const PetscInt rlen[]) {
92   Mat_SeqSELL *b;
93   PetscInt     i, j, totalslices;
94   PetscBool    skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE;
95 
96   PetscFunctionBegin;
97   if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE;
98   if (maxallocrow == MAT_SKIP_ALLOCATION) {
99     skipallocation = PETSC_TRUE;
100     maxallocrow    = 0;
101   }
102 
103   PetscCall(PetscLayoutSetUp(B->rmap));
104   PetscCall(PetscLayoutSetUp(B->cmap));
105 
106   /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */
107   if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5;
108   PetscCheck(maxallocrow >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "maxallocrow cannot be less than 0: value %" PetscInt_FMT, maxallocrow);
109   if (rlen) {
110     for (i = 0; i < B->rmap->n; i++) {
111       PetscCheck(rlen[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, rlen[i]);
112       PetscCheck(rlen[i] <= B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT, i, rlen[i], B->cmap->n);
113     }
114   }
115 
116   B->preallocated = PETSC_TRUE;
117 
118   b = (Mat_SeqSELL *)B->data;
119 
120   totalslices    = PetscCeilInt(B->rmap->n, 8);
121   b->totalslices = totalslices;
122   if (!skipallocation) {
123     if (B->rmap->n & 0x07) PetscCall(PetscInfo(B, "Padding rows to the SEQSELL matrix because the number of rows is not the multiple of 8 (value %" PetscInt_FMT ")\n", B->rmap->n));
124 
125     if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */
126       PetscCall(PetscMalloc1(totalslices + 1, &b->sliidx));
127       PetscCall(PetscLogObjectMemory((PetscObject)B, (totalslices + 1) * sizeof(PetscInt)));
128     }
129     if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */
130       if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10;
131       else if (maxallocrow < 0) maxallocrow = 1;
132       for (i = 0; i <= totalslices; i++) b->sliidx[i] = i * 8 * maxallocrow;
133     } else {
134       maxallocrow  = 0;
135       b->sliidx[0] = 0;
136       for (i = 1; i < totalslices; i++) {
137         b->sliidx[i] = 0;
138         for (j = 0; j < 8; j++) b->sliidx[i] = PetscMax(b->sliidx[i], rlen[8 * (i - 1) + j]);
139         maxallocrow = PetscMax(b->sliidx[i], maxallocrow);
140         PetscCall(PetscIntSumError(b->sliidx[i - 1], 8 * b->sliidx[i], &b->sliidx[i]));
141       }
142       /* last slice */
143       b->sliidx[totalslices] = 0;
144       for (j = (totalslices - 1) * 8; j < B->rmap->n; j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices], rlen[j]);
145       maxallocrow            = PetscMax(b->sliidx[totalslices], maxallocrow);
146       b->sliidx[totalslices] = b->sliidx[totalslices - 1] + 8 * b->sliidx[totalslices];
147     }
148 
149     /* allocate space for val, colidx, rlen */
150     /* FIXME: should B's old memory be unlogged? */
151     PetscCall(MatSeqXSELLFreeSELL(B, &b->val, &b->colidx));
152     /* FIXME: assuming an element of the bit array takes 8 bits */
153     PetscCall(PetscMalloc2(b->sliidx[totalslices], &b->val, b->sliidx[totalslices], &b->colidx));
154     PetscCall(PetscLogObjectMemory((PetscObject)B, b->sliidx[totalslices] * (sizeof(PetscScalar) + sizeof(PetscInt))));
155     /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */
156     PetscCall(PetscCalloc1(8 * totalslices, &b->rlen));
157     PetscCall(PetscLogObjectMemory((PetscObject)B, 8 * totalslices * sizeof(PetscInt)));
158 
159     b->singlemalloc = PETSC_TRUE;
160     b->free_val     = PETSC_TRUE;
161     b->free_colidx  = PETSC_TRUE;
162   } else {
163     b->free_val    = PETSC_FALSE;
164     b->free_colidx = PETSC_FALSE;
165   }
166 
167   b->nz               = 0;
168   b->maxallocrow      = maxallocrow;
169   b->rlenmax          = maxallocrow;
170   b->maxallocmat      = b->sliidx[totalslices];
171   B->info.nz_unneeded = (double)b->maxallocmat;
172   if (realalloc) PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));
173   PetscFunctionReturn(0);
174 }
175 
176 PetscErrorCode MatGetRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v) {
177   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
178   PetscInt     shift;
179 
180   PetscFunctionBegin;
181   PetscCheck(row >= 0 && row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range", row);
182   if (nz) *nz = a->rlen[row];
183   shift = a->sliidx[row >> 3] + (row & 0x07);
184   if (!a->getrowcols) PetscCall(PetscMalloc2(a->rlenmax, &a->getrowcols, a->rlenmax, &a->getrowvals));
185   if (idx) {
186     PetscInt j;
187     for (j = 0; j < a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift + 8 * j];
188     *idx = a->getrowcols;
189   }
190   if (v) {
191     PetscInt j;
192     for (j = 0; j < a->rlen[row]; j++) a->getrowvals[j] = a->val[shift + 8 * j];
193     *v = a->getrowvals;
194   }
195   PetscFunctionReturn(0);
196 }
197 
198 PetscErrorCode MatRestoreRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v) {
199   PetscFunctionBegin;
200   PetscFunctionReturn(0);
201 }
202 
203 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat) {
204   Mat          B;
205   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
206   PetscInt     i;
207 
208   PetscFunctionBegin;
209   if (reuse == MAT_REUSE_MATRIX) {
210     B = *newmat;
211     PetscCall(MatZeroEntries(B));
212   } else {
213     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
214     PetscCall(MatSetSizes(B, A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N));
215     PetscCall(MatSetType(B, MATSEQAIJ));
216     PetscCall(MatSeqAIJSetPreallocation(B, 0, a->rlen));
217   }
218 
219   for (i = 0; i < A->rmap->n; i++) {
220     PetscInt     nz = 0, *cols = NULL;
221     PetscScalar *vals = NULL;
222 
223     PetscCall(MatGetRow_SeqSELL(A, i, &nz, &cols, &vals));
224     PetscCall(MatSetValues(B, 1, &i, nz, cols, vals, INSERT_VALUES));
225     PetscCall(MatRestoreRow_SeqSELL(A, i, &nz, &cols, &vals));
226   }
227 
228   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
229   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
230   B->rmap->bs = A->rmap->bs;
231 
232   if (reuse == MAT_INPLACE_MATRIX) {
233     PetscCall(MatHeaderReplace(A, &B));
234   } else {
235     *newmat = B;
236   }
237   PetscFunctionReturn(0);
238 }
239 
240 #include <../src/mat/impls/aij/seq/aij.h>
241 
242 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A, MatType newtype, MatReuse reuse, Mat *newmat) {
243   Mat                B;
244   Mat_SeqAIJ        *a  = (Mat_SeqAIJ *)A->data;
245   PetscInt          *ai = a->i, m = A->rmap->N, n = A->cmap->N, i, *rowlengths, row, ncols;
246   const PetscInt    *cols;
247   const PetscScalar *vals;
248 
249   PetscFunctionBegin;
250 
251   if (reuse == MAT_REUSE_MATRIX) {
252     B = *newmat;
253   } else {
254     if (PetscDefined(USE_DEBUG) || !a->ilen) {
255       PetscCall(PetscMalloc1(m, &rowlengths));
256       for (i = 0; i < m; i++) rowlengths[i] = ai[i + 1] - ai[i];
257     }
258     if (PetscDefined(USE_DEBUG) && a->ilen) {
259       PetscBool eq;
260       PetscCall(PetscMemcmp(rowlengths, a->ilen, m * sizeof(PetscInt), &eq));
261       PetscCheck(eq, PETSC_COMM_SELF, PETSC_ERR_PLIB, "SeqAIJ ilen array incorrect");
262       PetscCall(PetscFree(rowlengths));
263       rowlengths = a->ilen;
264     } else if (a->ilen) rowlengths = a->ilen;
265     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
266     PetscCall(MatSetSizes(B, m, n, m, n));
267     PetscCall(MatSetType(B, MATSEQSELL));
268     PetscCall(MatSeqSELLSetPreallocation(B, 0, rowlengths));
269     if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths));
270   }
271 
272   for (row = 0; row < m; row++) {
273     PetscCall(MatGetRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
274     PetscCall(MatSetValues_SeqSELL(B, 1, &row, ncols, cols, vals, INSERT_VALUES));
275     PetscCall(MatRestoreRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
276   }
277   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
278   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
279   B->rmap->bs = A->rmap->bs;
280 
281   if (reuse == MAT_INPLACE_MATRIX) {
282     PetscCall(MatHeaderReplace(A, &B));
283   } else {
284     *newmat = B;
285   }
286   PetscFunctionReturn(0);
287 }
288 
289 PetscErrorCode MatMult_SeqSELL(Mat A, Vec xx, Vec yy) {
290   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
291   PetscScalar       *y;
292   const PetscScalar *x;
293   const MatScalar   *aval        = a->val;
294   PetscInt           totalslices = a->totalslices;
295   const PetscInt    *acolidx     = a->colidx;
296   PetscInt           i, j;
297 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
298   __m512d  vec_x, vec_y, vec_vals;
299   __m256i  vec_idx;
300   __mmask8 mask;
301   __m512d  vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
302   __m256i  vec_idx2, vec_idx3, vec_idx4;
303 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
304   __m128i   vec_idx;
305   __m256d   vec_x, vec_y, vec_y2, vec_vals;
306   MatScalar yval;
307   PetscInt  r, rows_left, row, nnz_in_row;
308 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
309   __m128d   vec_x_tmp;
310   __m256d   vec_x, vec_y, vec_y2, vec_vals;
311   MatScalar yval;
312   PetscInt  r, rows_left, row, nnz_in_row;
313 #else
314   PetscScalar sum[8];
315 #endif
316 
317 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
318 #pragma disjoint(*x, *y, *aval)
319 #endif
320 
321   PetscFunctionBegin;
322   PetscCall(VecGetArrayRead(xx, &x));
323   PetscCall(VecGetArray(yy, &y));
324 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
325   for (i = 0; i < totalslices; i++) { /* loop over slices */
326     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
327     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
328 
329     vec_y  = _mm512_setzero_pd();
330     vec_y2 = _mm512_setzero_pd();
331     vec_y3 = _mm512_setzero_pd();
332     vec_y4 = _mm512_setzero_pd();
333 
334     j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice columnn */
335     switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
336     case 3:
337       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
338       acolidx += 8;
339       aval += 8;
340       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
341       acolidx += 8;
342       aval += 8;
343       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
344       acolidx += 8;
345       aval += 8;
346       j += 3;
347       break;
348     case 2:
349       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
350       acolidx += 8;
351       aval += 8;
352       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
353       acolidx += 8;
354       aval += 8;
355       j += 2;
356       break;
357     case 1:
358       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
359       acolidx += 8;
360       aval += 8;
361       j += 1;
362       break;
363     }
364 #pragma novector
365     for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
366       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
367       acolidx += 8;
368       aval += 8;
369       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
370       acolidx += 8;
371       aval += 8;
372       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
373       acolidx += 8;
374       aval += 8;
375       AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
376       acolidx += 8;
377       aval += 8;
378     }
379 
380     vec_y = _mm512_add_pd(vec_y, vec_y2);
381     vec_y = _mm512_add_pd(vec_y, vec_y3);
382     vec_y = _mm512_add_pd(vec_y, vec_y4);
383     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
384       mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
385       _mm512_mask_storeu_pd(&y[8 * i], mask, vec_y);
386     } else {
387       _mm512_storeu_pd(&y[8 * i], vec_y);
388     }
389   }
390 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
391   for (i = 0; i < totalslices; i++) { /* loop over full slices */
392     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
393     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
394 
395     /* last slice may have padding rows. Don't use vectorization. */
396     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
397       rows_left = A->rmap->n - 8 * i;
398       for (r = 0; r < rows_left; ++r) {
399         yval       = (MatScalar)0;
400         row        = 8 * i + r;
401         nnz_in_row = a->rlen[row];
402         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
403         y[row] = yval;
404       }
405       break;
406     }
407 
408     vec_y  = _mm256_setzero_pd();
409     vec_y2 = _mm256_setzero_pd();
410 
411 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
412 #pragma novector
413 #pragma unroll(2)
414     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
415       AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
416       aval += 4;
417       acolidx += 4;
418       AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y2);
419       aval += 4;
420       acolidx += 4;
421     }
422 
423     _mm256_storeu_pd(y + i * 8, vec_y);
424     _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
425   }
426 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
427   for (i = 0; i < totalslices; i++) { /* loop over full slices */
428     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
429     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
430 
431     vec_y  = _mm256_setzero_pd();
432     vec_y2 = _mm256_setzero_pd();
433 
434     /* last slice may have padding rows. Don't use vectorization. */
435     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
436       rows_left = A->rmap->n - 8 * i;
437       for (r = 0; r < rows_left; ++r) {
438         yval       = (MatScalar)0;
439         row        = 8 * i + r;
440         nnz_in_row = a->rlen[row];
441         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
442         y[row] = yval;
443       }
444       break;
445     }
446 
447 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
448 #pragma novector
449 #pragma unroll(2)
450     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
451       vec_vals  = _mm256_loadu_pd(aval);
452       vec_x_tmp = _mm_setzero_pd();
453       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
454       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
455       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
456       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
457       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
458       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
459       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
460       aval += 4;
461 
462       vec_vals  = _mm256_loadu_pd(aval);
463       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
464       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
465       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
466       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
467       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
468       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
469       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
470       aval += 4;
471     }
472 
473     _mm256_storeu_pd(y + i * 8, vec_y);
474     _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
475   }
476 #else
477   for (i = 0; i < totalslices; i++) { /* loop over slices */
478     for (j = 0; j < 8; j++) sum[j] = 0.0;
479     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
480       sum[0] += aval[j] * x[acolidx[j]];
481       sum[1] += aval[j + 1] * x[acolidx[j + 1]];
482       sum[2] += aval[j + 2] * x[acolidx[j + 2]];
483       sum[3] += aval[j + 3] * x[acolidx[j + 3]];
484       sum[4] += aval[j + 4] * x[acolidx[j + 4]];
485       sum[5] += aval[j + 5] * x[acolidx[j + 5]];
486       sum[6] += aval[j + 6] * x[acolidx[j + 6]];
487       sum[7] += aval[j + 7] * x[acolidx[j + 7]];
488     }
489     if (i == totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
490       for (j = 0; j < (A->rmap->n & 0x07); j++) y[8 * i + j] = sum[j];
491     } else {
492       for (j = 0; j < 8; j++) y[8 * i + j] = sum[j];
493     }
494   }
495 #endif
496 
497   PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); /* theoretical minimal FLOPs */
498   PetscCall(VecRestoreArrayRead(xx, &x));
499   PetscCall(VecRestoreArray(yy, &y));
500   PetscFunctionReturn(0);
501 }
502 
503 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h>
504 PetscErrorCode MatMultAdd_SeqSELL(Mat A, Vec xx, Vec yy, Vec zz) {
505   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
506   PetscScalar       *y, *z;
507   const PetscScalar *x;
508   const MatScalar   *aval        = a->val;
509   PetscInt           totalslices = a->totalslices;
510   const PetscInt    *acolidx     = a->colidx;
511   PetscInt           i, j;
512 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
513   __m512d  vec_x, vec_y, vec_vals;
514   __m256i  vec_idx;
515   __mmask8 mask;
516   __m512d  vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
517   __m256i  vec_idx2, vec_idx3, vec_idx4;
518 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
519   __m128d   vec_x_tmp;
520   __m256d   vec_x, vec_y, vec_y2, vec_vals;
521   MatScalar yval;
522   PetscInt  r, row, nnz_in_row;
523 #else
524   PetscScalar sum[8];
525 #endif
526 
527 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
528 #pragma disjoint(*x, *y, *aval)
529 #endif
530 
531   PetscFunctionBegin;
532   PetscCall(VecGetArrayRead(xx, &x));
533   PetscCall(VecGetArrayPair(yy, zz, &y, &z));
534 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
535   for (i = 0; i < totalslices; i++) { /* loop over slices */
536     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
537     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
538 
539     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
540       mask  = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
541       vec_y = _mm512_mask_loadu_pd(vec_y, mask, &y[8 * i]);
542     } else {
543       vec_y = _mm512_loadu_pd(&y[8 * i]);
544     }
545     vec_y2 = _mm512_setzero_pd();
546     vec_y3 = _mm512_setzero_pd();
547     vec_y4 = _mm512_setzero_pd();
548 
549     j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice columnn */
550     switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
551     case 3:
552       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
553       acolidx += 8;
554       aval += 8;
555       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
556       acolidx += 8;
557       aval += 8;
558       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
559       acolidx += 8;
560       aval += 8;
561       j += 3;
562       break;
563     case 2:
564       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
565       acolidx += 8;
566       aval += 8;
567       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
568       acolidx += 8;
569       aval += 8;
570       j += 2;
571       break;
572     case 1:
573       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
574       acolidx += 8;
575       aval += 8;
576       j += 1;
577       break;
578     }
579 #pragma novector
580     for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
581       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
582       acolidx += 8;
583       aval += 8;
584       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
585       acolidx += 8;
586       aval += 8;
587       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
588       acolidx += 8;
589       aval += 8;
590       AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
591       acolidx += 8;
592       aval += 8;
593     }
594 
595     vec_y = _mm512_add_pd(vec_y, vec_y2);
596     vec_y = _mm512_add_pd(vec_y, vec_y3);
597     vec_y = _mm512_add_pd(vec_y, vec_y4);
598     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
599       _mm512_mask_storeu_pd(&z[8 * i], mask, vec_y);
600     } else {
601       _mm512_storeu_pd(&z[8 * i], vec_y);
602     }
603   }
604 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
605   for (i = 0; i < totalslices; i++) { /* loop over full slices */
606     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
607     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
608 
609     /* last slice may have padding rows. Don't use vectorization. */
610     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
611       for (r = 0; r < (A->rmap->n & 0x07); ++r) {
612         row        = 8 * i + r;
613         yval       = (MatScalar)0.0;
614         nnz_in_row = a->rlen[row];
615         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
616         z[row] = y[row] + yval;
617       }
618       break;
619     }
620 
621     vec_y  = _mm256_loadu_pd(y + 8 * i);
622     vec_y2 = _mm256_loadu_pd(y + 8 * i + 4);
623 
624     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
625     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
626       vec_vals  = _mm256_loadu_pd(aval);
627       vec_x_tmp = _mm_setzero_pd();
628       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
629       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
630       vec_x     = _mm256_setzero_pd();
631       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
632       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
633       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
634       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
635       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
636       aval += 4;
637 
638       vec_vals  = _mm256_loadu_pd(aval);
639       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
640       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
641       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
642       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
643       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
644       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
645       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
646       aval += 4;
647     }
648 
649     _mm256_storeu_pd(z + i * 8, vec_y);
650     _mm256_storeu_pd(z + i * 8 + 4, vec_y2);
651   }
652 #else
653   for (i = 0; i < totalslices; i++) { /* loop over slices */
654     for (j = 0; j < 8; j++) sum[j] = 0.0;
655     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
656       sum[0] += aval[j] * x[acolidx[j]];
657       sum[1] += aval[j + 1] * x[acolidx[j + 1]];
658       sum[2] += aval[j + 2] * x[acolidx[j + 2]];
659       sum[3] += aval[j + 3] * x[acolidx[j + 3]];
660       sum[4] += aval[j + 4] * x[acolidx[j + 4]];
661       sum[5] += aval[j + 5] * x[acolidx[j + 5]];
662       sum[6] += aval[j + 6] * x[acolidx[j + 6]];
663       sum[7] += aval[j + 7] * x[acolidx[j + 7]];
664     }
665     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
666       for (j = 0; j < (A->rmap->n & 0x07); j++) z[8 * i + j] = y[8 * i + j] + sum[j];
667     } else {
668       for (j = 0; j < 8; j++) z[8 * i + j] = y[8 * i + j] + sum[j];
669     }
670   }
671 #endif
672 
673   PetscCall(PetscLogFlops(2.0 * a->nz));
674   PetscCall(VecRestoreArrayRead(xx, &x));
675   PetscCall(VecRestoreArrayPair(yy, zz, &y, &z));
676   PetscFunctionReturn(0);
677 }
678 
679 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A, Vec xx, Vec zz, Vec yy) {
680   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
681   PetscScalar       *y;
682   const PetscScalar *x;
683   const MatScalar   *aval    = a->val;
684   const PetscInt    *acolidx = a->colidx;
685   PetscInt           i, j, r, row, nnz_in_row, totalslices = a->totalslices;
686 
687 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
688 #pragma disjoint(*x, *y, *aval)
689 #endif
690 
691   PetscFunctionBegin;
692   if (A->symmetric == PETSC_BOOL3_TRUE) {
693     PetscCall(MatMultAdd_SeqSELL(A, xx, zz, yy));
694     PetscFunctionReturn(0);
695   }
696   if (zz != yy) PetscCall(VecCopy(zz, yy));
697   PetscCall(VecGetArrayRead(xx, &x));
698   PetscCall(VecGetArray(yy, &y));
699   for (i = 0; i < a->totalslices; i++) { /* loop over slices */
700     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
701       for (r = 0; r < (A->rmap->n & 0x07); ++r) {
702         row        = 8 * i + r;
703         nnz_in_row = a->rlen[row];
704         for (j = 0; j < nnz_in_row; ++j) y[acolidx[8 * j + r]] += aval[8 * j + r] * x[row];
705       }
706       break;
707     }
708     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
709       y[acolidx[j]] += aval[j] * x[8 * i];
710       y[acolidx[j + 1]] += aval[j + 1] * x[8 * i + 1];
711       y[acolidx[j + 2]] += aval[j + 2] * x[8 * i + 2];
712       y[acolidx[j + 3]] += aval[j + 3] * x[8 * i + 3];
713       y[acolidx[j + 4]] += aval[j + 4] * x[8 * i + 4];
714       y[acolidx[j + 5]] += aval[j + 5] * x[8 * i + 5];
715       y[acolidx[j + 6]] += aval[j + 6] * x[8 * i + 6];
716       y[acolidx[j + 7]] += aval[j + 7] * x[8 * i + 7];
717     }
718   }
719   PetscCall(PetscLogFlops(2.0 * a->sliidx[a->totalslices]));
720   PetscCall(VecRestoreArrayRead(xx, &x));
721   PetscCall(VecRestoreArray(yy, &y));
722   PetscFunctionReturn(0);
723 }
724 
725 PetscErrorCode MatMultTranspose_SeqSELL(Mat A, Vec xx, Vec yy) {
726   PetscFunctionBegin;
727   if (A->symmetric == PETSC_BOOL3_TRUE) {
728     PetscCall(MatMult_SeqSELL(A, xx, yy));
729   } else {
730     PetscCall(VecSet(yy, 0.0));
731     PetscCall(MatMultTransposeAdd_SeqSELL(A, xx, yy, yy));
732   }
733   PetscFunctionReturn(0);
734 }
735 
736 /*
737      Checks for missing diagonals
738 */
739 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A, PetscBool *missing, PetscInt *d) {
740   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
741   PetscInt    *diag, i;
742 
743   PetscFunctionBegin;
744   *missing = PETSC_FALSE;
745   if (A->rmap->n > 0 && !(a->colidx)) {
746     *missing = PETSC_TRUE;
747     if (d) *d = 0;
748     PetscCall(PetscInfo(A, "Matrix has no entries therefore is missing diagonal\n"));
749   } else {
750     diag = a->diag;
751     for (i = 0; i < A->rmap->n; i++) {
752       if (diag[i] == -1) {
753         *missing = PETSC_TRUE;
754         if (d) *d = i;
755         PetscCall(PetscInfo(A, "Matrix is missing diagonal number %" PetscInt_FMT "\n", i));
756         break;
757       }
758     }
759   }
760   PetscFunctionReturn(0);
761 }
762 
763 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A) {
764   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
765   PetscInt     i, j, m = A->rmap->n, shift;
766 
767   PetscFunctionBegin;
768   if (!a->diag) {
769     PetscCall(PetscMalloc1(m, &a->diag));
770     PetscCall(PetscLogObjectMemory((PetscObject)A, m * sizeof(PetscInt)));
771     a->free_diag = PETSC_TRUE;
772   }
773   for (i = 0; i < m; i++) {                      /* loop over rows */
774     shift      = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
775     a->diag[i] = -1;
776     for (j = 0; j < a->rlen[i]; j++) {
777       if (a->colidx[shift + j * 8] == i) {
778         a->diag[i] = shift + j * 8;
779         break;
780       }
781     }
782   }
783   PetscFunctionReturn(0);
784 }
785 
786 /*
787   Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways
788 */
789 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A, PetscScalar omega, PetscScalar fshift) {
790   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
791   PetscInt     i, *diag, m = A->rmap->n;
792   MatScalar   *val = a->val;
793   PetscScalar *idiag, *mdiag;
794 
795   PetscFunctionBegin;
796   if (a->idiagvalid) PetscFunctionReturn(0);
797   PetscCall(MatMarkDiagonal_SeqSELL(A));
798   diag = a->diag;
799   if (!a->idiag) {
800     PetscCall(PetscMalloc3(m, &a->idiag, m, &a->mdiag, m, &a->ssor_work));
801     PetscCall(PetscLogObjectMemory((PetscObject)A, 3 * m * sizeof(PetscScalar)));
802     val = a->val;
803   }
804   mdiag = a->mdiag;
805   idiag = a->idiag;
806 
807   if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) {
808     for (i = 0; i < m; i++) {
809       mdiag[i] = val[diag[i]];
810       if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */
811         if (PetscRealPart(fshift)) {
812           PetscCall(PetscInfo(A, "Zero diagonal on row %" PetscInt_FMT "\n", i));
813           A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
814           A->factorerror_zeropivot_value = 0.0;
815           A->factorerror_zeropivot_row   = i;
816         } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Zero diagonal on row %" PetscInt_FMT, i);
817       }
818       idiag[i] = 1.0 / val[diag[i]];
819     }
820     PetscCall(PetscLogFlops(m));
821   } else {
822     for (i = 0; i < m; i++) {
823       mdiag[i] = val[diag[i]];
824       idiag[i] = omega / (fshift + val[diag[i]]);
825     }
826     PetscCall(PetscLogFlops(2.0 * m));
827   }
828   a->idiagvalid = PETSC_TRUE;
829   PetscFunctionReturn(0);
830 }
831 
832 PetscErrorCode MatZeroEntries_SeqSELL(Mat A) {
833   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
834 
835   PetscFunctionBegin;
836   PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices]));
837   PetscCall(MatSeqSELLInvalidateDiagonal(A));
838   PetscFunctionReturn(0);
839 }
840 
841 PetscErrorCode MatDestroy_SeqSELL(Mat A) {
842   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
843 
844   PetscFunctionBegin;
845 #if defined(PETSC_USE_LOG)
846   PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->n, A->cmap->n, a->nz);
847 #endif
848   PetscCall(MatSeqXSELLFreeSELL(A, &a->val, &a->colidx));
849   PetscCall(ISDestroy(&a->row));
850   PetscCall(ISDestroy(&a->col));
851   PetscCall(PetscFree(a->diag));
852   PetscCall(PetscFree(a->rlen));
853   PetscCall(PetscFree(a->sliidx));
854   PetscCall(PetscFree3(a->idiag, a->mdiag, a->ssor_work));
855   PetscCall(PetscFree(a->solve_work));
856   PetscCall(ISDestroy(&a->icol));
857   PetscCall(PetscFree(a->saved_values));
858   PetscCall(PetscFree2(a->getrowcols, a->getrowvals));
859 
860   PetscCall(PetscFree(A->data));
861 
862   PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL));
863   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL));
864   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL));
865   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetPreallocation_C", NULL));
866   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetArray_C", NULL));
867   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLRestoreArray_C", NULL));
868   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqaij_C", NULL));
869   PetscFunctionReturn(0);
870 }
871 
872 PetscErrorCode MatSetOption_SeqSELL(Mat A, MatOption op, PetscBool flg) {
873   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
874 
875   PetscFunctionBegin;
876   switch (op) {
877   case MAT_ROW_ORIENTED: a->roworiented = flg; break;
878   case MAT_KEEP_NONZERO_PATTERN: a->keepnonzeropattern = flg; break;
879   case MAT_NEW_NONZERO_LOCATIONS: a->nonew = (flg ? 0 : 1); break;
880   case MAT_NEW_NONZERO_LOCATION_ERR: a->nonew = (flg ? -1 : 0); break;
881   case MAT_NEW_NONZERO_ALLOCATION_ERR: a->nonew = (flg ? -2 : 0); break;
882   case MAT_UNUSED_NONZERO_LOCATION_ERR: a->nounused = (flg ? -1 : 0); break;
883   case MAT_FORCE_DIAGONAL_ENTRIES:
884   case MAT_IGNORE_OFF_PROC_ENTRIES:
885   case MAT_USE_HASH_TABLE:
886   case MAT_SORTED_FULL: PetscCall(PetscInfo(A, "Option %s ignored\n", MatOptions[op])); break;
887   case MAT_SPD:
888   case MAT_SYMMETRIC:
889   case MAT_STRUCTURALLY_SYMMETRIC:
890   case MAT_HERMITIAN:
891   case MAT_SYMMETRY_ETERNAL:
892   case MAT_STRUCTURAL_SYMMETRY_ETERNAL:
893   case MAT_SPD_ETERNAL:
894     /* These options are handled directly by MatSetOption() */
895     break;
896   default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unknown option %d", op);
897   }
898   PetscFunctionReturn(0);
899 }
900 
901 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A, Vec v) {
902   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
903   PetscInt     i, j, n, shift;
904   PetscScalar *x, zero = 0.0;
905 
906   PetscFunctionBegin;
907   PetscCall(VecGetLocalSize(v, &n));
908   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
909 
910   if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) {
911     PetscInt *diag = a->diag;
912     PetscCall(VecGetArray(v, &x));
913     for (i = 0; i < n; i++) x[i] = 1.0 / a->val[diag[i]];
914     PetscCall(VecRestoreArray(v, &x));
915     PetscFunctionReturn(0);
916   }
917 
918   PetscCall(VecSet(v, zero));
919   PetscCall(VecGetArray(v, &x));
920   for (i = 0; i < n; i++) {                 /* loop over rows */
921     shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
922     x[i]  = 0;
923     for (j = 0; j < a->rlen[i]; j++) {
924       if (a->colidx[shift + j * 8] == i) {
925         x[i] = a->val[shift + j * 8];
926         break;
927       }
928     }
929   }
930   PetscCall(VecRestoreArray(v, &x));
931   PetscFunctionReturn(0);
932 }
933 
934 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A, Vec ll, Vec rr) {
935   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
936   const PetscScalar *l, *r;
937   PetscInt           i, j, m, n, row;
938 
939   PetscFunctionBegin;
940   if (ll) {
941     /* The local size is used so that VecMPI can be passed to this routine
942        by MatDiagonalScale_MPISELL */
943     PetscCall(VecGetLocalSize(ll, &m));
944     PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
945     PetscCall(VecGetArrayRead(ll, &l));
946     for (i = 0; i < a->totalslices; i++) {                  /* loop over slices */
947       if (i == a->totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
948         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) {
949           if (row < (A->rmap->n & 0x07)) a->val[j] *= l[8 * i + row];
950         }
951       } else {
952         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) a->val[j] *= l[8 * i + row];
953       }
954     }
955     PetscCall(VecRestoreArrayRead(ll, &l));
956     PetscCall(PetscLogFlops(a->nz));
957   }
958   if (rr) {
959     PetscCall(VecGetLocalSize(rr, &n));
960     PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
961     PetscCall(VecGetArrayRead(rr, &r));
962     for (i = 0; i < a->totalslices; i++) {                  /* loop over slices */
963       if (i == a->totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
964         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) {
965           if (row < (A->rmap->n & 0x07)) a->val[j] *= r[a->colidx[j]];
966         }
967       } else {
968         for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j++) a->val[j] *= r[a->colidx[j]];
969       }
970     }
971     PetscCall(VecRestoreArrayRead(rr, &r));
972     PetscCall(PetscLogFlops(a->nz));
973   }
974   PetscCall(MatSeqSELLInvalidateDiagonal(A));
975   PetscFunctionReturn(0);
976 }
977 
978 PetscErrorCode MatGetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[]) {
979   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
980   PetscInt    *cp, i, k, low, high, t, row, col, l;
981   PetscInt     shift;
982   MatScalar   *vp;
983 
984   PetscFunctionBegin;
985   for (k = 0; k < m; k++) { /* loop over requested rows */
986     row = im[k];
987     if (row < 0) continue;
988     PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
989     shift = a->sliidx[row >> 3] + (row & 0x07); /* starting index of the row */
990     cp    = a->colidx + shift;                  /* pointer to the row */
991     vp    = a->val + shift;                     /* pointer to the row */
992     for (l = 0; l < n; l++) {                   /* loop over requested columns */
993       col = in[l];
994       if (col < 0) continue;
995       PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
996       high = a->rlen[row];
997       low  = 0; /* assume unsorted */
998       while (high - low > 5) {
999         t = (low + high) / 2;
1000         if (*(cp + t * 8) > col) high = t;
1001         else low = t;
1002       }
1003       for (i = low; i < high; i++) {
1004         if (*(cp + 8 * i) > col) break;
1005         if (*(cp + 8 * i) == col) {
1006           *v++ = *(vp + 8 * i);
1007           goto finished;
1008         }
1009       }
1010       *v++ = 0.0;
1011     finished:;
1012     }
1013   }
1014   PetscFunctionReturn(0);
1015 }
1016 
1017 PetscErrorCode MatView_SeqSELL_ASCII(Mat A, PetscViewer viewer) {
1018   Mat_SeqSELL      *a = (Mat_SeqSELL *)A->data;
1019   PetscInt          i, j, m = A->rmap->n, shift;
1020   const char       *name;
1021   PetscViewerFormat format;
1022 
1023   PetscFunctionBegin;
1024   PetscCall(PetscViewerGetFormat(viewer, &format));
1025   if (format == PETSC_VIEWER_ASCII_MATLAB) {
1026     PetscInt nofinalvalue = 0;
1027     /*
1028     if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) {
1029       nofinalvalue = 1;
1030     }
1031     */
1032     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1033     PetscCall(PetscViewerASCIIPrintf(viewer, "%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n", m, A->cmap->n));
1034     PetscCall(PetscViewerASCIIPrintf(viewer, "%% Nonzeros = %" PetscInt_FMT " \n", a->nz));
1035 #if defined(PETSC_USE_COMPLEX)
1036     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",4);\n", a->nz + nofinalvalue));
1037 #else
1038     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",3);\n", a->nz + nofinalvalue));
1039 #endif
1040     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = [\n"));
1041 
1042     for (i = 0; i < m; i++) {
1043       shift = a->sliidx[i >> 3] + (i & 0x07);
1044       for (j = 0; j < a->rlen[i]; j++) {
1045 #if defined(PETSC_USE_COMPLEX)
1046         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n", i + 1, a->colidx[shift + 8 * j] + 1, (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j])));
1047 #else
1048         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n", i + 1, a->colidx[shift + 8 * j] + 1, (double)a->val[shift + 8 * j]));
1049 #endif
1050       }
1051     }
1052     /*
1053     if (nofinalvalue) {
1054 #if defined(PETSC_USE_COMPLEX)
1055       PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n",m,A->cmap->n,0.,0.));
1056 #else
1057       PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n",m,A->cmap->n,0.0));
1058 #endif
1059     }
1060     */
1061     PetscCall(PetscObjectGetName((PetscObject)A, &name));
1062     PetscCall(PetscViewerASCIIPrintf(viewer, "];\n %s = spconvert(zzz);\n", name));
1063     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1064   } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) {
1065     PetscFunctionReturn(0);
1066   } else if (format == PETSC_VIEWER_ASCII_COMMON) {
1067     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1068     for (i = 0; i < m; i++) {
1069       PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1070       shift = a->sliidx[i >> 3] + (i & 0x07);
1071       for (j = 0; j < a->rlen[i]; j++) {
1072 #if defined(PETSC_USE_COMPLEX)
1073         if (PetscImaginaryPart(a->val[shift + 8 * j]) > 0.0 && PetscRealPart(a->val[shift + 8 * j]) != 0.0) {
1074           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j])));
1075         } else if (PetscImaginaryPart(a->val[shift + 8 * j]) < 0.0 && PetscRealPart(a->val[shift + 8 * j]) != 0.0) {
1076           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)-PetscImaginaryPart(a->val[shift + 8 * j])));
1077         } else if (PetscRealPart(a->val[shift + 8 * j]) != 0.0) {
1078           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j])));
1079         }
1080 #else
1081         if (a->val[shift + 8 * j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)a->val[shift + 8 * j]));
1082 #endif
1083       }
1084       PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1085     }
1086     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1087   } else if (format == PETSC_VIEWER_ASCII_DENSE) {
1088     PetscInt    cnt = 0, jcnt;
1089     PetscScalar value;
1090 #if defined(PETSC_USE_COMPLEX)
1091     PetscBool realonly = PETSC_TRUE;
1092     for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1093       if (PetscImaginaryPart(a->val[i]) != 0.0) {
1094         realonly = PETSC_FALSE;
1095         break;
1096       }
1097     }
1098 #endif
1099 
1100     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1101     for (i = 0; i < m; i++) {
1102       jcnt  = 0;
1103       shift = a->sliidx[i >> 3] + (i & 0x07);
1104       for (j = 0; j < A->cmap->n; j++) {
1105         if (jcnt < a->rlen[i] && j == a->colidx[shift + 8 * j]) {
1106           value = a->val[cnt++];
1107           jcnt++;
1108         } else {
1109           value = 0.0;
1110         }
1111 #if defined(PETSC_USE_COMPLEX)
1112         if (realonly) {
1113           PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)PetscRealPart(value)));
1114         } else {
1115           PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e+%7.5e i ", (double)PetscRealPart(value), (double)PetscImaginaryPart(value)));
1116         }
1117 #else
1118         PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)value));
1119 #endif
1120       }
1121       PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1122     }
1123     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1124   } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) {
1125     PetscInt fshift = 1;
1126     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1127 #if defined(PETSC_USE_COMPLEX)
1128     PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate complex general\n"));
1129 #else
1130     PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate real general\n"));
1131 #endif
1132     PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz));
1133     for (i = 0; i < m; i++) {
1134       shift = a->sliidx[i >> 3] + (i & 0x07);
1135       for (j = 0; j < a->rlen[i]; j++) {
1136 #if defined(PETSC_USE_COMPLEX)
1137         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n", i + fshift, a->colidx[shift + 8 * j] + fshift, (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j])));
1138 #else
1139         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g\n", i + fshift, a->colidx[shift + 8 * j] + fshift, (double)a->val[shift + 8 * j]));
1140 #endif
1141       }
1142     }
1143     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1144   } else if (format == PETSC_VIEWER_NATIVE) {
1145     for (i = 0; i < a->totalslices; i++) { /* loop over slices */
1146       PetscInt row;
1147       PetscCall(PetscViewerASCIIPrintf(viewer, "slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n", i, a->sliidx[i], a->sliidx[i + 1]));
1148       for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) {
1149 #if defined(PETSC_USE_COMPLEX)
1150         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1151           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1152         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1153           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), -(double)PetscImaginaryPart(a->val[j])));
1154         } else {
1155           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j])));
1156         }
1157 #else
1158         PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g\n", 8 * i + row, a->colidx[j], (double)a->val[j]));
1159 #endif
1160       }
1161     }
1162   } else {
1163     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1164     if (A->factortype) {
1165       for (i = 0; i < m; i++) {
1166         shift = a->sliidx[i >> 3] + (i & 0x07);
1167         PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1168         /* L part */
1169         for (j = shift; j < a->diag[i]; j += 8) {
1170 #if defined(PETSC_USE_COMPLEX)
1171           if (PetscImaginaryPart(a->val[shift + 8 * j]) > 0.0) {
1172             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1173           } else if (PetscImaginaryPart(a->val[shift + 8 * j]) < 0.0) {
1174             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j]))));
1175           } else {
1176             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j])));
1177           }
1178 #else
1179           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]));
1180 #endif
1181         }
1182         /* diagonal */
1183         j = a->diag[i];
1184 #if defined(PETSC_USE_COMPLEX)
1185         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1186           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)PetscImaginaryPart(1.0 / a->val[j])));
1187         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1188           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)(-PetscImaginaryPart(1.0 / a->val[j]))));
1189         } else {
1190           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j])));
1191         }
1192 #else
1193         PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)(1.0 / a->val[j])));
1194 #endif
1195 
1196         /* U part */
1197         for (j = a->diag[i] + 1; j < shift + 8 * a->rlen[i]; j += 8) {
1198 #if defined(PETSC_USE_COMPLEX)
1199           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1200             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1201           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1202             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j]))));
1203           } else {
1204             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j])));
1205           }
1206 #else
1207           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]));
1208 #endif
1209         }
1210         PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1211       }
1212     } else {
1213       for (i = 0; i < m; i++) {
1214         shift = a->sliidx[i >> 3] + (i & 0x07);
1215         PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1216         for (j = 0; j < a->rlen[i]; j++) {
1217 #if defined(PETSC_USE_COMPLEX)
1218           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1219             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j])));
1220           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1221             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)-PetscImaginaryPart(a->val[shift + 8 * j])));
1222           } else {
1223             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j])));
1224           }
1225 #else
1226           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)a->val[shift + 8 * j]));
1227 #endif
1228         }
1229         PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1230       }
1231     }
1232     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1233   }
1234   PetscCall(PetscViewerFlush(viewer));
1235   PetscFunctionReturn(0);
1236 }
1237 
1238 #include <petscdraw.h>
1239 PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw, void *Aa) {
1240   Mat               A = (Mat)Aa;
1241   Mat_SeqSELL      *a = (Mat_SeqSELL *)A->data;
1242   PetscInt          i, j, m = A->rmap->n, shift;
1243   int               color;
1244   PetscReal         xl, yl, xr, yr, x_l, x_r, y_l, y_r;
1245   PetscViewer       viewer;
1246   PetscViewerFormat format;
1247 
1248   PetscFunctionBegin;
1249   PetscCall(PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer));
1250   PetscCall(PetscViewerGetFormat(viewer, &format));
1251   PetscCall(PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr));
1252 
1253   /* loop over matrix elements drawing boxes */
1254 
1255   if (format != PETSC_VIEWER_DRAW_CONTOUR) {
1256     PetscDrawCollectiveBegin(draw);
1257     /* Blue for negative, Cyan for zero and  Red for positive */
1258     color = PETSC_DRAW_BLUE;
1259     for (i = 0; i < m; i++) {
1260       shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1261       y_l   = m - i - 1.0;
1262       y_r   = y_l + 1.0;
1263       for (j = 0; j < a->rlen[i]; j++) {
1264         x_l = a->colidx[shift + j * 8];
1265         x_r = x_l + 1.0;
1266         if (PetscRealPart(a->val[shift + 8 * j]) >= 0.) continue;
1267         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1268       }
1269     }
1270     color = PETSC_DRAW_CYAN;
1271     for (i = 0; i < m; i++) {
1272       shift = a->sliidx[i >> 3] + (i & 0x07);
1273       y_l   = m - i - 1.0;
1274       y_r   = y_l + 1.0;
1275       for (j = 0; j < a->rlen[i]; j++) {
1276         x_l = a->colidx[shift + j * 8];
1277         x_r = x_l + 1.0;
1278         if (a->val[shift + 8 * j] != 0.) continue;
1279         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1280       }
1281     }
1282     color = PETSC_DRAW_RED;
1283     for (i = 0; i < m; i++) {
1284       shift = a->sliidx[i >> 3] + (i & 0x07);
1285       y_l   = m - i - 1.0;
1286       y_r   = y_l + 1.0;
1287       for (j = 0; j < a->rlen[i]; j++) {
1288         x_l = a->colidx[shift + j * 8];
1289         x_r = x_l + 1.0;
1290         if (PetscRealPart(a->val[shift + 8 * j]) <= 0.) continue;
1291         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1292       }
1293     }
1294     PetscDrawCollectiveEnd(draw);
1295   } else {
1296     /* use contour shading to indicate magnitude of values */
1297     /* first determine max of all nonzero values */
1298     PetscReal minv = 0.0, maxv = 0.0;
1299     PetscInt  count = 0;
1300     PetscDraw popup;
1301     for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1302       if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]);
1303     }
1304     if (minv >= maxv) maxv = minv + PETSC_SMALL;
1305     PetscCall(PetscDrawGetPopup(draw, &popup));
1306     PetscCall(PetscDrawScalePopup(popup, minv, maxv));
1307 
1308     PetscDrawCollectiveBegin(draw);
1309     for (i = 0; i < m; i++) {
1310       shift = a->sliidx[i >> 3] + (i & 0x07);
1311       y_l   = m - i - 1.0;
1312       y_r   = y_l + 1.0;
1313       for (j = 0; j < a->rlen[i]; j++) {
1314         x_l   = a->colidx[shift + j * 8];
1315         x_r   = x_l + 1.0;
1316         color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]), minv, maxv);
1317         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1318         count++;
1319       }
1320     }
1321     PetscDrawCollectiveEnd(draw);
1322   }
1323   PetscFunctionReturn(0);
1324 }
1325 
1326 #include <petscdraw.h>
1327 PetscErrorCode MatView_SeqSELL_Draw(Mat A, PetscViewer viewer) {
1328   PetscDraw draw;
1329   PetscReal xr, yr, xl, yl, h, w;
1330   PetscBool isnull;
1331 
1332   PetscFunctionBegin;
1333   PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw));
1334   PetscCall(PetscDrawIsNull(draw, &isnull));
1335   if (isnull) PetscFunctionReturn(0);
1336 
1337   xr = A->cmap->n;
1338   yr = A->rmap->n;
1339   h  = yr / 10.0;
1340   w  = xr / 10.0;
1341   xr += w;
1342   yr += h;
1343   xl = -w;
1344   yl = -h;
1345   PetscCall(PetscDrawSetCoordinates(draw, xl, yl, xr, yr));
1346   PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer));
1347   PetscCall(PetscDrawZoom(draw, MatView_SeqSELL_Draw_Zoom, A));
1348   PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL));
1349   PetscCall(PetscDrawSave(draw));
1350   PetscFunctionReturn(0);
1351 }
1352 
1353 PetscErrorCode MatView_SeqSELL(Mat A, PetscViewer viewer) {
1354   PetscBool iascii, isbinary, isdraw;
1355 
1356   PetscFunctionBegin;
1357   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
1358   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
1359   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
1360   if (iascii) {
1361     PetscCall(MatView_SeqSELL_ASCII(A, viewer));
1362   } else if (isbinary) {
1363     /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */
1364   } else if (isdraw) PetscCall(MatView_SeqSELL_Draw(A, viewer));
1365   PetscFunctionReturn(0);
1366 }
1367 
1368 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A, MatAssemblyType mode) {
1369   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1370   PetscInt     i, shift, row_in_slice, row, nrow, *cp, lastcol, j, k;
1371   MatScalar   *vp;
1372 
1373   PetscFunctionBegin;
1374   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
1375   /* To do: compress out the unused elements */
1376   PetscCall(MatMarkDiagonal_SeqSELL(A));
1377   PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n", A->rmap->n, A->cmap->n, a->maxallocmat, a->sliidx[a->totalslices], a->nz, a->sliidx[a->totalslices] - a->nz));
1378   PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n", a->reallocs));
1379   PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rlenmax));
1380   /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */
1381   for (i = 0; i < a->totalslices; ++i) {
1382     shift = a->sliidx[i];                                      /* starting index of the slice */
1383     cp    = a->colidx + shift;                                 /* pointer to the column indices of the slice */
1384     vp    = a->val + shift;                                    /* pointer to the nonzero values of the slice */
1385     for (row_in_slice = 0; row_in_slice < 8; ++row_in_slice) { /* loop over rows in the slice */
1386       row     = 8 * i + row_in_slice;
1387       nrow    = a->rlen[row]; /* number of nonzeros in row */
1388       /*
1389         Search for the nearest nonzero. Normally setting the index to zero may cause extra communication.
1390         But if the entire slice are empty, it is fine to use 0 since the index will not be loaded.
1391       */
1392       lastcol = 0;
1393       if (nrow > 0) {                                /* nonempty row */
1394         lastcol = cp[8 * (nrow - 1) + row_in_slice]; /* use the index from the last nonzero at current row */
1395       } else if (!row_in_slice) {                    /* first row of the currect slice is empty */
1396         for (j = 1; j < 8; j++) {
1397           if (a->rlen[8 * i + j]) {
1398             lastcol = cp[j];
1399             break;
1400           }
1401         }
1402       } else {
1403         if (a->sliidx[i + 1] != shift) lastcol = cp[row_in_slice - 1]; /* use the index from the previous row */
1404       }
1405 
1406       for (k = nrow; k < (a->sliidx[i + 1] - shift) / 8; ++k) {
1407         cp[8 * k + row_in_slice] = lastcol;
1408         vp[8 * k + row_in_slice] = (MatScalar)0;
1409       }
1410     }
1411   }
1412 
1413   A->info.mallocs += a->reallocs;
1414   a->reallocs = 0;
1415 
1416   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1417   PetscFunctionReturn(0);
1418 }
1419 
1420 PetscErrorCode MatGetInfo_SeqSELL(Mat A, MatInfoType flag, MatInfo *info) {
1421   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1422 
1423   PetscFunctionBegin;
1424   info->block_size   = 1.0;
1425   info->nz_allocated = a->maxallocmat;
1426   info->nz_used      = a->sliidx[a->totalslices]; /* include padding zeros */
1427   info->nz_unneeded  = (a->maxallocmat - a->sliidx[a->totalslices]);
1428   info->assemblies   = A->num_ass;
1429   info->mallocs      = A->info.mallocs;
1430   info->memory       = ((PetscObject)A)->mem;
1431   if (A->factortype) {
1432     info->fill_ratio_given  = A->info.fill_ratio_given;
1433     info->fill_ratio_needed = A->info.fill_ratio_needed;
1434     info->factor_mallocs    = A->info.factor_mallocs;
1435   } else {
1436     info->fill_ratio_given  = 0;
1437     info->fill_ratio_needed = 0;
1438     info->factor_mallocs    = 0;
1439   }
1440   PetscFunctionReturn(0);
1441 }
1442 
1443 PetscErrorCode MatSetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is) {
1444   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1445   PetscInt     shift, i, k, l, low, high, t, ii, row, col, nrow;
1446   PetscInt    *cp, nonew = a->nonew, lastcol = -1;
1447   MatScalar   *vp, value;
1448 
1449   PetscFunctionBegin;
1450   for (k = 0; k < m; k++) { /* loop over added rows */
1451     row = im[k];
1452     if (row < 0) continue;
1453     PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
1454     shift = a->sliidx[row >> 3] + (row & 0x07); /* starting index of the row */
1455     cp    = a->colidx + shift;                  /* pointer to the row */
1456     vp    = a->val + shift;                     /* pointer to the row */
1457     nrow  = a->rlen[row];
1458     low   = 0;
1459     high  = nrow;
1460 
1461     for (l = 0; l < n; l++) { /* loop over added columns */
1462       col = in[l];
1463       if (col < 0) continue;
1464       PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
1465       if (a->roworiented) {
1466         value = v[l + k * n];
1467       } else {
1468         value = v[k + l * m];
1469       }
1470       if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue;
1471 
1472       /* search in this row for the specified column, i indicates the column to be set */
1473       if (col <= lastcol) low = 0;
1474       else high = nrow;
1475       lastcol = col;
1476       while (high - low > 5) {
1477         t = (low + high) / 2;
1478         if (*(cp + t * 8) > col) high = t;
1479         else low = t;
1480       }
1481       for (i = low; i < high; i++) {
1482         if (*(cp + i * 8) > col) break;
1483         if (*(cp + i * 8) == col) {
1484           if (is == ADD_VALUES) *(vp + i * 8) += value;
1485           else *(vp + i * 8) = value;
1486           low = i + 1;
1487           goto noinsert;
1488         }
1489       }
1490       if (value == 0.0 && a->ignorezeroentries) goto noinsert;
1491       if (nonew == 1) goto noinsert;
1492       PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col);
1493       /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */
1494       MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, row / 8, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar);
1495       /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */
1496       for (ii = nrow - 1; ii >= i; ii--) {
1497         *(cp + (ii + 1) * 8) = *(cp + ii * 8);
1498         *(vp + (ii + 1) * 8) = *(vp + ii * 8);
1499       }
1500       a->rlen[row]++;
1501       *(cp + i * 8) = col;
1502       *(vp + i * 8) = value;
1503       a->nz++;
1504       A->nonzerostate++;
1505       low = i + 1;
1506       high++;
1507       nrow++;
1508     noinsert:;
1509     }
1510     a->rlen[row] = nrow;
1511   }
1512   PetscFunctionReturn(0);
1513 }
1514 
1515 PetscErrorCode MatCopy_SeqSELL(Mat A, Mat B, MatStructure str) {
1516   PetscFunctionBegin;
1517   /* If the two matrices have the same copy implementation, use fast copy. */
1518   if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
1519     Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1520     Mat_SeqSELL *b = (Mat_SeqSELL *)B->data;
1521 
1522     PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices], PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Number of nonzeros in two matrices are different");
1523     PetscCall(PetscArraycpy(b->val, a->val, a->sliidx[a->totalslices]));
1524   } else {
1525     PetscCall(MatCopy_Basic(A, B, str));
1526   }
1527   PetscFunctionReturn(0);
1528 }
1529 
1530 PetscErrorCode MatSetUp_SeqSELL(Mat A) {
1531   PetscFunctionBegin;
1532   PetscCall(MatSeqSELLSetPreallocation(A, PETSC_DEFAULT, NULL));
1533   PetscFunctionReturn(0);
1534 }
1535 
1536 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A, PetscScalar *array[]) {
1537   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1538 
1539   PetscFunctionBegin;
1540   *array = a->val;
1541   PetscFunctionReturn(0);
1542 }
1543 
1544 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A, PetscScalar *array[]) {
1545   PetscFunctionBegin;
1546   PetscFunctionReturn(0);
1547 }
1548 
1549 PetscErrorCode MatRealPart_SeqSELL(Mat A) {
1550   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1551   PetscInt     i;
1552   MatScalar   *aval = a->val;
1553 
1554   PetscFunctionBegin;
1555   for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscRealPart(aval[i]);
1556   PetscFunctionReturn(0);
1557 }
1558 
1559 PetscErrorCode MatImaginaryPart_SeqSELL(Mat A) {
1560   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1561   PetscInt     i;
1562   MatScalar   *aval = a->val;
1563 
1564   PetscFunctionBegin;
1565   for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]);
1566   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1567   PetscFunctionReturn(0);
1568 }
1569 
1570 PetscErrorCode MatScale_SeqSELL(Mat inA, PetscScalar alpha) {
1571   Mat_SeqSELL *a      = (Mat_SeqSELL *)inA->data;
1572   MatScalar   *aval   = a->val;
1573   PetscScalar  oalpha = alpha;
1574   PetscBLASInt one    = 1, size;
1575 
1576   PetscFunctionBegin;
1577   PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices], &size));
1578   PetscCallBLAS("BLASscal", BLASscal_(&size, &oalpha, aval, &one));
1579   PetscCall(PetscLogFlops(a->nz));
1580   PetscCall(MatSeqSELLInvalidateDiagonal(inA));
1581   PetscFunctionReturn(0);
1582 }
1583 
1584 PetscErrorCode MatShift_SeqSELL(Mat Y, PetscScalar a) {
1585   Mat_SeqSELL *y = (Mat_SeqSELL *)Y->data;
1586 
1587   PetscFunctionBegin;
1588   if (!Y->preallocated || !y->nz) PetscCall(MatSeqSELLSetPreallocation(Y, 1, NULL));
1589   PetscCall(MatShift_Basic(Y, a));
1590   PetscFunctionReturn(0);
1591 }
1592 
1593 PetscErrorCode MatSOR_SeqSELL(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx) {
1594   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
1595   PetscScalar       *x, sum, *t;
1596   const MatScalar   *idiag = NULL, *mdiag;
1597   const PetscScalar *b, *xb;
1598   PetscInt           n, m = A->rmap->n, i, j, shift;
1599   const PetscInt    *diag;
1600 
1601   PetscFunctionBegin;
1602   its = its * lits;
1603 
1604   if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */
1605   if (!a->idiagvalid) PetscCall(MatInvertDiagonal_SeqSELL(A, omega, fshift));
1606   a->fshift = fshift;
1607   a->omega  = omega;
1608 
1609   diag  = a->diag;
1610   t     = a->ssor_work;
1611   idiag = a->idiag;
1612   mdiag = a->mdiag;
1613 
1614   PetscCall(VecGetArray(xx, &x));
1615   PetscCall(VecGetArrayRead(bb, &b));
1616   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1617   PetscCheck(flag != SOR_APPLY_UPPER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_UPPER is not implemented");
1618   PetscCheck(flag != SOR_APPLY_LOWER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_LOWER is not implemented");
1619   PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat");
1620 
1621   if (flag & SOR_ZERO_INITIAL_GUESS) {
1622     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1623       for (i = 0; i < m; i++) {
1624         shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1625         sum   = b[i];
1626         n     = (diag[i] - shift) / 8;
1627         for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]];
1628         t[i] = sum;
1629         x[i] = sum * idiag[i];
1630       }
1631       xb = t;
1632       PetscCall(PetscLogFlops(a->nz));
1633     } else xb = b;
1634     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1635       for (i = m - 1; i >= 0; i--) {
1636         shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1637         sum   = xb[i];
1638         n     = a->rlen[i] - (diag[i] - shift) / 8 - 1;
1639         for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]];
1640         if (xb == b) {
1641           x[i] = sum * idiag[i];
1642         } else {
1643           x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1644         }
1645       }
1646       PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1647     }
1648     its--;
1649   }
1650   while (its--) {
1651     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1652       for (i = 0; i < m; i++) {
1653         /* lower */
1654         shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1655         sum   = b[i];
1656         n     = (diag[i] - shift) / 8;
1657         for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]];
1658         t[i] = sum; /* save application of the lower-triangular part */
1659         /* upper */
1660         n    = a->rlen[i] - (diag[i] - shift) / 8 - 1;
1661         for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]];
1662         x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1663       }
1664       xb = t;
1665       PetscCall(PetscLogFlops(2.0 * a->nz));
1666     } else xb = b;
1667     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1668       for (i = m - 1; i >= 0; i--) {
1669         shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1670         sum   = xb[i];
1671         if (xb == b) {
1672           /* whole matrix (no checkpointing available) */
1673           n = a->rlen[i];
1674           for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]];
1675           x[i] = (1. - omega) * x[i] + (sum + mdiag[i] * x[i]) * idiag[i];
1676         } else { /* lower-triangular part has been saved, so only apply upper-triangular */
1677           n = a->rlen[i] - (diag[i] - shift) / 8 - 1;
1678           for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]];
1679           x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1680         }
1681       }
1682       if (xb == b) {
1683         PetscCall(PetscLogFlops(2.0 * a->nz));
1684       } else {
1685         PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1686       }
1687     }
1688   }
1689   PetscCall(VecRestoreArray(xx, &x));
1690   PetscCall(VecRestoreArrayRead(bb, &b));
1691   PetscFunctionReturn(0);
1692 }
1693 
1694 /* -------------------------------------------------------------------*/
1695 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL,
1696                                        MatGetRow_SeqSELL,
1697                                        MatRestoreRow_SeqSELL,
1698                                        MatMult_SeqSELL,
1699                                        /* 4*/ MatMultAdd_SeqSELL,
1700                                        MatMultTranspose_SeqSELL,
1701                                        MatMultTransposeAdd_SeqSELL,
1702                                        NULL,
1703                                        NULL,
1704                                        NULL,
1705                                        /* 10*/ NULL,
1706                                        NULL,
1707                                        NULL,
1708                                        MatSOR_SeqSELL,
1709                                        NULL,
1710                                        /* 15*/ MatGetInfo_SeqSELL,
1711                                        MatEqual_SeqSELL,
1712                                        MatGetDiagonal_SeqSELL,
1713                                        MatDiagonalScale_SeqSELL,
1714                                        NULL,
1715                                        /* 20*/ NULL,
1716                                        MatAssemblyEnd_SeqSELL,
1717                                        MatSetOption_SeqSELL,
1718                                        MatZeroEntries_SeqSELL,
1719                                        /* 24*/ NULL,
1720                                        NULL,
1721                                        NULL,
1722                                        NULL,
1723                                        NULL,
1724                                        /* 29*/ MatSetUp_SeqSELL,
1725                                        NULL,
1726                                        NULL,
1727                                        NULL,
1728                                        NULL,
1729                                        /* 34*/ MatDuplicate_SeqSELL,
1730                                        NULL,
1731                                        NULL,
1732                                        NULL,
1733                                        NULL,
1734                                        /* 39*/ NULL,
1735                                        NULL,
1736                                        NULL,
1737                                        MatGetValues_SeqSELL,
1738                                        MatCopy_SeqSELL,
1739                                        /* 44*/ NULL,
1740                                        MatScale_SeqSELL,
1741                                        MatShift_SeqSELL,
1742                                        NULL,
1743                                        NULL,
1744                                        /* 49*/ NULL,
1745                                        NULL,
1746                                        NULL,
1747                                        NULL,
1748                                        NULL,
1749                                        /* 54*/ MatFDColoringCreate_SeqXAIJ,
1750                                        NULL,
1751                                        NULL,
1752                                        NULL,
1753                                        NULL,
1754                                        /* 59*/ NULL,
1755                                        MatDestroy_SeqSELL,
1756                                        MatView_SeqSELL,
1757                                        NULL,
1758                                        NULL,
1759                                        /* 64*/ NULL,
1760                                        NULL,
1761                                        NULL,
1762                                        NULL,
1763                                        NULL,
1764                                        /* 69*/ NULL,
1765                                        NULL,
1766                                        NULL,
1767                                        NULL,
1768                                        NULL,
1769                                        /* 74*/ NULL,
1770                                        MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */
1771                                        NULL,
1772                                        NULL,
1773                                        NULL,
1774                                        /* 79*/ NULL,
1775                                        NULL,
1776                                        NULL,
1777                                        NULL,
1778                                        NULL,
1779                                        /* 84*/ NULL,
1780                                        NULL,
1781                                        NULL,
1782                                        NULL,
1783                                        NULL,
1784                                        /* 89*/ NULL,
1785                                        NULL,
1786                                        NULL,
1787                                        NULL,
1788                                        NULL,
1789                                        /* 94*/ NULL,
1790                                        NULL,
1791                                        NULL,
1792                                        NULL,
1793                                        NULL,
1794                                        /* 99*/ NULL,
1795                                        NULL,
1796                                        NULL,
1797                                        MatConjugate_SeqSELL,
1798                                        NULL,
1799                                        /*104*/ NULL,
1800                                        NULL,
1801                                        NULL,
1802                                        NULL,
1803                                        NULL,
1804                                        /*109*/ NULL,
1805                                        NULL,
1806                                        NULL,
1807                                        NULL,
1808                                        MatMissingDiagonal_SeqSELL,
1809                                        /*114*/ NULL,
1810                                        NULL,
1811                                        NULL,
1812                                        NULL,
1813                                        NULL,
1814                                        /*119*/ NULL,
1815                                        NULL,
1816                                        NULL,
1817                                        NULL,
1818                                        NULL,
1819                                        /*124*/ NULL,
1820                                        NULL,
1821                                        NULL,
1822                                        NULL,
1823                                        NULL,
1824                                        /*129*/ NULL,
1825                                        NULL,
1826                                        NULL,
1827                                        NULL,
1828                                        NULL,
1829                                        /*134*/ NULL,
1830                                        NULL,
1831                                        NULL,
1832                                        NULL,
1833                                        NULL,
1834                                        /*139*/ NULL,
1835                                        NULL,
1836                                        NULL,
1837                                        MatFDColoringSetUp_SeqXAIJ,
1838                                        NULL,
1839                                        /*144*/ NULL,
1840                                        NULL,
1841                                        NULL,
1842                                        NULL,
1843                                        NULL,
1844                                        NULL,
1845                                        /*150*/ NULL};
1846 
1847 PetscErrorCode MatStoreValues_SeqSELL(Mat mat) {
1848   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1849 
1850   PetscFunctionBegin;
1851   PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1852 
1853   /* allocate space for values if not already there */
1854   if (!a->saved_values) {
1855     PetscCall(PetscMalloc1(a->sliidx[a->totalslices] + 1, &a->saved_values));
1856     PetscCall(PetscLogObjectMemory((PetscObject)mat, (a->sliidx[a->totalslices] + 1) * sizeof(PetscScalar)));
1857   }
1858 
1859   /* copy values over */
1860   PetscCall(PetscArraycpy(a->saved_values, a->val, a->sliidx[a->totalslices]));
1861   PetscFunctionReturn(0);
1862 }
1863 
1864 PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat) {
1865   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1866 
1867   PetscFunctionBegin;
1868   PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1869   PetscCheck(a->saved_values, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatStoreValues(A);first");
1870   PetscCall(PetscArraycpy(a->val, a->saved_values, a->sliidx[a->totalslices]));
1871   PetscFunctionReturn(0);
1872 }
1873 
1874 /*@C
1875  MatSeqSELLRestoreArray - returns access to the array where the data for a `MATSEQSELL` matrix is stored obtained by `MatSeqSELLGetArray()`
1876 
1877  Not Collective
1878 
1879  Input Parameters:
1880  .  mat - a `MATSEQSELL` matrix
1881  .  array - pointer to the data
1882 
1883  Level: intermediate
1884 
1885  .seealso: `MATSEQSELL`, `MatSeqSELLGetArray()`, `MatSeqSELLRestoreArrayF90()`
1886  @*/
1887 PetscErrorCode MatSeqSELLRestoreArray(Mat A, PetscScalar **array) {
1888   PetscFunctionBegin;
1889   PetscUseMethod(A, "MatSeqSELLRestoreArray_C", (Mat, PetscScalar **), (A, array));
1890   PetscFunctionReturn(0);
1891 }
1892 
1893 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B) {
1894   Mat_SeqSELL *b;
1895   PetscMPIInt  size;
1896 
1897   PetscFunctionBegin;
1898   PetscCall(PetscCitationsRegister(citation, &cited));
1899   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
1900   PetscCheck(size <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Comm must be of size 1");
1901 
1902   PetscCall(PetscNewLog(B, &b));
1903 
1904   B->data = (void *)b;
1905 
1906   PetscCall(PetscMemcpy(B->ops, &MatOps_Values, sizeof(struct _MatOps)));
1907 
1908   b->row                = NULL;
1909   b->col                = NULL;
1910   b->icol               = NULL;
1911   b->reallocs           = 0;
1912   b->ignorezeroentries  = PETSC_FALSE;
1913   b->roworiented        = PETSC_TRUE;
1914   b->nonew              = 0;
1915   b->diag               = NULL;
1916   b->solve_work         = NULL;
1917   B->spptr              = NULL;
1918   b->saved_values       = NULL;
1919   b->idiag              = NULL;
1920   b->mdiag              = NULL;
1921   b->ssor_work          = NULL;
1922   b->omega              = 1.0;
1923   b->fshift             = 0.0;
1924   b->idiagvalid         = PETSC_FALSE;
1925   b->keepnonzeropattern = PETSC_FALSE;
1926 
1927   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELL));
1928   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetArray_C", MatSeqSELLGetArray_SeqSELL));
1929   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLRestoreArray_C", MatSeqSELLRestoreArray_SeqSELL));
1930   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqSELL));
1931   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqSELL));
1932   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetPreallocation_C", MatSeqSELLSetPreallocation_SeqSELL));
1933   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqaij_C", MatConvert_SeqSELL_SeqAIJ));
1934   PetscFunctionReturn(0);
1935 }
1936 
1937 /*
1938  Given a matrix generated with MatGetFactor() duplicates all the information in A into B
1939  */
1940 PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace) {
1941   Mat_SeqSELL *c = (Mat_SeqSELL *)C->data, *a = (Mat_SeqSELL *)A->data;
1942   PetscInt     i, m                           = A->rmap->n;
1943   PetscInt     totalslices = a->totalslices;
1944 
1945   PetscFunctionBegin;
1946   C->factortype = A->factortype;
1947   c->row        = NULL;
1948   c->col        = NULL;
1949   c->icol       = NULL;
1950   c->reallocs   = 0;
1951   C->assembled  = PETSC_TRUE;
1952 
1953   PetscCall(PetscLayoutReference(A->rmap, &C->rmap));
1954   PetscCall(PetscLayoutReference(A->cmap, &C->cmap));
1955 
1956   PetscCall(PetscMalloc1(8 * totalslices, &c->rlen));
1957   PetscCall(PetscLogObjectMemory((PetscObject)C, m * sizeof(PetscInt)));
1958   PetscCall(PetscMalloc1(totalslices + 1, &c->sliidx));
1959   PetscCall(PetscLogObjectMemory((PetscObject)C, (totalslices + 1) * sizeof(PetscInt)));
1960 
1961   for (i = 0; i < m; i++) c->rlen[i] = a->rlen[i];
1962   for (i = 0; i < totalslices + 1; i++) c->sliidx[i] = a->sliidx[i];
1963 
1964   /* allocate the matrix space */
1965   if (mallocmatspace) {
1966     PetscCall(PetscMalloc2(a->maxallocmat, &c->val, a->maxallocmat, &c->colidx));
1967     PetscCall(PetscLogObjectMemory((PetscObject)C, a->maxallocmat * (sizeof(PetscScalar) + sizeof(PetscInt))));
1968 
1969     c->singlemalloc = PETSC_TRUE;
1970 
1971     if (m > 0) {
1972       PetscCall(PetscArraycpy(c->colidx, a->colidx, a->maxallocmat));
1973       if (cpvalues == MAT_COPY_VALUES) {
1974         PetscCall(PetscArraycpy(c->val, a->val, a->maxallocmat));
1975       } else {
1976         PetscCall(PetscArrayzero(c->val, a->maxallocmat));
1977       }
1978     }
1979   }
1980 
1981   c->ignorezeroentries = a->ignorezeroentries;
1982   c->roworiented       = a->roworiented;
1983   c->nonew             = a->nonew;
1984   if (a->diag) {
1985     PetscCall(PetscMalloc1(m, &c->diag));
1986     PetscCall(PetscLogObjectMemory((PetscObject)C, m * sizeof(PetscInt)));
1987     for (i = 0; i < m; i++) c->diag[i] = a->diag[i];
1988   } else c->diag = NULL;
1989 
1990   c->solve_work         = NULL;
1991   c->saved_values       = NULL;
1992   c->idiag              = NULL;
1993   c->ssor_work          = NULL;
1994   c->keepnonzeropattern = a->keepnonzeropattern;
1995   c->free_val           = PETSC_TRUE;
1996   c->free_colidx        = PETSC_TRUE;
1997 
1998   c->maxallocmat  = a->maxallocmat;
1999   c->maxallocrow  = a->maxallocrow;
2000   c->rlenmax      = a->rlenmax;
2001   c->nz           = a->nz;
2002   C->preallocated = PETSC_TRUE;
2003 
2004   c->nonzerorowcnt = a->nonzerorowcnt;
2005   C->nonzerostate  = A->nonzerostate;
2006 
2007   PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist));
2008   PetscFunctionReturn(0);
2009 }
2010 
2011 PetscErrorCode MatDuplicate_SeqSELL(Mat A, MatDuplicateOption cpvalues, Mat *B) {
2012   PetscFunctionBegin;
2013   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
2014   PetscCall(MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n));
2015   if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2016   PetscCall(MatSetType(*B, ((PetscObject)A)->type_name));
2017   PetscCall(MatDuplicateNoCreate_SeqSELL(*B, A, cpvalues, PETSC_TRUE));
2018   PetscFunctionReturn(0);
2019 }
2020 
2021 /*MC
2022    MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices,
2023    based on the sliced Ellpack format
2024 
2025    Options Database Keys:
2026 . -mat_type seqsell - sets the matrix type to "`MATSEQELL` during a call to `MatSetFromOptions()`
2027 
2028    Level: beginner
2029 
2030 .seealso: `MatCreateSeqSell()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ`
2031 M*/
2032 
2033 /*MC
2034    MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices.
2035 
2036    This matrix type is identical to `MATSEQSELL` when constructed with a single process communicator,
2037    and `MATMPISELL` otherwise.  As a result, for single process communicators,
2038   `MatSeqSELLSetPreallocation()` is supported, and similarly `MatMPISELLSetPreallocation()` is supported
2039   for communicators controlling multiple processes.  It is recommended that you call both of
2040   the above preallocation routines for simplicity.
2041 
2042    Options Database Keys:
2043 . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions()
2044 
2045   Level: beginner
2046 
2047   Notes:
2048    This format is only supported for real scalars, double precision, and 32 bit indices (the defaults).
2049 
2050    It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of
2051    non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement.
2052 
2053   Developer Notes:
2054    On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance.
2055 
2056    The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8
2057 .vb
2058                             (2 0  3 4)
2059    Consider the matrix A =  (5 0  6 0)
2060                             (0 0  7 8)
2061                             (0 0  9 9)
2062 
2063    symbolically the Ellpack format can be written as
2064 
2065         (2 3 4 |)           (0 2 3 |)
2066    v =  (5 6 0 |)  colidx = (0 2 2 |)
2067         --------            ---------
2068         (7 8 |)             (2 3 |)
2069         (9 9 |)             (2 3 |)
2070 
2071     The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case).
2072     Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and
2073     zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice.
2074 
2075     The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9)  and for colidx is (0 0 2 2 3 2 2 2 3 3)
2076 
2077 .ve
2078 
2079       See MatMult_SeqSELL() for how this format is used with the SIMD operations to achieve high performance.
2080 
2081  References:
2082 . * - Hong Zhang, Richard T. Mills, Karl Rupp, and Barry F. Smith, Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512},
2083    Proceedings of the 47th International Conference on Parallel Processing, 2018.
2084 
2085 .seealso: `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSell()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ`
2086 M*/
2087 
2088 /*@C
2089        MatCreateSeqSELL - Creates a sparse matrix in `MATSEQSELL` format.
2090 
2091  Collective on comm
2092 
2093  Input Parameters:
2094 +  comm - MPI communicator, set to `PETSC_COMM_SELF`
2095 .  m - number of rows
2096 .  n - number of columns
2097 .  rlenmax - maximum number of nonzeros in a row
2098 -  rlen - array containing the number of nonzeros in the various rows
2099  (possibly different for each row) or NULL
2100 
2101  Output Parameter:
2102 .  A - the matrix
2103 
2104  It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
2105  MatXXXXSetPreallocation() paradigm instead of this routine directly.
2106  [MatXXXXSetPreallocation() is, for example, `MatSeqSELLSetPreallocation()`]
2107 
2108  Notes:
2109  If nnz is given then nz is ignored
2110 
2111  Specify the preallocated storage with either rlenmax or rlen (not both).
2112  Set rlenmax = `PETSC_DEFAULT` and rlen = NULL for PETSc to control dynamic memory
2113  allocation.  For large problems you MUST preallocate memory or you
2114  will get TERRIBLE performance, see the users' manual chapter on matrices.
2115 
2116  Level: intermediate
2117 
2118  .seealso: `MATSEQSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATSEQSELL`, `MATMPISELL`
2119  @*/
2120 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt maxallocrow, const PetscInt rlen[], Mat *A) {
2121   PetscFunctionBegin;
2122   PetscCall(MatCreate(comm, A));
2123   PetscCall(MatSetSizes(*A, m, n, m, n));
2124   PetscCall(MatSetType(*A, MATSEQSELL));
2125   PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A, maxallocrow, rlen));
2126   PetscFunctionReturn(0);
2127 }
2128 
2129 PetscErrorCode MatEqual_SeqSELL(Mat A, Mat B, PetscBool *flg) {
2130   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data, *b = (Mat_SeqSELL *)B->data;
2131   PetscInt     totalslices = a->totalslices;
2132 
2133   PetscFunctionBegin;
2134   /* If the  matrix dimensions are not equal,or no of nonzeros */
2135   if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) || (a->nz != b->nz) || (a->rlenmax != b->rlenmax)) {
2136     *flg = PETSC_FALSE;
2137     PetscFunctionReturn(0);
2138   }
2139   /* if the a->colidx are the same */
2140   PetscCall(PetscArraycmp(a->colidx, b->colidx, a->sliidx[totalslices], flg));
2141   if (!*flg) PetscFunctionReturn(0);
2142   /* if a->val are the same */
2143   PetscCall(PetscArraycmp(a->val, b->val, a->sliidx[totalslices], flg));
2144   PetscFunctionReturn(0);
2145 }
2146 
2147 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A) {
2148   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2149 
2150   PetscFunctionBegin;
2151   a->idiagvalid = PETSC_FALSE;
2152   PetscFunctionReturn(0);
2153 }
2154 
2155 PetscErrorCode MatConjugate_SeqSELL(Mat A) {
2156 #if defined(PETSC_USE_COMPLEX)
2157   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2158   PetscInt     i;
2159   PetscScalar *val = a->val;
2160 
2161   PetscFunctionBegin;
2162   for (i = 0; i < a->sliidx[a->totalslices]; i++) val[i] = PetscConj(val[i]);
2163 #else
2164   PetscFunctionBegin;
2165 #endif
2166   PetscFunctionReturn(0);
2167 }
2168