xref: /petsc/src/mat/impls/sell/seq/sell.c (revision c77c71ff2d9eaa2c74538bf9bf94eff01b512dbf)
1 
2 /*
3   Defines the basic matrix operations for the SELL matrix storage format.
4 */
5 #include <../src/mat/impls/sell/seq/sell.h> /*I   "petscmat.h"  I*/
6 #include <petscblaslapack.h>
7 #include <petsc/private/kernels/blocktranspose.h>
8 
9 static PetscBool  cited      = PETSC_FALSE;
10 static const char citation[] = "@inproceedings{ZhangELLPACK2018,\n"
11                                " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n"
12                                " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n"
13                                " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n"
14                                " year = 2018\n"
15                                "}\n";
16 
17 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
18 
19   #include <immintrin.h>
20 
21   #if !defined(_MM_SCALE_8)
22     #define _MM_SCALE_8 8
23   #endif
24 
25   #if defined(__AVX512F__)
26     /* these do not work
27    vec_idx  = _mm512_loadunpackhi_epi32(vec_idx,acolidx);
28    vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval);
29   */
30     #define AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
31       /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \
32       vec_idx  = _mm256_loadu_si256((__m256i const *)acolidx); \
33       vec_vals = _mm512_loadu_pd(aval); \
34       vec_x    = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); \
35       vec_y    = _mm512_fmadd_pd(vec_x, vec_vals, vec_y)
36   #elif defined(__AVX2__) && defined(__FMA__)
37     #define AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
38       vec_vals = _mm256_loadu_pd(aval); \
39       vec_idx  = _mm_loadu_si128((__m128i const *)acolidx); /* SSE2 */ \
40       vec_x    = _mm256_i32gather_pd(x, vec_idx, _MM_SCALE_8); \
41       vec_y    = _mm256_fmadd_pd(vec_x, vec_vals, vec_y)
42   #endif
43 #endif /* PETSC_HAVE_IMMINTRIN_H */
44 
45 /*@C
46  MatSeqSELLSetPreallocation - For good matrix assembly performance
47  the user should preallocate the matrix storage by setting the parameter `nz`
48  (or the array `nnz`).
49 
50  Collective
51 
52  Input Parameters:
53 +  B - The `MATSEQSELL` matrix
54 .  rlenmax - number of nonzeros per row (same for all rows), ignored if `rlen` is provided
55 -  rlen - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
56 
57  Level: intermediate
58 
59  Notes:
60  Specify the preallocated storage with either `rlenmax` or `rlen` (not both).
61  Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory
62  allocation.
63 
64  You can call `MatGetInfo()` to get information on how effective the preallocation was;
65  for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
66  You can also run with the option `-info` and look for messages with the string
67  malloc in them to see if additional memory allocation was needed.
68 
69  Developer Note:
70  Use `rlenmax` of `MAT_SKIP_ALLOCATION` to not allocate any space for the matrix
71  entries or columns indices.
72 
73  The maximum number of nonzeos in any row should be as accurate as possible.
74  If it is underestimated, you will get bad performance due to reallocation
75  (`MatSeqXSELLReallocateSELL()`).
76 
77 .seealso: `Mat`, `MATSEQSELL`, `MATSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()`
78  @*/
79 PetscErrorCode MatSeqSELLSetPreallocation(Mat B, PetscInt rlenmax, const PetscInt rlen[])
80 {
81   PetscFunctionBegin;
82   PetscValidHeaderSpecific(B, MAT_CLASSID, 1);
83   PetscValidType(B, 1);
84   PetscTryMethod(B, "MatSeqSELLSetPreallocation_C", (Mat, PetscInt, const PetscInt[]), (B, rlenmax, rlen));
85   PetscFunctionReturn(PETSC_SUCCESS);
86 }
87 
88 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B, PetscInt maxallocrow, const PetscInt rlen[])
89 {
90   Mat_SeqSELL *b;
91   PetscInt     i, j, totalslices;
92   PetscBool    skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE;
93 
94   PetscFunctionBegin;
95   if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE;
96   if (maxallocrow == MAT_SKIP_ALLOCATION) {
97     skipallocation = PETSC_TRUE;
98     maxallocrow    = 0;
99   }
100 
101   PetscCall(PetscLayoutSetUp(B->rmap));
102   PetscCall(PetscLayoutSetUp(B->cmap));
103 
104   /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */
105   if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5;
106   PetscCheck(maxallocrow >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "maxallocrow cannot be less than 0: value %" PetscInt_FMT, maxallocrow);
107   if (rlen) {
108     for (i = 0; i < B->rmap->n; i++) {
109       PetscCheck(rlen[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, rlen[i]);
110       PetscCheck(rlen[i] <= B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT, i, rlen[i], B->cmap->n);
111     }
112   }
113 
114   B->preallocated = PETSC_TRUE;
115 
116   b = (Mat_SeqSELL *)B->data;
117 
118   totalslices    = PetscCeilInt(B->rmap->n, 8);
119   b->totalslices = totalslices;
120   if (!skipallocation) {
121     if (B->rmap->n & 0x07) PetscCall(PetscInfo(B, "Padding rows to the SEQSELL matrix because the number of rows is not the multiple of 8 (value %" PetscInt_FMT ")\n", B->rmap->n));
122 
123     if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */
124       PetscCall(PetscMalloc1(totalslices + 1, &b->sliidx));
125     }
126     if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */
127       if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10;
128       else if (maxallocrow < 0) maxallocrow = 1;
129       for (i = 0; i <= totalslices; i++) b->sliidx[i] = i * 8 * maxallocrow;
130     } else {
131       maxallocrow  = 0;
132       b->sliidx[0] = 0;
133       for (i = 1; i < totalslices; i++) {
134         b->sliidx[i] = 0;
135         for (j = 0; j < 8; j++) b->sliidx[i] = PetscMax(b->sliidx[i], rlen[8 * (i - 1) + j]);
136         maxallocrow = PetscMax(b->sliidx[i], maxallocrow);
137         PetscCall(PetscIntSumError(b->sliidx[i - 1], 8 * b->sliidx[i], &b->sliidx[i]));
138       }
139       /* last slice */
140       b->sliidx[totalslices] = 0;
141       for (j = (totalslices - 1) * 8; j < B->rmap->n; j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices], rlen[j]);
142       maxallocrow            = PetscMax(b->sliidx[totalslices], maxallocrow);
143       b->sliidx[totalslices] = b->sliidx[totalslices - 1] + 8 * b->sliidx[totalslices];
144     }
145 
146     /* allocate space for val, colidx, rlen */
147     /* FIXME: should B's old memory be unlogged? */
148     PetscCall(MatSeqXSELLFreeSELL(B, &b->val, &b->colidx));
149     /* FIXME: assuming an element of the bit array takes 8 bits */
150     PetscCall(PetscMalloc2(b->sliidx[totalslices], &b->val, b->sliidx[totalslices], &b->colidx));
151     /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */
152     PetscCall(PetscCalloc1(8 * totalslices, &b->rlen));
153 
154     b->singlemalloc = PETSC_TRUE;
155     b->free_val     = PETSC_TRUE;
156     b->free_colidx  = PETSC_TRUE;
157   } else {
158     b->free_val    = PETSC_FALSE;
159     b->free_colidx = PETSC_FALSE;
160   }
161 
162   b->nz               = 0;
163   b->maxallocrow      = maxallocrow;
164   b->rlenmax          = maxallocrow;
165   b->maxallocmat      = b->sliidx[totalslices];
166   B->info.nz_unneeded = (double)b->maxallocmat;
167   if (realalloc) PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));
168   PetscFunctionReturn(PETSC_SUCCESS);
169 }
170 
171 PetscErrorCode MatGetRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
172 {
173   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
174   PetscInt     shift;
175 
176   PetscFunctionBegin;
177   PetscCheck(row >= 0 && row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range", row);
178   if (nz) *nz = a->rlen[row];
179   shift = a->sliidx[row >> 3] + (row & 0x07);
180   if (!a->getrowcols) PetscCall(PetscMalloc2(a->rlenmax, &a->getrowcols, a->rlenmax, &a->getrowvals));
181   if (idx) {
182     PetscInt j;
183     for (j = 0; j < a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift + 8 * j];
184     *idx = a->getrowcols;
185   }
186   if (v) {
187     PetscInt j;
188     for (j = 0; j < a->rlen[row]; j++) a->getrowvals[j] = a->val[shift + 8 * j];
189     *v = a->getrowvals;
190   }
191   PetscFunctionReturn(PETSC_SUCCESS);
192 }
193 
194 PetscErrorCode MatRestoreRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
195 {
196   PetscFunctionBegin;
197   PetscFunctionReturn(PETSC_SUCCESS);
198 }
199 
200 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
201 {
202   Mat          B;
203   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
204   PetscInt     i;
205 
206   PetscFunctionBegin;
207   if (reuse == MAT_REUSE_MATRIX) {
208     B = *newmat;
209     PetscCall(MatZeroEntries(B));
210   } else {
211     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
212     PetscCall(MatSetSizes(B, A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N));
213     PetscCall(MatSetType(B, MATSEQAIJ));
214     PetscCall(MatSeqAIJSetPreallocation(B, 0, a->rlen));
215   }
216 
217   for (i = 0; i < A->rmap->n; i++) {
218     PetscInt     nz = 0, *cols = NULL;
219     PetscScalar *vals = NULL;
220 
221     PetscCall(MatGetRow_SeqSELL(A, i, &nz, &cols, &vals));
222     PetscCall(MatSetValues(B, 1, &i, nz, cols, vals, INSERT_VALUES));
223     PetscCall(MatRestoreRow_SeqSELL(A, i, &nz, &cols, &vals));
224   }
225 
226   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
227   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
228   B->rmap->bs = A->rmap->bs;
229 
230   if (reuse == MAT_INPLACE_MATRIX) {
231     PetscCall(MatHeaderReplace(A, &B));
232   } else {
233     *newmat = B;
234   }
235   PetscFunctionReturn(PETSC_SUCCESS);
236 }
237 
238 #include <../src/mat/impls/aij/seq/aij.h>
239 
240 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
241 {
242   Mat                B;
243   Mat_SeqAIJ        *a  = (Mat_SeqAIJ *)A->data;
244   PetscInt          *ai = a->i, m = A->rmap->N, n = A->cmap->N, i, *rowlengths, row, ncols;
245   const PetscInt    *cols;
246   const PetscScalar *vals;
247 
248   PetscFunctionBegin;
249 
250   if (reuse == MAT_REUSE_MATRIX) {
251     B = *newmat;
252   } else {
253     if (PetscDefined(USE_DEBUG) || !a->ilen) {
254       PetscCall(PetscMalloc1(m, &rowlengths));
255       for (i = 0; i < m; i++) rowlengths[i] = ai[i + 1] - ai[i];
256     }
257     if (PetscDefined(USE_DEBUG) && a->ilen) {
258       PetscBool eq;
259       PetscCall(PetscMemcmp(rowlengths, a->ilen, m * sizeof(PetscInt), &eq));
260       PetscCheck(eq, PETSC_COMM_SELF, PETSC_ERR_PLIB, "SeqAIJ ilen array incorrect");
261       PetscCall(PetscFree(rowlengths));
262       rowlengths = a->ilen;
263     } else if (a->ilen) rowlengths = a->ilen;
264     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
265     PetscCall(MatSetSizes(B, m, n, m, n));
266     PetscCall(MatSetType(B, MATSEQSELL));
267     PetscCall(MatSeqSELLSetPreallocation(B, 0, rowlengths));
268     if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths));
269   }
270 
271   for (row = 0; row < m; row++) {
272     PetscCall(MatGetRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
273     PetscCall(MatSetValues_SeqSELL(B, 1, &row, ncols, cols, vals, INSERT_VALUES));
274     PetscCall(MatRestoreRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
275   }
276   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
277   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
278   B->rmap->bs = A->rmap->bs;
279 
280   if (reuse == MAT_INPLACE_MATRIX) {
281     PetscCall(MatHeaderReplace(A, &B));
282   } else {
283     *newmat = B;
284   }
285   PetscFunctionReturn(PETSC_SUCCESS);
286 }
287 
288 PetscErrorCode MatMult_SeqSELL(Mat A, Vec xx, Vec yy)
289 {
290   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
291   PetscScalar       *y;
292   const PetscScalar *x;
293   const MatScalar   *aval        = a->val;
294   PetscInt           totalslices = a->totalslices;
295   const PetscInt    *acolidx     = a->colidx;
296   PetscInt           i, j;
297 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
298   __m512d  vec_x, vec_y, vec_vals;
299   __m256i  vec_idx;
300   __mmask8 mask;
301   __m512d  vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
302   __m256i  vec_idx2, vec_idx3, vec_idx4;
303 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
304   __m128i   vec_idx;
305   __m256d   vec_x, vec_y, vec_y2, vec_vals;
306   MatScalar yval;
307   PetscInt  r, rows_left, row, nnz_in_row;
308 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
309   __m128d   vec_x_tmp;
310   __m256d   vec_x, vec_y, vec_y2, vec_vals;
311   MatScalar yval;
312   PetscInt  r, rows_left, row, nnz_in_row;
313 #else
314   PetscScalar sum[8];
315 #endif
316 
317 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
318   #pragma disjoint(*x, *y, *aval)
319 #endif
320 
321   PetscFunctionBegin;
322   PetscCall(VecGetArrayRead(xx, &x));
323   PetscCall(VecGetArray(yy, &y));
324 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
325   for (i = 0; i < totalslices; i++) { /* loop over slices */
326     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
327     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
328 
329     vec_y  = _mm512_setzero_pd();
330     vec_y2 = _mm512_setzero_pd();
331     vec_y3 = _mm512_setzero_pd();
332     vec_y4 = _mm512_setzero_pd();
333 
334     j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */
335     switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
336     case 3:
337       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
338       acolidx += 8;
339       aval += 8;
340       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
341       acolidx += 8;
342       aval += 8;
343       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
344       acolidx += 8;
345       aval += 8;
346       j += 3;
347       break;
348     case 2:
349       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
350       acolidx += 8;
351       aval += 8;
352       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
353       acolidx += 8;
354       aval += 8;
355       j += 2;
356       break;
357     case 1:
358       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
359       acolidx += 8;
360       aval += 8;
361       j += 1;
362       break;
363     }
364   #pragma novector
365     for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
366       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
367       acolidx += 8;
368       aval += 8;
369       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
370       acolidx += 8;
371       aval += 8;
372       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
373       acolidx += 8;
374       aval += 8;
375       AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
376       acolidx += 8;
377       aval += 8;
378     }
379 
380     vec_y = _mm512_add_pd(vec_y, vec_y2);
381     vec_y = _mm512_add_pd(vec_y, vec_y3);
382     vec_y = _mm512_add_pd(vec_y, vec_y4);
383     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
384       mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
385       _mm512_mask_storeu_pd(&y[8 * i], mask, vec_y);
386     } else {
387       _mm512_storeu_pd(&y[8 * i], vec_y);
388     }
389   }
390 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
391   for (i = 0; i < totalslices; i++) { /* loop over full slices */
392     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
393     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
394 
395     /* last slice may have padding rows. Don't use vectorization. */
396     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
397       rows_left = A->rmap->n - 8 * i;
398       for (r = 0; r < rows_left; ++r) {
399         yval       = (MatScalar)0;
400         row        = 8 * i + r;
401         nnz_in_row = a->rlen[row];
402         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
403         y[row] = yval;
404       }
405       break;
406     }
407 
408     vec_y  = _mm256_setzero_pd();
409     vec_y2 = _mm256_setzero_pd();
410 
411   /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
412   #pragma novector
413   #pragma unroll(2)
414     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
415       AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
416       aval += 4;
417       acolidx += 4;
418       AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y2);
419       aval += 4;
420       acolidx += 4;
421     }
422 
423     _mm256_storeu_pd(y + i * 8, vec_y);
424     _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
425   }
426 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
427   for (i = 0; i < totalslices; i++) { /* loop over full slices */
428     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
429     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
430 
431     vec_y  = _mm256_setzero_pd();
432     vec_y2 = _mm256_setzero_pd();
433 
434     /* last slice may have padding rows. Don't use vectorization. */
435     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
436       rows_left = A->rmap->n - 8 * i;
437       for (r = 0; r < rows_left; ++r) {
438         yval       = (MatScalar)0;
439         row        = 8 * i + r;
440         nnz_in_row = a->rlen[row];
441         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
442         y[row] = yval;
443       }
444       break;
445     }
446 
447   /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
448   #pragma novector
449   #pragma unroll(2)
450     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
451       vec_vals  = _mm256_loadu_pd(aval);
452       vec_x_tmp = _mm_setzero_pd();
453       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
454       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
455       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
456       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
457       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
458       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
459       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
460       aval += 4;
461 
462       vec_vals  = _mm256_loadu_pd(aval);
463       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
464       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
465       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
466       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
467       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
468       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
469       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
470       aval += 4;
471     }
472 
473     _mm256_storeu_pd(y + i * 8, vec_y);
474     _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
475   }
476 #else
477   for (i = 0; i < totalslices; i++) { /* loop over slices */
478     for (j = 0; j < 8; j++) sum[j] = 0.0;
479     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
480       sum[0] += aval[j] * x[acolidx[j]];
481       sum[1] += aval[j + 1] * x[acolidx[j + 1]];
482       sum[2] += aval[j + 2] * x[acolidx[j + 2]];
483       sum[3] += aval[j + 3] * x[acolidx[j + 3]];
484       sum[4] += aval[j + 4] * x[acolidx[j + 4]];
485       sum[5] += aval[j + 5] * x[acolidx[j + 5]];
486       sum[6] += aval[j + 6] * x[acolidx[j + 6]];
487       sum[7] += aval[j + 7] * x[acolidx[j + 7]];
488     }
489     if (i == totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
490       for (j = 0; j < (A->rmap->n & 0x07); j++) y[8 * i + j] = sum[j];
491     } else {
492       for (j = 0; j < 8; j++) y[8 * i + j] = sum[j];
493     }
494   }
495 #endif
496 
497   PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); /* theoretical minimal FLOPs */
498   PetscCall(VecRestoreArrayRead(xx, &x));
499   PetscCall(VecRestoreArray(yy, &y));
500   PetscFunctionReturn(PETSC_SUCCESS);
501 }
502 
503 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h>
504 PetscErrorCode MatMultAdd_SeqSELL(Mat A, Vec xx, Vec yy, Vec zz)
505 {
506   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
507   PetscScalar       *y, *z;
508   const PetscScalar *x;
509   const MatScalar   *aval        = a->val;
510   PetscInt           totalslices = a->totalslices;
511   const PetscInt    *acolidx     = a->colidx;
512   PetscInt           i, j;
513 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
514   __m512d  vec_x, vec_y, vec_vals;
515   __m256i  vec_idx;
516   __mmask8 mask;
517   __m512d  vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
518   __m256i  vec_idx2, vec_idx3, vec_idx4;
519 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
520   __m128d   vec_x_tmp;
521   __m256d   vec_x, vec_y, vec_y2, vec_vals;
522   MatScalar yval;
523   PetscInt  r, row, nnz_in_row;
524 #else
525   PetscScalar sum[8];
526 #endif
527 
528 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
529   #pragma disjoint(*x, *y, *aval)
530 #endif
531 
532   PetscFunctionBegin;
533   PetscCall(VecGetArrayRead(xx, &x));
534   PetscCall(VecGetArrayPair(yy, zz, &y, &z));
535 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
536   for (i = 0; i < totalslices; i++) { /* loop over slices */
537     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
538     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
539 
540     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
541       mask  = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
542       vec_y = _mm512_mask_loadu_pd(vec_y, mask, &y[8 * i]);
543     } else {
544       vec_y = _mm512_loadu_pd(&y[8 * i]);
545     }
546     vec_y2 = _mm512_setzero_pd();
547     vec_y3 = _mm512_setzero_pd();
548     vec_y4 = _mm512_setzero_pd();
549 
550     j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */
551     switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
552     case 3:
553       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
554       acolidx += 8;
555       aval += 8;
556       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
557       acolidx += 8;
558       aval += 8;
559       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
560       acolidx += 8;
561       aval += 8;
562       j += 3;
563       break;
564     case 2:
565       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
566       acolidx += 8;
567       aval += 8;
568       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
569       acolidx += 8;
570       aval += 8;
571       j += 2;
572       break;
573     case 1:
574       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
575       acolidx += 8;
576       aval += 8;
577       j += 1;
578       break;
579     }
580   #pragma novector
581     for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
582       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
583       acolidx += 8;
584       aval += 8;
585       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
586       acolidx += 8;
587       aval += 8;
588       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
589       acolidx += 8;
590       aval += 8;
591       AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
592       acolidx += 8;
593       aval += 8;
594     }
595 
596     vec_y = _mm512_add_pd(vec_y, vec_y2);
597     vec_y = _mm512_add_pd(vec_y, vec_y3);
598     vec_y = _mm512_add_pd(vec_y, vec_y4);
599     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
600       _mm512_mask_storeu_pd(&z[8 * i], mask, vec_y);
601     } else {
602       _mm512_storeu_pd(&z[8 * i], vec_y);
603     }
604   }
605 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
606   for (i = 0; i < totalslices; i++) { /* loop over full slices */
607     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
608     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
609 
610     /* last slice may have padding rows. Don't use vectorization. */
611     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
612       for (r = 0; r < (A->rmap->n & 0x07); ++r) {
613         row        = 8 * i + r;
614         yval       = (MatScalar)0.0;
615         nnz_in_row = a->rlen[row];
616         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
617         z[row] = y[row] + yval;
618       }
619       break;
620     }
621 
622     vec_y  = _mm256_loadu_pd(y + 8 * i);
623     vec_y2 = _mm256_loadu_pd(y + 8 * i + 4);
624 
625     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
626     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
627       vec_vals  = _mm256_loadu_pd(aval);
628       vec_x_tmp = _mm_setzero_pd();
629       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
630       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
631       vec_x     = _mm256_setzero_pd();
632       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
633       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
634       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
635       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
636       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
637       aval += 4;
638 
639       vec_vals  = _mm256_loadu_pd(aval);
640       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
641       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
642       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
643       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
644       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
645       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
646       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
647       aval += 4;
648     }
649 
650     _mm256_storeu_pd(z + i * 8, vec_y);
651     _mm256_storeu_pd(z + i * 8 + 4, vec_y2);
652   }
653 #else
654   for (i = 0; i < totalslices; i++) { /* loop over slices */
655     for (j = 0; j < 8; j++) sum[j] = 0.0;
656     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
657       sum[0] += aval[j] * x[acolidx[j]];
658       sum[1] += aval[j + 1] * x[acolidx[j + 1]];
659       sum[2] += aval[j + 2] * x[acolidx[j + 2]];
660       sum[3] += aval[j + 3] * x[acolidx[j + 3]];
661       sum[4] += aval[j + 4] * x[acolidx[j + 4]];
662       sum[5] += aval[j + 5] * x[acolidx[j + 5]];
663       sum[6] += aval[j + 6] * x[acolidx[j + 6]];
664       sum[7] += aval[j + 7] * x[acolidx[j + 7]];
665     }
666     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
667       for (j = 0; j < (A->rmap->n & 0x07); j++) z[8 * i + j] = y[8 * i + j] + sum[j];
668     } else {
669       for (j = 0; j < 8; j++) z[8 * i + j] = y[8 * i + j] + sum[j];
670     }
671   }
672 #endif
673 
674   PetscCall(PetscLogFlops(2.0 * a->nz));
675   PetscCall(VecRestoreArrayRead(xx, &x));
676   PetscCall(VecRestoreArrayPair(yy, zz, &y, &z));
677   PetscFunctionReturn(PETSC_SUCCESS);
678 }
679 
680 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A, Vec xx, Vec zz, Vec yy)
681 {
682   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
683   PetscScalar       *y;
684   const PetscScalar *x;
685   const MatScalar   *aval    = a->val;
686   const PetscInt    *acolidx = a->colidx;
687   PetscInt           i, j, r, row, nnz_in_row, totalslices = a->totalslices;
688 
689 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
690   #pragma disjoint(*x, *y, *aval)
691 #endif
692 
693   PetscFunctionBegin;
694   if (A->symmetric == PETSC_BOOL3_TRUE) {
695     PetscCall(MatMultAdd_SeqSELL(A, xx, zz, yy));
696     PetscFunctionReturn(PETSC_SUCCESS);
697   }
698   if (zz != yy) PetscCall(VecCopy(zz, yy));
699   PetscCall(VecGetArrayRead(xx, &x));
700   PetscCall(VecGetArray(yy, &y));
701   for (i = 0; i < a->totalslices; i++) { /* loop over slices */
702     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
703       for (r = 0; r < (A->rmap->n & 0x07); ++r) {
704         row        = 8 * i + r;
705         nnz_in_row = a->rlen[row];
706         for (j = 0; j < nnz_in_row; ++j) y[acolidx[8 * j + r]] += aval[8 * j + r] * x[row];
707       }
708       break;
709     }
710     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
711       y[acolidx[j]] += aval[j] * x[8 * i];
712       y[acolidx[j + 1]] += aval[j + 1] * x[8 * i + 1];
713       y[acolidx[j + 2]] += aval[j + 2] * x[8 * i + 2];
714       y[acolidx[j + 3]] += aval[j + 3] * x[8 * i + 3];
715       y[acolidx[j + 4]] += aval[j + 4] * x[8 * i + 4];
716       y[acolidx[j + 5]] += aval[j + 5] * x[8 * i + 5];
717       y[acolidx[j + 6]] += aval[j + 6] * x[8 * i + 6];
718       y[acolidx[j + 7]] += aval[j + 7] * x[8 * i + 7];
719     }
720   }
721   PetscCall(PetscLogFlops(2.0 * a->sliidx[a->totalslices]));
722   PetscCall(VecRestoreArrayRead(xx, &x));
723   PetscCall(VecRestoreArray(yy, &y));
724   PetscFunctionReturn(PETSC_SUCCESS);
725 }
726 
727 PetscErrorCode MatMultTranspose_SeqSELL(Mat A, Vec xx, Vec yy)
728 {
729   PetscFunctionBegin;
730   if (A->symmetric == PETSC_BOOL3_TRUE) {
731     PetscCall(MatMult_SeqSELL(A, xx, yy));
732   } else {
733     PetscCall(VecSet(yy, 0.0));
734     PetscCall(MatMultTransposeAdd_SeqSELL(A, xx, yy, yy));
735   }
736   PetscFunctionReturn(PETSC_SUCCESS);
737 }
738 
739 /*
740      Checks for missing diagonals
741 */
742 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A, PetscBool *missing, PetscInt *d)
743 {
744   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
745   PetscInt    *diag, i;
746 
747   PetscFunctionBegin;
748   *missing = PETSC_FALSE;
749   if (A->rmap->n > 0 && !(a->colidx)) {
750     *missing = PETSC_TRUE;
751     if (d) *d = 0;
752     PetscCall(PetscInfo(A, "Matrix has no entries therefore is missing diagonal\n"));
753   } else {
754     diag = a->diag;
755     for (i = 0; i < A->rmap->n; i++) {
756       if (diag[i] == -1) {
757         *missing = PETSC_TRUE;
758         if (d) *d = i;
759         PetscCall(PetscInfo(A, "Matrix is missing diagonal number %" PetscInt_FMT "\n", i));
760         break;
761       }
762     }
763   }
764   PetscFunctionReturn(PETSC_SUCCESS);
765 }
766 
767 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A)
768 {
769   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
770   PetscInt     i, j, m = A->rmap->n, shift;
771 
772   PetscFunctionBegin;
773   if (!a->diag) {
774     PetscCall(PetscMalloc1(m, &a->diag));
775     a->free_diag = PETSC_TRUE;
776   }
777   for (i = 0; i < m; i++) {                      /* loop over rows */
778     shift      = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
779     a->diag[i] = -1;
780     for (j = 0; j < a->rlen[i]; j++) {
781       if (a->colidx[shift + j * 8] == i) {
782         a->diag[i] = shift + j * 8;
783         break;
784       }
785     }
786   }
787   PetscFunctionReturn(PETSC_SUCCESS);
788 }
789 
790 /*
791   Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways
792 */
793 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A, PetscScalar omega, PetscScalar fshift)
794 {
795   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
796   PetscInt     i, *diag, m = A->rmap->n;
797   MatScalar   *val = a->val;
798   PetscScalar *idiag, *mdiag;
799 
800   PetscFunctionBegin;
801   if (a->idiagvalid) PetscFunctionReturn(PETSC_SUCCESS);
802   PetscCall(MatMarkDiagonal_SeqSELL(A));
803   diag = a->diag;
804   if (!a->idiag) {
805     PetscCall(PetscMalloc3(m, &a->idiag, m, &a->mdiag, m, &a->ssor_work));
806     val = a->val;
807   }
808   mdiag = a->mdiag;
809   idiag = a->idiag;
810 
811   if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) {
812     for (i = 0; i < m; i++) {
813       mdiag[i] = val[diag[i]];
814       if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */
815         PetscCheck(PetscRealPart(fshift), PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Zero diagonal on row %" PetscInt_FMT, i);
816         PetscCall(PetscInfo(A, "Zero diagonal on row %" PetscInt_FMT "\n", i));
817         A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
818         A->factorerror_zeropivot_value = 0.0;
819         A->factorerror_zeropivot_row   = i;
820       }
821       idiag[i] = 1.0 / val[diag[i]];
822     }
823     PetscCall(PetscLogFlops(m));
824   } else {
825     for (i = 0; i < m; i++) {
826       mdiag[i] = val[diag[i]];
827       idiag[i] = omega / (fshift + val[diag[i]]);
828     }
829     PetscCall(PetscLogFlops(2.0 * m));
830   }
831   a->idiagvalid = PETSC_TRUE;
832   PetscFunctionReturn(PETSC_SUCCESS);
833 }
834 
835 PetscErrorCode MatZeroEntries_SeqSELL(Mat A)
836 {
837   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
838 
839   PetscFunctionBegin;
840   PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices]));
841   PetscCall(MatSeqSELLInvalidateDiagonal(A));
842   PetscFunctionReturn(PETSC_SUCCESS);
843 }
844 
845 PetscErrorCode MatDestroy_SeqSELL(Mat A)
846 {
847   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
848 
849   PetscFunctionBegin;
850 #if defined(PETSC_USE_LOG)
851   PetscCall(PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->n, A->cmap->n, a->nz));
852 #endif
853   PetscCall(MatSeqXSELLFreeSELL(A, &a->val, &a->colidx));
854   PetscCall(ISDestroy(&a->row));
855   PetscCall(ISDestroy(&a->col));
856   PetscCall(PetscFree(a->diag));
857   PetscCall(PetscFree(a->rlen));
858   PetscCall(PetscFree(a->sliidx));
859   PetscCall(PetscFree3(a->idiag, a->mdiag, a->ssor_work));
860   PetscCall(PetscFree(a->solve_work));
861   PetscCall(ISDestroy(&a->icol));
862   PetscCall(PetscFree(a->saved_values));
863   PetscCall(PetscFree2(a->getrowcols, a->getrowvals));
864 
865   PetscCall(PetscFree(A->data));
866 
867   PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL));
868   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL));
869   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL));
870   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetPreallocation_C", NULL));
871   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetArray_C", NULL));
872   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLRestoreArray_C", NULL));
873   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqaij_C", NULL));
874   PetscFunctionReturn(PETSC_SUCCESS);
875 }
876 
877 PetscErrorCode MatSetOption_SeqSELL(Mat A, MatOption op, PetscBool flg)
878 {
879   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
880 
881   PetscFunctionBegin;
882   switch (op) {
883   case MAT_ROW_ORIENTED:
884     a->roworiented = flg;
885     break;
886   case MAT_KEEP_NONZERO_PATTERN:
887     a->keepnonzeropattern = flg;
888     break;
889   case MAT_NEW_NONZERO_LOCATIONS:
890     a->nonew = (flg ? 0 : 1);
891     break;
892   case MAT_NEW_NONZERO_LOCATION_ERR:
893     a->nonew = (flg ? -1 : 0);
894     break;
895   case MAT_NEW_NONZERO_ALLOCATION_ERR:
896     a->nonew = (flg ? -2 : 0);
897     break;
898   case MAT_UNUSED_NONZERO_LOCATION_ERR:
899     a->nounused = (flg ? -1 : 0);
900     break;
901   case MAT_FORCE_DIAGONAL_ENTRIES:
902   case MAT_IGNORE_OFF_PROC_ENTRIES:
903   case MAT_USE_HASH_TABLE:
904   case MAT_SORTED_FULL:
905     PetscCall(PetscInfo(A, "Option %s ignored\n", MatOptions[op]));
906     break;
907   case MAT_SPD:
908   case MAT_SYMMETRIC:
909   case MAT_STRUCTURALLY_SYMMETRIC:
910   case MAT_HERMITIAN:
911   case MAT_SYMMETRY_ETERNAL:
912   case MAT_STRUCTURAL_SYMMETRY_ETERNAL:
913   case MAT_SPD_ETERNAL:
914     /* These options are handled directly by MatSetOption() */
915     break;
916   default:
917     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unknown option %d", op);
918   }
919   PetscFunctionReturn(PETSC_SUCCESS);
920 }
921 
922 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A, Vec v)
923 {
924   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
925   PetscInt     i, j, n, shift;
926   PetscScalar *x, zero = 0.0;
927 
928   PetscFunctionBegin;
929   PetscCall(VecGetLocalSize(v, &n));
930   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
931 
932   if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) {
933     PetscInt *diag = a->diag;
934     PetscCall(VecGetArray(v, &x));
935     for (i = 0; i < n; i++) x[i] = 1.0 / a->val[diag[i]];
936     PetscCall(VecRestoreArray(v, &x));
937     PetscFunctionReturn(PETSC_SUCCESS);
938   }
939 
940   PetscCall(VecSet(v, zero));
941   PetscCall(VecGetArray(v, &x));
942   for (i = 0; i < n; i++) {                 /* loop over rows */
943     shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
944     x[i]  = 0;
945     for (j = 0; j < a->rlen[i]; j++) {
946       if (a->colidx[shift + j * 8] == i) {
947         x[i] = a->val[shift + j * 8];
948         break;
949       }
950     }
951   }
952   PetscCall(VecRestoreArray(v, &x));
953   PetscFunctionReturn(PETSC_SUCCESS);
954 }
955 
956 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A, Vec ll, Vec rr)
957 {
958   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
959   const PetscScalar *l, *r;
960   PetscInt           i, j, m, n, row;
961 
962   PetscFunctionBegin;
963   if (ll) {
964     /* The local size is used so that VecMPI can be passed to this routine
965        by MatDiagonalScale_MPISELL */
966     PetscCall(VecGetLocalSize(ll, &m));
967     PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
968     PetscCall(VecGetArrayRead(ll, &l));
969     for (i = 0; i < a->totalslices; i++) {                  /* loop over slices */
970       if (i == a->totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
971         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) {
972           if (row < (A->rmap->n & 0x07)) a->val[j] *= l[8 * i + row];
973         }
974       } else {
975         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) a->val[j] *= l[8 * i + row];
976       }
977     }
978     PetscCall(VecRestoreArrayRead(ll, &l));
979     PetscCall(PetscLogFlops(a->nz));
980   }
981   if (rr) {
982     PetscCall(VecGetLocalSize(rr, &n));
983     PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
984     PetscCall(VecGetArrayRead(rr, &r));
985     for (i = 0; i < a->totalslices; i++) {                  /* loop over slices */
986       if (i == a->totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
987         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) {
988           if (row < (A->rmap->n & 0x07)) a->val[j] *= r[a->colidx[j]];
989         }
990       } else {
991         for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j++) a->val[j] *= r[a->colidx[j]];
992       }
993     }
994     PetscCall(VecRestoreArrayRead(rr, &r));
995     PetscCall(PetscLogFlops(a->nz));
996   }
997   PetscCall(MatSeqSELLInvalidateDiagonal(A));
998   PetscFunctionReturn(PETSC_SUCCESS);
999 }
1000 
1001 PetscErrorCode MatGetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[])
1002 {
1003   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1004   PetscInt    *cp, i, k, low, high, t, row, col, l;
1005   PetscInt     shift;
1006   MatScalar   *vp;
1007 
1008   PetscFunctionBegin;
1009   for (k = 0; k < m; k++) { /* loop over requested rows */
1010     row = im[k];
1011     if (row < 0) continue;
1012     PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
1013     shift = a->sliidx[row >> 3] + (row & 0x07); /* starting index of the row */
1014     cp    = a->colidx + shift;                  /* pointer to the row */
1015     vp    = a->val + shift;                     /* pointer to the row */
1016     for (l = 0; l < n; l++) {                   /* loop over requested columns */
1017       col = in[l];
1018       if (col < 0) continue;
1019       PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
1020       high = a->rlen[row];
1021       low  = 0; /* assume unsorted */
1022       while (high - low > 5) {
1023         t = (low + high) / 2;
1024         if (*(cp + t * 8) > col) high = t;
1025         else low = t;
1026       }
1027       for (i = low; i < high; i++) {
1028         if (*(cp + 8 * i) > col) break;
1029         if (*(cp + 8 * i) == col) {
1030           *v++ = *(vp + 8 * i);
1031           goto finished;
1032         }
1033       }
1034       *v++ = 0.0;
1035     finished:;
1036     }
1037   }
1038   PetscFunctionReturn(PETSC_SUCCESS);
1039 }
1040 
1041 PetscErrorCode MatView_SeqSELL_ASCII(Mat A, PetscViewer viewer)
1042 {
1043   Mat_SeqSELL      *a = (Mat_SeqSELL *)A->data;
1044   PetscInt          i, j, m = A->rmap->n, shift;
1045   const char       *name;
1046   PetscViewerFormat format;
1047 
1048   PetscFunctionBegin;
1049   PetscCall(PetscViewerGetFormat(viewer, &format));
1050   if (format == PETSC_VIEWER_ASCII_MATLAB) {
1051     PetscInt nofinalvalue = 0;
1052     /*
1053     if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) {
1054       nofinalvalue = 1;
1055     }
1056     */
1057     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1058     PetscCall(PetscViewerASCIIPrintf(viewer, "%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n", m, A->cmap->n));
1059     PetscCall(PetscViewerASCIIPrintf(viewer, "%% Nonzeros = %" PetscInt_FMT " \n", a->nz));
1060 #if defined(PETSC_USE_COMPLEX)
1061     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",4);\n", a->nz + nofinalvalue));
1062 #else
1063     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",3);\n", a->nz + nofinalvalue));
1064 #endif
1065     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = [\n"));
1066 
1067     for (i = 0; i < m; i++) {
1068       shift = a->sliidx[i >> 3] + (i & 0x07);
1069       for (j = 0; j < a->rlen[i]; j++) {
1070 #if defined(PETSC_USE_COMPLEX)
1071         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n", i + 1, a->colidx[shift + 8 * j] + 1, (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j])));
1072 #else
1073         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n", i + 1, a->colidx[shift + 8 * j] + 1, (double)a->val[shift + 8 * j]));
1074 #endif
1075       }
1076     }
1077     /*
1078     if (nofinalvalue) {
1079 #if defined(PETSC_USE_COMPLEX)
1080       PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n",m,A->cmap->n,0.,0.));
1081 #else
1082       PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n",m,A->cmap->n,0.0));
1083 #endif
1084     }
1085     */
1086     PetscCall(PetscObjectGetName((PetscObject)A, &name));
1087     PetscCall(PetscViewerASCIIPrintf(viewer, "];\n %s = spconvert(zzz);\n", name));
1088     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1089   } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) {
1090     PetscFunctionReturn(PETSC_SUCCESS);
1091   } else if (format == PETSC_VIEWER_ASCII_COMMON) {
1092     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1093     for (i = 0; i < m; i++) {
1094       PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1095       shift = a->sliidx[i >> 3] + (i & 0x07);
1096       for (j = 0; j < a->rlen[i]; j++) {
1097 #if defined(PETSC_USE_COMPLEX)
1098         if (PetscImaginaryPart(a->val[shift + 8 * j]) > 0.0 && PetscRealPart(a->val[shift + 8 * j]) != 0.0) {
1099           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j])));
1100         } else if (PetscImaginaryPart(a->val[shift + 8 * j]) < 0.0 && PetscRealPart(a->val[shift + 8 * j]) != 0.0) {
1101           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)-PetscImaginaryPart(a->val[shift + 8 * j])));
1102         } else if (PetscRealPart(a->val[shift + 8 * j]) != 0.0) {
1103           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j])));
1104         }
1105 #else
1106         if (a->val[shift + 8 * j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)a->val[shift + 8 * j]));
1107 #endif
1108       }
1109       PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1110     }
1111     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1112   } else if (format == PETSC_VIEWER_ASCII_DENSE) {
1113     PetscInt    cnt = 0, jcnt;
1114     PetscScalar value;
1115 #if defined(PETSC_USE_COMPLEX)
1116     PetscBool realonly = PETSC_TRUE;
1117     for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1118       if (PetscImaginaryPart(a->val[i]) != 0.0) {
1119         realonly = PETSC_FALSE;
1120         break;
1121       }
1122     }
1123 #endif
1124 
1125     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1126     for (i = 0; i < m; i++) {
1127       jcnt  = 0;
1128       shift = a->sliidx[i >> 3] + (i & 0x07);
1129       for (j = 0; j < A->cmap->n; j++) {
1130         if (jcnt < a->rlen[i] && j == a->colidx[shift + 8 * j]) {
1131           value = a->val[cnt++];
1132           jcnt++;
1133         } else {
1134           value = 0.0;
1135         }
1136 #if defined(PETSC_USE_COMPLEX)
1137         if (realonly) {
1138           PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)PetscRealPart(value)));
1139         } else {
1140           PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e+%7.5e i ", (double)PetscRealPart(value), (double)PetscImaginaryPart(value)));
1141         }
1142 #else
1143         PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)value));
1144 #endif
1145       }
1146       PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1147     }
1148     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1149   } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) {
1150     PetscInt fshift = 1;
1151     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1152 #if defined(PETSC_USE_COMPLEX)
1153     PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate complex general\n"));
1154 #else
1155     PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate real general\n"));
1156 #endif
1157     PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz));
1158     for (i = 0; i < m; i++) {
1159       shift = a->sliidx[i >> 3] + (i & 0x07);
1160       for (j = 0; j < a->rlen[i]; j++) {
1161 #if defined(PETSC_USE_COMPLEX)
1162         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n", i + fshift, a->colidx[shift + 8 * j] + fshift, (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j])));
1163 #else
1164         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g\n", i + fshift, a->colidx[shift + 8 * j] + fshift, (double)a->val[shift + 8 * j]));
1165 #endif
1166       }
1167     }
1168     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1169   } else if (format == PETSC_VIEWER_NATIVE) {
1170     for (i = 0; i < a->totalslices; i++) { /* loop over slices */
1171       PetscInt row;
1172       PetscCall(PetscViewerASCIIPrintf(viewer, "slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n", i, a->sliidx[i], a->sliidx[i + 1]));
1173       for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) {
1174 #if defined(PETSC_USE_COMPLEX)
1175         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1176           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1177         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1178           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), -(double)PetscImaginaryPart(a->val[j])));
1179         } else {
1180           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j])));
1181         }
1182 #else
1183         PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g\n", 8 * i + row, a->colidx[j], (double)a->val[j]));
1184 #endif
1185       }
1186     }
1187   } else {
1188     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1189     if (A->factortype) {
1190       for (i = 0; i < m; i++) {
1191         shift = a->sliidx[i >> 3] + (i & 0x07);
1192         PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1193         /* L part */
1194         for (j = shift; j < a->diag[i]; j += 8) {
1195 #if defined(PETSC_USE_COMPLEX)
1196           if (PetscImaginaryPart(a->val[shift + 8 * j]) > 0.0) {
1197             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1198           } else if (PetscImaginaryPart(a->val[shift + 8 * j]) < 0.0) {
1199             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j]))));
1200           } else {
1201             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j])));
1202           }
1203 #else
1204           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]));
1205 #endif
1206         }
1207         /* diagonal */
1208         j = a->diag[i];
1209 #if defined(PETSC_USE_COMPLEX)
1210         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1211           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)PetscImaginaryPart(1.0 / a->val[j])));
1212         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1213           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)(-PetscImaginaryPart(1.0 / a->val[j]))));
1214         } else {
1215           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j])));
1216         }
1217 #else
1218         PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)(1.0 / a->val[j])));
1219 #endif
1220 
1221         /* U part */
1222         for (j = a->diag[i] + 1; j < shift + 8 * a->rlen[i]; j += 8) {
1223 #if defined(PETSC_USE_COMPLEX)
1224           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1225             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1226           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1227             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j]))));
1228           } else {
1229             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j])));
1230           }
1231 #else
1232           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]));
1233 #endif
1234         }
1235         PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1236       }
1237     } else {
1238       for (i = 0; i < m; i++) {
1239         shift = a->sliidx[i >> 3] + (i & 0x07);
1240         PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1241         for (j = 0; j < a->rlen[i]; j++) {
1242 #if defined(PETSC_USE_COMPLEX)
1243           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1244             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j])));
1245           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1246             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)-PetscImaginaryPart(a->val[shift + 8 * j])));
1247           } else {
1248             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j])));
1249           }
1250 #else
1251           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)a->val[shift + 8 * j]));
1252 #endif
1253         }
1254         PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1255       }
1256     }
1257     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1258   }
1259   PetscCall(PetscViewerFlush(viewer));
1260   PetscFunctionReturn(PETSC_SUCCESS);
1261 }
1262 
1263 #include <petscdraw.h>
1264 PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw, void *Aa)
1265 {
1266   Mat               A = (Mat)Aa;
1267   Mat_SeqSELL      *a = (Mat_SeqSELL *)A->data;
1268   PetscInt          i, j, m = A->rmap->n, shift;
1269   int               color;
1270   PetscReal         xl, yl, xr, yr, x_l, x_r, y_l, y_r;
1271   PetscViewer       viewer;
1272   PetscViewerFormat format;
1273 
1274   PetscFunctionBegin;
1275   PetscCall(PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer));
1276   PetscCall(PetscViewerGetFormat(viewer, &format));
1277   PetscCall(PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr));
1278 
1279   /* loop over matrix elements drawing boxes */
1280 
1281   if (format != PETSC_VIEWER_DRAW_CONTOUR) {
1282     PetscDrawCollectiveBegin(draw);
1283     /* Blue for negative, Cyan for zero and  Red for positive */
1284     color = PETSC_DRAW_BLUE;
1285     for (i = 0; i < m; i++) {
1286       shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1287       y_l   = m - i - 1.0;
1288       y_r   = y_l + 1.0;
1289       for (j = 0; j < a->rlen[i]; j++) {
1290         x_l = a->colidx[shift + j * 8];
1291         x_r = x_l + 1.0;
1292         if (PetscRealPart(a->val[shift + 8 * j]) >= 0.) continue;
1293         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1294       }
1295     }
1296     color = PETSC_DRAW_CYAN;
1297     for (i = 0; i < m; i++) {
1298       shift = a->sliidx[i >> 3] + (i & 0x07);
1299       y_l   = m - i - 1.0;
1300       y_r   = y_l + 1.0;
1301       for (j = 0; j < a->rlen[i]; j++) {
1302         x_l = a->colidx[shift + j * 8];
1303         x_r = x_l + 1.0;
1304         if (a->val[shift + 8 * j] != 0.) continue;
1305         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1306       }
1307     }
1308     color = PETSC_DRAW_RED;
1309     for (i = 0; i < m; i++) {
1310       shift = a->sliidx[i >> 3] + (i & 0x07);
1311       y_l   = m - i - 1.0;
1312       y_r   = y_l + 1.0;
1313       for (j = 0; j < a->rlen[i]; j++) {
1314         x_l = a->colidx[shift + j * 8];
1315         x_r = x_l + 1.0;
1316         if (PetscRealPart(a->val[shift + 8 * j]) <= 0.) continue;
1317         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1318       }
1319     }
1320     PetscDrawCollectiveEnd(draw);
1321   } else {
1322     /* use contour shading to indicate magnitude of values */
1323     /* first determine max of all nonzero values */
1324     PetscReal minv = 0.0, maxv = 0.0;
1325     PetscInt  count = 0;
1326     PetscDraw popup;
1327     for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1328       if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]);
1329     }
1330     if (minv >= maxv) maxv = minv + PETSC_SMALL;
1331     PetscCall(PetscDrawGetPopup(draw, &popup));
1332     PetscCall(PetscDrawScalePopup(popup, minv, maxv));
1333 
1334     PetscDrawCollectiveBegin(draw);
1335     for (i = 0; i < m; i++) {
1336       shift = a->sliidx[i >> 3] + (i & 0x07);
1337       y_l   = m - i - 1.0;
1338       y_r   = y_l + 1.0;
1339       for (j = 0; j < a->rlen[i]; j++) {
1340         x_l   = a->colidx[shift + j * 8];
1341         x_r   = x_l + 1.0;
1342         color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]), minv, maxv);
1343         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1344         count++;
1345       }
1346     }
1347     PetscDrawCollectiveEnd(draw);
1348   }
1349   PetscFunctionReturn(PETSC_SUCCESS);
1350 }
1351 
1352 #include <petscdraw.h>
1353 PetscErrorCode MatView_SeqSELL_Draw(Mat A, PetscViewer viewer)
1354 {
1355   PetscDraw draw;
1356   PetscReal xr, yr, xl, yl, h, w;
1357   PetscBool isnull;
1358 
1359   PetscFunctionBegin;
1360   PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw));
1361   PetscCall(PetscDrawIsNull(draw, &isnull));
1362   if (isnull) PetscFunctionReturn(PETSC_SUCCESS);
1363 
1364   xr = A->cmap->n;
1365   yr = A->rmap->n;
1366   h  = yr / 10.0;
1367   w  = xr / 10.0;
1368   xr += w;
1369   yr += h;
1370   xl = -w;
1371   yl = -h;
1372   PetscCall(PetscDrawSetCoordinates(draw, xl, yl, xr, yr));
1373   PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer));
1374   PetscCall(PetscDrawZoom(draw, MatView_SeqSELL_Draw_Zoom, A));
1375   PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL));
1376   PetscCall(PetscDrawSave(draw));
1377   PetscFunctionReturn(PETSC_SUCCESS);
1378 }
1379 
1380 PetscErrorCode MatView_SeqSELL(Mat A, PetscViewer viewer)
1381 {
1382   PetscBool iascii, isbinary, isdraw;
1383 
1384   PetscFunctionBegin;
1385   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
1386   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
1387   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
1388   if (iascii) {
1389     PetscCall(MatView_SeqSELL_ASCII(A, viewer));
1390   } else if (isbinary) {
1391     /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */
1392   } else if (isdraw) PetscCall(MatView_SeqSELL_Draw(A, viewer));
1393   PetscFunctionReturn(PETSC_SUCCESS);
1394 }
1395 
1396 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A, MatAssemblyType mode)
1397 {
1398   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1399   PetscInt     i, shift, row_in_slice, row, nrow, *cp, lastcol, j, k;
1400   MatScalar   *vp;
1401 
1402   PetscFunctionBegin;
1403   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
1404   /* To do: compress out the unused elements */
1405   PetscCall(MatMarkDiagonal_SeqSELL(A));
1406   PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n", A->rmap->n, A->cmap->n, a->maxallocmat, a->sliidx[a->totalslices], a->nz, a->sliidx[a->totalslices] - a->nz));
1407   PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n", a->reallocs));
1408   PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rlenmax));
1409   /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */
1410   for (i = 0; i < a->totalslices; ++i) {
1411     shift = a->sliidx[i];                                      /* starting index of the slice */
1412     cp    = a->colidx + shift;                                 /* pointer to the column indices of the slice */
1413     vp    = a->val + shift;                                    /* pointer to the nonzero values of the slice */
1414     for (row_in_slice = 0; row_in_slice < 8; ++row_in_slice) { /* loop over rows in the slice */
1415       row  = 8 * i + row_in_slice;
1416       nrow = a->rlen[row]; /* number of nonzeros in row */
1417       /*
1418         Search for the nearest nonzero. Normally setting the index to zero may cause extra communication.
1419         But if the entire slice are empty, it is fine to use 0 since the index will not be loaded.
1420       */
1421       lastcol = 0;
1422       if (nrow > 0) {                                /* nonempty row */
1423         lastcol = cp[8 * (nrow - 1) + row_in_slice]; /* use the index from the last nonzero at current row */
1424       } else if (!row_in_slice) {                    /* first row of the correct slice is empty */
1425         for (j = 1; j < 8; j++) {
1426           if (a->rlen[8 * i + j]) {
1427             lastcol = cp[j];
1428             break;
1429           }
1430         }
1431       } else {
1432         if (a->sliidx[i + 1] != shift) lastcol = cp[row_in_slice - 1]; /* use the index from the previous row */
1433       }
1434 
1435       for (k = nrow; k < (a->sliidx[i + 1] - shift) / 8; ++k) {
1436         cp[8 * k + row_in_slice] = lastcol;
1437         vp[8 * k + row_in_slice] = (MatScalar)0;
1438       }
1439     }
1440   }
1441 
1442   A->info.mallocs += a->reallocs;
1443   a->reallocs = 0;
1444 
1445   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1446   PetscFunctionReturn(PETSC_SUCCESS);
1447 }
1448 
1449 PetscErrorCode MatGetInfo_SeqSELL(Mat A, MatInfoType flag, MatInfo *info)
1450 {
1451   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1452 
1453   PetscFunctionBegin;
1454   info->block_size   = 1.0;
1455   info->nz_allocated = a->maxallocmat;
1456   info->nz_used      = a->sliidx[a->totalslices]; /* include padding zeros */
1457   info->nz_unneeded  = (a->maxallocmat - a->sliidx[a->totalslices]);
1458   info->assemblies   = A->num_ass;
1459   info->mallocs      = A->info.mallocs;
1460   info->memory       = 0; /* REVIEW ME */
1461   if (A->factortype) {
1462     info->fill_ratio_given  = A->info.fill_ratio_given;
1463     info->fill_ratio_needed = A->info.fill_ratio_needed;
1464     info->factor_mallocs    = A->info.factor_mallocs;
1465   } else {
1466     info->fill_ratio_given  = 0;
1467     info->fill_ratio_needed = 0;
1468     info->factor_mallocs    = 0;
1469   }
1470   PetscFunctionReturn(PETSC_SUCCESS);
1471 }
1472 
1473 PetscErrorCode MatSetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is)
1474 {
1475   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1476   PetscInt     shift, i, k, l, low, high, t, ii, row, col, nrow;
1477   PetscInt    *cp, nonew = a->nonew, lastcol = -1;
1478   MatScalar   *vp, value;
1479 
1480   PetscFunctionBegin;
1481   for (k = 0; k < m; k++) { /* loop over added rows */
1482     row = im[k];
1483     if (row < 0) continue;
1484     PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
1485     shift = a->sliidx[row >> 3] + (row & 0x07); /* starting index of the row */
1486     cp    = a->colidx + shift;                  /* pointer to the row */
1487     vp    = a->val + shift;                     /* pointer to the row */
1488     nrow  = a->rlen[row];
1489     low   = 0;
1490     high  = nrow;
1491 
1492     for (l = 0; l < n; l++) { /* loop over added columns */
1493       col = in[l];
1494       if (col < 0) continue;
1495       PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
1496       if (a->roworiented) {
1497         value = v[l + k * n];
1498       } else {
1499         value = v[k + l * m];
1500       }
1501       if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue;
1502 
1503       /* search in this row for the specified column, i indicates the column to be set */
1504       if (col <= lastcol) low = 0;
1505       else high = nrow;
1506       lastcol = col;
1507       while (high - low > 5) {
1508         t = (low + high) / 2;
1509         if (*(cp + t * 8) > col) high = t;
1510         else low = t;
1511       }
1512       for (i = low; i < high; i++) {
1513         if (*(cp + i * 8) > col) break;
1514         if (*(cp + i * 8) == col) {
1515           if (is == ADD_VALUES) *(vp + i * 8) += value;
1516           else *(vp + i * 8) = value;
1517           low = i + 1;
1518           goto noinsert;
1519         }
1520       }
1521       if (value == 0.0 && a->ignorezeroentries) goto noinsert;
1522       if (nonew == 1) goto noinsert;
1523       PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col);
1524       /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */
1525       MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, row / 8, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar);
1526       /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */
1527       for (ii = nrow - 1; ii >= i; ii--) {
1528         *(cp + (ii + 1) * 8) = *(cp + ii * 8);
1529         *(vp + (ii + 1) * 8) = *(vp + ii * 8);
1530       }
1531       a->rlen[row]++;
1532       *(cp + i * 8) = col;
1533       *(vp + i * 8) = value;
1534       a->nz++;
1535       A->nonzerostate++;
1536       low = i + 1;
1537       high++;
1538       nrow++;
1539     noinsert:;
1540     }
1541     a->rlen[row] = nrow;
1542   }
1543   PetscFunctionReturn(PETSC_SUCCESS);
1544 }
1545 
1546 PetscErrorCode MatCopy_SeqSELL(Mat A, Mat B, MatStructure str)
1547 {
1548   PetscFunctionBegin;
1549   /* If the two matrices have the same copy implementation, use fast copy. */
1550   if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
1551     Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1552     Mat_SeqSELL *b = (Mat_SeqSELL *)B->data;
1553 
1554     PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices], PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Number of nonzeros in two matrices are different");
1555     PetscCall(PetscArraycpy(b->val, a->val, a->sliidx[a->totalslices]));
1556   } else {
1557     PetscCall(MatCopy_Basic(A, B, str));
1558   }
1559   PetscFunctionReturn(PETSC_SUCCESS);
1560 }
1561 
1562 PetscErrorCode MatSetUp_SeqSELL(Mat A)
1563 {
1564   PetscFunctionBegin;
1565   PetscCall(MatSeqSELLSetPreallocation(A, PETSC_DEFAULT, NULL));
1566   PetscFunctionReturn(PETSC_SUCCESS);
1567 }
1568 
1569 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A, PetscScalar *array[])
1570 {
1571   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1572 
1573   PetscFunctionBegin;
1574   *array = a->val;
1575   PetscFunctionReturn(PETSC_SUCCESS);
1576 }
1577 
1578 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A, PetscScalar *array[])
1579 {
1580   PetscFunctionBegin;
1581   PetscFunctionReturn(PETSC_SUCCESS);
1582 }
1583 
1584 PetscErrorCode MatRealPart_SeqSELL(Mat A)
1585 {
1586   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1587   PetscInt     i;
1588   MatScalar   *aval = a->val;
1589 
1590   PetscFunctionBegin;
1591   for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscRealPart(aval[i]);
1592   PetscFunctionReturn(PETSC_SUCCESS);
1593 }
1594 
1595 PetscErrorCode MatImaginaryPart_SeqSELL(Mat A)
1596 {
1597   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1598   PetscInt     i;
1599   MatScalar   *aval = a->val;
1600 
1601   PetscFunctionBegin;
1602   for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]);
1603   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1604   PetscFunctionReturn(PETSC_SUCCESS);
1605 }
1606 
1607 PetscErrorCode MatScale_SeqSELL(Mat inA, PetscScalar alpha)
1608 {
1609   Mat_SeqSELL *a      = (Mat_SeqSELL *)inA->data;
1610   MatScalar   *aval   = a->val;
1611   PetscScalar  oalpha = alpha;
1612   PetscBLASInt one    = 1, size;
1613 
1614   PetscFunctionBegin;
1615   PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices], &size));
1616   PetscCallBLAS("BLASscal", BLASscal_(&size, &oalpha, aval, &one));
1617   PetscCall(PetscLogFlops(a->nz));
1618   PetscCall(MatSeqSELLInvalidateDiagonal(inA));
1619   PetscFunctionReturn(PETSC_SUCCESS);
1620 }
1621 
1622 PetscErrorCode MatShift_SeqSELL(Mat Y, PetscScalar a)
1623 {
1624   Mat_SeqSELL *y = (Mat_SeqSELL *)Y->data;
1625 
1626   PetscFunctionBegin;
1627   if (!Y->preallocated || !y->nz) PetscCall(MatSeqSELLSetPreallocation(Y, 1, NULL));
1628   PetscCall(MatShift_Basic(Y, a));
1629   PetscFunctionReturn(PETSC_SUCCESS);
1630 }
1631 
1632 PetscErrorCode MatSOR_SeqSELL(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
1633 {
1634   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
1635   PetscScalar       *x, sum, *t;
1636   const MatScalar   *idiag = NULL, *mdiag;
1637   const PetscScalar *b, *xb;
1638   PetscInt           n, m = A->rmap->n, i, j, shift;
1639   const PetscInt    *diag;
1640 
1641   PetscFunctionBegin;
1642   its = its * lits;
1643 
1644   if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */
1645   if (!a->idiagvalid) PetscCall(MatInvertDiagonal_SeqSELL(A, omega, fshift));
1646   a->fshift = fshift;
1647   a->omega  = omega;
1648 
1649   diag  = a->diag;
1650   t     = a->ssor_work;
1651   idiag = a->idiag;
1652   mdiag = a->mdiag;
1653 
1654   PetscCall(VecGetArray(xx, &x));
1655   PetscCall(VecGetArrayRead(bb, &b));
1656   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1657   PetscCheck(flag != SOR_APPLY_UPPER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_UPPER is not implemented");
1658   PetscCheck(flag != SOR_APPLY_LOWER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_LOWER is not implemented");
1659   PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat");
1660 
1661   if (flag & SOR_ZERO_INITIAL_GUESS) {
1662     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1663       for (i = 0; i < m; i++) {
1664         shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1665         sum   = b[i];
1666         n     = (diag[i] - shift) / 8;
1667         for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]];
1668         t[i] = sum;
1669         x[i] = sum * idiag[i];
1670       }
1671       xb = t;
1672       PetscCall(PetscLogFlops(a->nz));
1673     } else xb = b;
1674     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1675       for (i = m - 1; i >= 0; i--) {
1676         shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1677         sum   = xb[i];
1678         n     = a->rlen[i] - (diag[i] - shift) / 8 - 1;
1679         for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]];
1680         if (xb == b) {
1681           x[i] = sum * idiag[i];
1682         } else {
1683           x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1684         }
1685       }
1686       PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1687     }
1688     its--;
1689   }
1690   while (its--) {
1691     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1692       for (i = 0; i < m; i++) {
1693         /* lower */
1694         shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1695         sum   = b[i];
1696         n     = (diag[i] - shift) / 8;
1697         for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]];
1698         t[i] = sum; /* save application of the lower-triangular part */
1699         /* upper */
1700         n = a->rlen[i] - (diag[i] - shift) / 8 - 1;
1701         for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]];
1702         x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1703       }
1704       xb = t;
1705       PetscCall(PetscLogFlops(2.0 * a->nz));
1706     } else xb = b;
1707     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1708       for (i = m - 1; i >= 0; i--) {
1709         shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1710         sum   = xb[i];
1711         if (xb == b) {
1712           /* whole matrix (no checkpointing available) */
1713           n = a->rlen[i];
1714           for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]];
1715           x[i] = (1. - omega) * x[i] + (sum + mdiag[i] * x[i]) * idiag[i];
1716         } else { /* lower-triangular part has been saved, so only apply upper-triangular */
1717           n = a->rlen[i] - (diag[i] - shift) / 8 - 1;
1718           for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]];
1719           x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1720         }
1721       }
1722       if (xb == b) {
1723         PetscCall(PetscLogFlops(2.0 * a->nz));
1724       } else {
1725         PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1726       }
1727     }
1728   }
1729   PetscCall(VecRestoreArray(xx, &x));
1730   PetscCall(VecRestoreArrayRead(bb, &b));
1731   PetscFunctionReturn(PETSC_SUCCESS);
1732 }
1733 
1734 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL,
1735                                        MatGetRow_SeqSELL,
1736                                        MatRestoreRow_SeqSELL,
1737                                        MatMult_SeqSELL,
1738                                        /* 4*/ MatMultAdd_SeqSELL,
1739                                        MatMultTranspose_SeqSELL,
1740                                        MatMultTransposeAdd_SeqSELL,
1741                                        NULL,
1742                                        NULL,
1743                                        NULL,
1744                                        /* 10*/ NULL,
1745                                        NULL,
1746                                        NULL,
1747                                        MatSOR_SeqSELL,
1748                                        NULL,
1749                                        /* 15*/ MatGetInfo_SeqSELL,
1750                                        MatEqual_SeqSELL,
1751                                        MatGetDiagonal_SeqSELL,
1752                                        MatDiagonalScale_SeqSELL,
1753                                        NULL,
1754                                        /* 20*/ NULL,
1755                                        MatAssemblyEnd_SeqSELL,
1756                                        MatSetOption_SeqSELL,
1757                                        MatZeroEntries_SeqSELL,
1758                                        /* 24*/ NULL,
1759                                        NULL,
1760                                        NULL,
1761                                        NULL,
1762                                        NULL,
1763                                        /* 29*/ MatSetUp_SeqSELL,
1764                                        NULL,
1765                                        NULL,
1766                                        NULL,
1767                                        NULL,
1768                                        /* 34*/ MatDuplicate_SeqSELL,
1769                                        NULL,
1770                                        NULL,
1771                                        NULL,
1772                                        NULL,
1773                                        /* 39*/ NULL,
1774                                        NULL,
1775                                        NULL,
1776                                        MatGetValues_SeqSELL,
1777                                        MatCopy_SeqSELL,
1778                                        /* 44*/ NULL,
1779                                        MatScale_SeqSELL,
1780                                        MatShift_SeqSELL,
1781                                        NULL,
1782                                        NULL,
1783                                        /* 49*/ NULL,
1784                                        NULL,
1785                                        NULL,
1786                                        NULL,
1787                                        NULL,
1788                                        /* 54*/ MatFDColoringCreate_SeqXAIJ,
1789                                        NULL,
1790                                        NULL,
1791                                        NULL,
1792                                        NULL,
1793                                        /* 59*/ NULL,
1794                                        MatDestroy_SeqSELL,
1795                                        MatView_SeqSELL,
1796                                        NULL,
1797                                        NULL,
1798                                        /* 64*/ NULL,
1799                                        NULL,
1800                                        NULL,
1801                                        NULL,
1802                                        NULL,
1803                                        /* 69*/ NULL,
1804                                        NULL,
1805                                        NULL,
1806                                        NULL,
1807                                        NULL,
1808                                        /* 74*/ NULL,
1809                                        MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */
1810                                        NULL,
1811                                        NULL,
1812                                        NULL,
1813                                        /* 79*/ NULL,
1814                                        NULL,
1815                                        NULL,
1816                                        NULL,
1817                                        NULL,
1818                                        /* 84*/ NULL,
1819                                        NULL,
1820                                        NULL,
1821                                        NULL,
1822                                        NULL,
1823                                        /* 89*/ NULL,
1824                                        NULL,
1825                                        NULL,
1826                                        NULL,
1827                                        NULL,
1828                                        /* 94*/ NULL,
1829                                        NULL,
1830                                        NULL,
1831                                        NULL,
1832                                        NULL,
1833                                        /* 99*/ NULL,
1834                                        NULL,
1835                                        NULL,
1836                                        MatConjugate_SeqSELL,
1837                                        NULL,
1838                                        /*104*/ NULL,
1839                                        NULL,
1840                                        NULL,
1841                                        NULL,
1842                                        NULL,
1843                                        /*109*/ NULL,
1844                                        NULL,
1845                                        NULL,
1846                                        NULL,
1847                                        MatMissingDiagonal_SeqSELL,
1848                                        /*114*/ NULL,
1849                                        NULL,
1850                                        NULL,
1851                                        NULL,
1852                                        NULL,
1853                                        /*119*/ NULL,
1854                                        NULL,
1855                                        NULL,
1856                                        NULL,
1857                                        NULL,
1858                                        /*124*/ NULL,
1859                                        NULL,
1860                                        NULL,
1861                                        NULL,
1862                                        NULL,
1863                                        /*129*/ NULL,
1864                                        NULL,
1865                                        NULL,
1866                                        NULL,
1867                                        NULL,
1868                                        /*134*/ NULL,
1869                                        NULL,
1870                                        NULL,
1871                                        NULL,
1872                                        NULL,
1873                                        /*139*/ NULL,
1874                                        NULL,
1875                                        NULL,
1876                                        MatFDColoringSetUp_SeqXAIJ,
1877                                        NULL,
1878                                        /*144*/ NULL,
1879                                        NULL,
1880                                        NULL,
1881                                        NULL,
1882                                        NULL,
1883                                        NULL,
1884                                        /*150*/ NULL,
1885                                        NULL};
1886 
1887 PetscErrorCode MatStoreValues_SeqSELL(Mat mat)
1888 {
1889   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1890 
1891   PetscFunctionBegin;
1892   PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1893 
1894   /* allocate space for values if not already there */
1895   if (!a->saved_values) PetscCall(PetscMalloc1(a->sliidx[a->totalslices] + 1, &a->saved_values));
1896 
1897   /* copy values over */
1898   PetscCall(PetscArraycpy(a->saved_values, a->val, a->sliidx[a->totalslices]));
1899   PetscFunctionReturn(PETSC_SUCCESS);
1900 }
1901 
1902 PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat)
1903 {
1904   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1905 
1906   PetscFunctionBegin;
1907   PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1908   PetscCheck(a->saved_values, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatStoreValues(A);first");
1909   PetscCall(PetscArraycpy(a->val, a->saved_values, a->sliidx[a->totalslices]));
1910   PetscFunctionReturn(PETSC_SUCCESS);
1911 }
1912 
1913 /*@C
1914  MatSeqSELLRestoreArray - returns access to the array where the data for a `MATSEQSELL` matrix is stored obtained by `MatSeqSELLGetArray()`
1915 
1916  Not Collective
1917 
1918  Input Parameters:
1919 +  mat - a `MATSEQSELL` matrix
1920 -  array - pointer to the data
1921 
1922  Level: intermediate
1923 
1924 .seealso: `Mat`, `MATSEQSELL`, `MatSeqSELLGetArray()`, `MatSeqSELLRestoreArrayF90()`
1925  @*/
1926 PetscErrorCode MatSeqSELLRestoreArray(Mat A, PetscScalar **array)
1927 {
1928   PetscFunctionBegin;
1929   PetscUseMethod(A, "MatSeqSELLRestoreArray_C", (Mat, PetscScalar **), (A, array));
1930   PetscFunctionReturn(PETSC_SUCCESS);
1931 }
1932 
1933 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B)
1934 {
1935   Mat_SeqSELL *b;
1936   PetscMPIInt  size;
1937 
1938   PetscFunctionBegin;
1939   PetscCall(PetscCitationsRegister(citation, &cited));
1940   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
1941   PetscCheck(size <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Comm must be of size 1");
1942 
1943   PetscCall(PetscNew(&b));
1944 
1945   B->data = (void *)b;
1946 
1947   PetscCall(PetscMemcpy(B->ops, &MatOps_Values, sizeof(struct _MatOps)));
1948 
1949   b->row                = NULL;
1950   b->col                = NULL;
1951   b->icol               = NULL;
1952   b->reallocs           = 0;
1953   b->ignorezeroentries  = PETSC_FALSE;
1954   b->roworiented        = PETSC_TRUE;
1955   b->nonew              = 0;
1956   b->diag               = NULL;
1957   b->solve_work         = NULL;
1958   B->spptr              = NULL;
1959   b->saved_values       = NULL;
1960   b->idiag              = NULL;
1961   b->mdiag              = NULL;
1962   b->ssor_work          = NULL;
1963   b->omega              = 1.0;
1964   b->fshift             = 0.0;
1965   b->idiagvalid         = PETSC_FALSE;
1966   b->keepnonzeropattern = PETSC_FALSE;
1967 
1968   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELL));
1969   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetArray_C", MatSeqSELLGetArray_SeqSELL));
1970   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLRestoreArray_C", MatSeqSELLRestoreArray_SeqSELL));
1971   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqSELL));
1972   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqSELL));
1973   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetPreallocation_C", MatSeqSELLSetPreallocation_SeqSELL));
1974   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqaij_C", MatConvert_SeqSELL_SeqAIJ));
1975   PetscFunctionReturn(PETSC_SUCCESS);
1976 }
1977 
1978 /*
1979  Given a matrix generated with MatGetFactor() duplicates all the information in A into B
1980  */
1981 PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace)
1982 {
1983   Mat_SeqSELL *c = (Mat_SeqSELL *)C->data, *a = (Mat_SeqSELL *)A->data;
1984   PetscInt     i, m                           = A->rmap->n;
1985   PetscInt     totalslices = a->totalslices;
1986 
1987   PetscFunctionBegin;
1988   C->factortype = A->factortype;
1989   c->row        = NULL;
1990   c->col        = NULL;
1991   c->icol       = NULL;
1992   c->reallocs   = 0;
1993   C->assembled  = PETSC_TRUE;
1994 
1995   PetscCall(PetscLayoutReference(A->rmap, &C->rmap));
1996   PetscCall(PetscLayoutReference(A->cmap, &C->cmap));
1997 
1998   PetscCall(PetscMalloc1(8 * totalslices, &c->rlen));
1999   PetscCall(PetscMalloc1(totalslices + 1, &c->sliidx));
2000 
2001   for (i = 0; i < m; i++) c->rlen[i] = a->rlen[i];
2002   for (i = 0; i < totalslices + 1; i++) c->sliidx[i] = a->sliidx[i];
2003 
2004   /* allocate the matrix space */
2005   if (mallocmatspace) {
2006     PetscCall(PetscMalloc2(a->maxallocmat, &c->val, a->maxallocmat, &c->colidx));
2007 
2008     c->singlemalloc = PETSC_TRUE;
2009 
2010     if (m > 0) {
2011       PetscCall(PetscArraycpy(c->colidx, a->colidx, a->maxallocmat));
2012       if (cpvalues == MAT_COPY_VALUES) {
2013         PetscCall(PetscArraycpy(c->val, a->val, a->maxallocmat));
2014       } else {
2015         PetscCall(PetscArrayzero(c->val, a->maxallocmat));
2016       }
2017     }
2018   }
2019 
2020   c->ignorezeroentries = a->ignorezeroentries;
2021   c->roworiented       = a->roworiented;
2022   c->nonew             = a->nonew;
2023   if (a->diag) {
2024     PetscCall(PetscMalloc1(m, &c->diag));
2025     for (i = 0; i < m; i++) c->diag[i] = a->diag[i];
2026   } else c->diag = NULL;
2027 
2028   c->solve_work         = NULL;
2029   c->saved_values       = NULL;
2030   c->idiag              = NULL;
2031   c->ssor_work          = NULL;
2032   c->keepnonzeropattern = a->keepnonzeropattern;
2033   c->free_val           = PETSC_TRUE;
2034   c->free_colidx        = PETSC_TRUE;
2035 
2036   c->maxallocmat  = a->maxallocmat;
2037   c->maxallocrow  = a->maxallocrow;
2038   c->rlenmax      = a->rlenmax;
2039   c->nz           = a->nz;
2040   C->preallocated = PETSC_TRUE;
2041 
2042   c->nonzerorowcnt = a->nonzerorowcnt;
2043   C->nonzerostate  = A->nonzerostate;
2044 
2045   PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist));
2046   PetscFunctionReturn(PETSC_SUCCESS);
2047 }
2048 
2049 PetscErrorCode MatDuplicate_SeqSELL(Mat A, MatDuplicateOption cpvalues, Mat *B)
2050 {
2051   PetscFunctionBegin;
2052   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
2053   PetscCall(MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n));
2054   if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2055   PetscCall(MatSetType(*B, ((PetscObject)A)->type_name));
2056   PetscCall(MatDuplicateNoCreate_SeqSELL(*B, A, cpvalues, PETSC_TRUE));
2057   PetscFunctionReturn(PETSC_SUCCESS);
2058 }
2059 
2060 /*MC
2061    MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices,
2062    based on the sliced Ellpack format
2063 
2064    Options Database Key:
2065 . -mat_type seqsell - sets the matrix type to "`MATSEQELL` during a call to `MatSetFromOptions()`
2066 
2067    Level: beginner
2068 
2069 .seealso: `Mat`, `MatCreateSeqSell()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ`
2070 M*/
2071 
2072 /*MC
2073    MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices.
2074 
2075    This matrix type is identical to `MATSEQSELL` when constructed with a single process communicator,
2076    and `MATMPISELL` otherwise.  As a result, for single process communicators,
2077   `MatSeqSELLSetPreallocation()` is supported, and similarly `MatMPISELLSetPreallocation()` is supported
2078   for communicators controlling multiple processes.  It is recommended that you call both of
2079   the above preallocation routines for simplicity.
2080 
2081    Options Database Key:
2082 . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions()
2083 
2084   Level: beginner
2085 
2086   Notes:
2087    This format is only supported for real scalars, double precision, and 32 bit indices (the defaults).
2088 
2089    It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of
2090    non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement.
2091 
2092   Developer Notes:
2093    On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance.
2094 
2095    The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8
2096 .vb
2097                             (2 0  3 4)
2098    Consider the matrix A =  (5 0  6 0)
2099                             (0 0  7 8)
2100                             (0 0  9 9)
2101 
2102    symbolically the Ellpack format can be written as
2103 
2104         (2 3 4 |)           (0 2 3 |)
2105    v =  (5 6 0 |)  colidx = (0 2 2 |)
2106         --------            ---------
2107         (7 8 |)             (2 3 |)
2108         (9 9 |)             (2 3 |)
2109 
2110     The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case).
2111     Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and
2112     zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice.
2113 
2114     The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9)  and for colidx is (0 0 2 2 3 2 2 2 3 3)
2115 
2116 .ve
2117 
2118       See MatMult_SeqSELL() for how this format is used with the SIMD operations to achieve high performance.
2119 
2120  References:
2121 . * - Hong Zhang, Richard T. Mills, Karl Rupp, and Barry F. Smith, Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512},
2122    Proceedings of the 47th International Conference on Parallel Processing, 2018.
2123 
2124 .seealso: `Mat`, `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSell()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ`
2125 M*/
2126 
2127 /*@C
2128        MatCreateSeqSELL - Creates a sparse matrix in `MATSEQSELL` format.
2129 
2130  Collective
2131 
2132  Input Parameters:
2133 +  comm - MPI communicator, set to `PETSC_COMM_SELF`
2134 .  m - number of rows
2135 .  n - number of columns
2136 .  rlenmax - maximum number of nonzeros in a row, ignored if `rlen` is provided
2137 -  rlen - array containing the number of nonzeros in the various rows (possibly different for each row) or NULL
2138 
2139  Output Parameter:
2140 .  A - the matrix
2141 
2142  Level: intermediate
2143 
2144  Notes:
2145  It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
2146  MatXXXXSetPreallocation() paradigm instead of this routine directly.
2147  [MatXXXXSetPreallocation() is, for example, `MatSeqSELLSetPreallocation()`]
2148 
2149  Specify the preallocated storage with either `rlenmax` or `rlen` (not both).
2150  Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory
2151  allocation.
2152 
2153 .seealso: `Mat`, `MATSEQSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATSEQSELL`, `MATMPISELL`
2154  @*/
2155 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt rlenmax, const PetscInt rlen[], Mat *A)
2156 {
2157   PetscFunctionBegin;
2158   PetscCall(MatCreate(comm, A));
2159   PetscCall(MatSetSizes(*A, m, n, m, n));
2160   PetscCall(MatSetType(*A, MATSEQSELL));
2161   PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A, rlenmax, rlen));
2162   PetscFunctionReturn(PETSC_SUCCESS);
2163 }
2164 
2165 PetscErrorCode MatEqual_SeqSELL(Mat A, Mat B, PetscBool *flg)
2166 {
2167   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data, *b = (Mat_SeqSELL *)B->data;
2168   PetscInt     totalslices = a->totalslices;
2169 
2170   PetscFunctionBegin;
2171   /* If the  matrix dimensions are not equal,or no of nonzeros */
2172   if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) || (a->nz != b->nz) || (a->rlenmax != b->rlenmax)) {
2173     *flg = PETSC_FALSE;
2174     PetscFunctionReturn(PETSC_SUCCESS);
2175   }
2176   /* if the a->colidx are the same */
2177   PetscCall(PetscArraycmp(a->colidx, b->colidx, a->sliidx[totalslices], flg));
2178   if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
2179   /* if a->val are the same */
2180   PetscCall(PetscArraycmp(a->val, b->val, a->sliidx[totalslices], flg));
2181   PetscFunctionReturn(PETSC_SUCCESS);
2182 }
2183 
2184 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A)
2185 {
2186   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2187 
2188   PetscFunctionBegin;
2189   a->idiagvalid = PETSC_FALSE;
2190   PetscFunctionReturn(PETSC_SUCCESS);
2191 }
2192 
2193 PetscErrorCode MatConjugate_SeqSELL(Mat A)
2194 {
2195 #if defined(PETSC_USE_COMPLEX)
2196   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2197   PetscInt     i;
2198   PetscScalar *val = a->val;
2199 
2200   PetscFunctionBegin;
2201   for (i = 0; i < a->sliidx[a->totalslices]; i++) val[i] = PetscConj(val[i]);
2202 #else
2203   PetscFunctionBegin;
2204 #endif
2205   PetscFunctionReturn(PETSC_SUCCESS);
2206 }
2207