xref: /petsc/src/mat/impls/sell/seq/sell.c (revision b8d709ab44c354dba7591c0191c0984ef3bb0cd5)
1 
2 /*
3   Defines the basic matrix operations for the SELL matrix storage format.
4 */
5 #include <../src/mat/impls/sell/seq/sell.h> /*I   "petscmat.h"  I*/
6 #include <petscblaslapack.h>
7 #include <petsc/private/kernels/blocktranspose.h>
8 
9 static PetscBool  cited      = PETSC_FALSE;
10 static const char citation[] = "@inproceedings{ZhangELLPACK2018,\n"
11                                " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n"
12                                " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n"
13                                " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n"
14                                " year = 2018\n"
15                                "}\n";
16 
17 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
18 
19   #include <immintrin.h>
20 
21   #if !defined(_MM_SCALE_8)
22     #define _MM_SCALE_8 8
23   #endif
24 
25   #if defined(__AVX512F__)
26     /* these do not work
27    vec_idx  = _mm512_loadunpackhi_epi32(vec_idx,acolidx);
28    vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval);
29   */
30     #define AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
31       /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \
32       vec_idx  = _mm256_loadu_si256((__m256i const *)acolidx); \
33       vec_vals = _mm512_loadu_pd(aval); \
34       vec_x    = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); \
35       vec_y    = _mm512_fmadd_pd(vec_x, vec_vals, vec_y)
36   #elif defined(__AVX2__) && defined(__FMA__)
37     #define AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
38       vec_vals = _mm256_loadu_pd(aval); \
39       vec_idx  = _mm_loadu_si128((__m128i const *)acolidx); /* SSE2 */ \
40       vec_x    = _mm256_i32gather_pd(x, vec_idx, _MM_SCALE_8); \
41       vec_y    = _mm256_fmadd_pd(vec_x, vec_vals, vec_y)
42   #endif
43 #endif /* PETSC_HAVE_IMMINTRIN_H */
44 
45 /*@C
46  MatSeqSELLSetPreallocation - For good matrix assembly performance
47  the user should preallocate the matrix storage by setting the parameter nz
48  (or the array nnz).  By setting these parameters accurately, performance
49  during matrix assembly can be increased significantly.
50 
51  Collective
52 
53  Input Parameters:
54  +  B - The `MATSEQSELL` matrix
55  .  nz - number of nonzeros per row (same for all rows)
56  -  nnz - array containing the number of nonzeros in the various rows
57  (possibly different for each row) or NULL
58 
59  Notes:
60  If nnz is given then nz is ignored.
61 
62  Specify the preallocated storage with either nz or nnz (not both).
63  Set nz = `PETSC_DEFAULT` and nnz = NULL for PETSc to control dynamic memory
64  allocation.  For large problems you MUST preallocate memory or you
65  will get TERRIBLE performance, see the users' manual chapter on matrices.
66 
67  You can call `MatGetInfo()` to get information on how effective the preallocation was;
68  for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
69  You can also run with the option -info and look for messages with the string
70  malloc in them to see if additional memory allocation was needed.
71 
72  Developers: Use nz of `MAT_SKIP_ALLOCATION` to not allocate any space for the matrix
73  entries or columns indices.
74 
75  The maximum number of nonzeos in any row should be as accurate as possible.
76  If it is underestimated, you will get bad performance due to reallocation
77  (MatSeqXSELLReallocateSELL).
78 
79  Level: intermediate
80 
81  .seealso: `MATSEQSELL`, `MATSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()`
82  @*/
83 PetscErrorCode MatSeqSELLSetPreallocation(Mat B, PetscInt rlenmax, const PetscInt rlen[])
84 {
85   PetscFunctionBegin;
86   PetscValidHeaderSpecific(B, MAT_CLASSID, 1);
87   PetscValidType(B, 1);
88   PetscTryMethod(B, "MatSeqSELLSetPreallocation_C", (Mat, PetscInt, const PetscInt[]), (B, rlenmax, rlen));
89   PetscFunctionReturn(0);
90 }
91 
92 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B, PetscInt maxallocrow, const PetscInt rlen[])
93 {
94   Mat_SeqSELL *b;
95   PetscInt     i, j, totalslices;
96   PetscBool    skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE;
97 
98   PetscFunctionBegin;
99   if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE;
100   if (maxallocrow == MAT_SKIP_ALLOCATION) {
101     skipallocation = PETSC_TRUE;
102     maxallocrow    = 0;
103   }
104 
105   PetscCall(PetscLayoutSetUp(B->rmap));
106   PetscCall(PetscLayoutSetUp(B->cmap));
107 
108   /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */
109   if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5;
110   PetscCheck(maxallocrow >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "maxallocrow cannot be less than 0: value %" PetscInt_FMT, maxallocrow);
111   if (rlen) {
112     for (i = 0; i < B->rmap->n; i++) {
113       PetscCheck(rlen[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, rlen[i]);
114       PetscCheck(rlen[i] <= B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT, i, rlen[i], B->cmap->n);
115     }
116   }
117 
118   B->preallocated = PETSC_TRUE;
119 
120   b = (Mat_SeqSELL *)B->data;
121 
122   totalslices    = PetscCeilInt(B->rmap->n, 8);
123   b->totalslices = totalslices;
124   if (!skipallocation) {
125     if (B->rmap->n & 0x07) PetscCall(PetscInfo(B, "Padding rows to the SEQSELL matrix because the number of rows is not the multiple of 8 (value %" PetscInt_FMT ")\n", B->rmap->n));
126 
127     if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */
128       PetscCall(PetscMalloc1(totalslices + 1, &b->sliidx));
129     }
130     if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */
131       if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10;
132       else if (maxallocrow < 0) maxallocrow = 1;
133       for (i = 0; i <= totalslices; i++) b->sliidx[i] = i * 8 * maxallocrow;
134     } else {
135       maxallocrow  = 0;
136       b->sliidx[0] = 0;
137       for (i = 1; i < totalslices; i++) {
138         b->sliidx[i] = 0;
139         for (j = 0; j < 8; j++) b->sliidx[i] = PetscMax(b->sliidx[i], rlen[8 * (i - 1) + j]);
140         maxallocrow = PetscMax(b->sliidx[i], maxallocrow);
141         PetscCall(PetscIntSumError(b->sliidx[i - 1], 8 * b->sliidx[i], &b->sliidx[i]));
142       }
143       /* last slice */
144       b->sliidx[totalslices] = 0;
145       for (j = (totalslices - 1) * 8; j < B->rmap->n; j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices], rlen[j]);
146       maxallocrow            = PetscMax(b->sliidx[totalslices], maxallocrow);
147       b->sliidx[totalslices] = b->sliidx[totalslices - 1] + 8 * b->sliidx[totalslices];
148     }
149 
150     /* allocate space for val, colidx, rlen */
151     /* FIXME: should B's old memory be unlogged? */
152     PetscCall(MatSeqXSELLFreeSELL(B, &b->val, &b->colidx));
153     /* FIXME: assuming an element of the bit array takes 8 bits */
154     PetscCall(PetscMalloc2(b->sliidx[totalslices], &b->val, b->sliidx[totalslices], &b->colidx));
155     /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */
156     PetscCall(PetscCalloc1(8 * totalslices, &b->rlen));
157 
158     b->singlemalloc = PETSC_TRUE;
159     b->free_val     = PETSC_TRUE;
160     b->free_colidx  = PETSC_TRUE;
161   } else {
162     b->free_val    = PETSC_FALSE;
163     b->free_colidx = PETSC_FALSE;
164   }
165 
166   b->nz               = 0;
167   b->maxallocrow      = maxallocrow;
168   b->rlenmax          = maxallocrow;
169   b->maxallocmat      = b->sliidx[totalslices];
170   B->info.nz_unneeded = (double)b->maxallocmat;
171   if (realalloc) PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));
172   PetscFunctionReturn(0);
173 }
174 
175 PetscErrorCode MatGetRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
176 {
177   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
178   PetscInt     shift;
179 
180   PetscFunctionBegin;
181   PetscCheck(row >= 0 && row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range", row);
182   if (nz) *nz = a->rlen[row];
183   shift = a->sliidx[row >> 3] + (row & 0x07);
184   if (!a->getrowcols) PetscCall(PetscMalloc2(a->rlenmax, &a->getrowcols, a->rlenmax, &a->getrowvals));
185   if (idx) {
186     PetscInt j;
187     for (j = 0; j < a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift + 8 * j];
188     *idx = a->getrowcols;
189   }
190   if (v) {
191     PetscInt j;
192     for (j = 0; j < a->rlen[row]; j++) a->getrowvals[j] = a->val[shift + 8 * j];
193     *v = a->getrowvals;
194   }
195   PetscFunctionReturn(0);
196 }
197 
198 PetscErrorCode MatRestoreRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
199 {
200   PetscFunctionBegin;
201   PetscFunctionReturn(0);
202 }
203 
204 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
205 {
206   Mat          B;
207   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
208   PetscInt     i;
209 
210   PetscFunctionBegin;
211   if (reuse == MAT_REUSE_MATRIX) {
212     B = *newmat;
213     PetscCall(MatZeroEntries(B));
214   } else {
215     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
216     PetscCall(MatSetSizes(B, A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N));
217     PetscCall(MatSetType(B, MATSEQAIJ));
218     PetscCall(MatSeqAIJSetPreallocation(B, 0, a->rlen));
219   }
220 
221   for (i = 0; i < A->rmap->n; i++) {
222     PetscInt     nz = 0, *cols = NULL;
223     PetscScalar *vals = NULL;
224 
225     PetscCall(MatGetRow_SeqSELL(A, i, &nz, &cols, &vals));
226     PetscCall(MatSetValues(B, 1, &i, nz, cols, vals, INSERT_VALUES));
227     PetscCall(MatRestoreRow_SeqSELL(A, i, &nz, &cols, &vals));
228   }
229 
230   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
231   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
232   B->rmap->bs = A->rmap->bs;
233 
234   if (reuse == MAT_INPLACE_MATRIX) {
235     PetscCall(MatHeaderReplace(A, &B));
236   } else {
237     *newmat = B;
238   }
239   PetscFunctionReturn(0);
240 }
241 
242 #include <../src/mat/impls/aij/seq/aij.h>
243 
244 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
245 {
246   Mat                B;
247   Mat_SeqAIJ        *a  = (Mat_SeqAIJ *)A->data;
248   PetscInt          *ai = a->i, m = A->rmap->N, n = A->cmap->N, i, *rowlengths, row, ncols;
249   const PetscInt    *cols;
250   const PetscScalar *vals;
251 
252   PetscFunctionBegin;
253 
254   if (reuse == MAT_REUSE_MATRIX) {
255     B = *newmat;
256   } else {
257     if (PetscDefined(USE_DEBUG) || !a->ilen) {
258       PetscCall(PetscMalloc1(m, &rowlengths));
259       for (i = 0; i < m; i++) rowlengths[i] = ai[i + 1] - ai[i];
260     }
261     if (PetscDefined(USE_DEBUG) && a->ilen) {
262       PetscBool eq;
263       PetscCall(PetscMemcmp(rowlengths, a->ilen, m * sizeof(PetscInt), &eq));
264       PetscCheck(eq, PETSC_COMM_SELF, PETSC_ERR_PLIB, "SeqAIJ ilen array incorrect");
265       PetscCall(PetscFree(rowlengths));
266       rowlengths = a->ilen;
267     } else if (a->ilen) rowlengths = a->ilen;
268     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
269     PetscCall(MatSetSizes(B, m, n, m, n));
270     PetscCall(MatSetType(B, MATSEQSELL));
271     PetscCall(MatSeqSELLSetPreallocation(B, 0, rowlengths));
272     if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths));
273   }
274 
275   for (row = 0; row < m; row++) {
276     PetscCall(MatGetRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
277     PetscCall(MatSetValues_SeqSELL(B, 1, &row, ncols, cols, vals, INSERT_VALUES));
278     PetscCall(MatRestoreRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
279   }
280   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
281   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
282   B->rmap->bs = A->rmap->bs;
283 
284   if (reuse == MAT_INPLACE_MATRIX) {
285     PetscCall(MatHeaderReplace(A, &B));
286   } else {
287     *newmat = B;
288   }
289   PetscFunctionReturn(0);
290 }
291 
292 PetscErrorCode MatMult_SeqSELL(Mat A, Vec xx, Vec yy)
293 {
294   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
295   PetscScalar       *y;
296   const PetscScalar *x;
297   const MatScalar   *aval        = a->val;
298   PetscInt           totalslices = a->totalslices;
299   const PetscInt    *acolidx     = a->colidx;
300   PetscInt           i, j;
301 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
302   __m512d  vec_x, vec_y, vec_vals;
303   __m256i  vec_idx;
304   __mmask8 mask;
305   __m512d  vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
306   __m256i  vec_idx2, vec_idx3, vec_idx4;
307 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
308   __m128i   vec_idx;
309   __m256d   vec_x, vec_y, vec_y2, vec_vals;
310   MatScalar yval;
311   PetscInt  r, rows_left, row, nnz_in_row;
312 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
313   __m128d   vec_x_tmp;
314   __m256d   vec_x, vec_y, vec_y2, vec_vals;
315   MatScalar yval;
316   PetscInt  r, rows_left, row, nnz_in_row;
317 #else
318   PetscScalar sum[8];
319 #endif
320 
321 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
322   #pragma disjoint(*x, *y, *aval)
323 #endif
324 
325   PetscFunctionBegin;
326   PetscCall(VecGetArrayRead(xx, &x));
327   PetscCall(VecGetArray(yy, &y));
328 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
329   for (i = 0; i < totalslices; i++) { /* loop over slices */
330     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
331     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
332 
333     vec_y  = _mm512_setzero_pd();
334     vec_y2 = _mm512_setzero_pd();
335     vec_y3 = _mm512_setzero_pd();
336     vec_y4 = _mm512_setzero_pd();
337 
338     j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */
339     switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
340     case 3:
341       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
342       acolidx += 8;
343       aval += 8;
344       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
345       acolidx += 8;
346       aval += 8;
347       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
348       acolidx += 8;
349       aval += 8;
350       j += 3;
351       break;
352     case 2:
353       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
354       acolidx += 8;
355       aval += 8;
356       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
357       acolidx += 8;
358       aval += 8;
359       j += 2;
360       break;
361     case 1:
362       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
363       acolidx += 8;
364       aval += 8;
365       j += 1;
366       break;
367     }
368   #pragma novector
369     for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
370       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
371       acolidx += 8;
372       aval += 8;
373       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
374       acolidx += 8;
375       aval += 8;
376       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
377       acolidx += 8;
378       aval += 8;
379       AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
380       acolidx += 8;
381       aval += 8;
382     }
383 
384     vec_y = _mm512_add_pd(vec_y, vec_y2);
385     vec_y = _mm512_add_pd(vec_y, vec_y3);
386     vec_y = _mm512_add_pd(vec_y, vec_y4);
387     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
388       mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
389       _mm512_mask_storeu_pd(&y[8 * i], mask, vec_y);
390     } else {
391       _mm512_storeu_pd(&y[8 * i], vec_y);
392     }
393   }
394 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
395   for (i = 0; i < totalslices; i++) { /* loop over full slices */
396     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
397     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
398 
399     /* last slice may have padding rows. Don't use vectorization. */
400     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
401       rows_left = A->rmap->n - 8 * i;
402       for (r = 0; r < rows_left; ++r) {
403         yval       = (MatScalar)0;
404         row        = 8 * i + r;
405         nnz_in_row = a->rlen[row];
406         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
407         y[row] = yval;
408       }
409       break;
410     }
411 
412     vec_y  = _mm256_setzero_pd();
413     vec_y2 = _mm256_setzero_pd();
414 
415   /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
416   #pragma novector
417   #pragma unroll(2)
418     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
419       AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
420       aval += 4;
421       acolidx += 4;
422       AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y2);
423       aval += 4;
424       acolidx += 4;
425     }
426 
427     _mm256_storeu_pd(y + i * 8, vec_y);
428     _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
429   }
430 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
431   for (i = 0; i < totalslices; i++) { /* loop over full slices */
432     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
433     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
434 
435     vec_y  = _mm256_setzero_pd();
436     vec_y2 = _mm256_setzero_pd();
437 
438     /* last slice may have padding rows. Don't use vectorization. */
439     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
440       rows_left = A->rmap->n - 8 * i;
441       for (r = 0; r < rows_left; ++r) {
442         yval       = (MatScalar)0;
443         row        = 8 * i + r;
444         nnz_in_row = a->rlen[row];
445         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
446         y[row] = yval;
447       }
448       break;
449     }
450 
451   /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
452   #pragma novector
453   #pragma unroll(2)
454     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
455       vec_vals  = _mm256_loadu_pd(aval);
456       vec_x_tmp = _mm_setzero_pd();
457       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
458       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
459       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
460       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
461       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
462       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
463       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
464       aval += 4;
465 
466       vec_vals  = _mm256_loadu_pd(aval);
467       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
468       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
469       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
470       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
471       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
472       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
473       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
474       aval += 4;
475     }
476 
477     _mm256_storeu_pd(y + i * 8, vec_y);
478     _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
479   }
480 #else
481   for (i = 0; i < totalslices; i++) { /* loop over slices */
482     for (j = 0; j < 8; j++) sum[j] = 0.0;
483     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
484       sum[0] += aval[j] * x[acolidx[j]];
485       sum[1] += aval[j + 1] * x[acolidx[j + 1]];
486       sum[2] += aval[j + 2] * x[acolidx[j + 2]];
487       sum[3] += aval[j + 3] * x[acolidx[j + 3]];
488       sum[4] += aval[j + 4] * x[acolidx[j + 4]];
489       sum[5] += aval[j + 5] * x[acolidx[j + 5]];
490       sum[6] += aval[j + 6] * x[acolidx[j + 6]];
491       sum[7] += aval[j + 7] * x[acolidx[j + 7]];
492     }
493     if (i == totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
494       for (j = 0; j < (A->rmap->n & 0x07); j++) y[8 * i + j] = sum[j];
495     } else {
496       for (j = 0; j < 8; j++) y[8 * i + j] = sum[j];
497     }
498   }
499 #endif
500 
501   PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); /* theoretical minimal FLOPs */
502   PetscCall(VecRestoreArrayRead(xx, &x));
503   PetscCall(VecRestoreArray(yy, &y));
504   PetscFunctionReturn(0);
505 }
506 
507 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h>
508 PetscErrorCode MatMultAdd_SeqSELL(Mat A, Vec xx, Vec yy, Vec zz)
509 {
510   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
511   PetscScalar       *y, *z;
512   const PetscScalar *x;
513   const MatScalar   *aval        = a->val;
514   PetscInt           totalslices = a->totalslices;
515   const PetscInt    *acolidx     = a->colidx;
516   PetscInt           i, j;
517 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
518   __m512d  vec_x, vec_y, vec_vals;
519   __m256i  vec_idx;
520   __mmask8 mask;
521   __m512d  vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
522   __m256i  vec_idx2, vec_idx3, vec_idx4;
523 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
524   __m128d   vec_x_tmp;
525   __m256d   vec_x, vec_y, vec_y2, vec_vals;
526   MatScalar yval;
527   PetscInt  r, row, nnz_in_row;
528 #else
529   PetscScalar sum[8];
530 #endif
531 
532 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
533   #pragma disjoint(*x, *y, *aval)
534 #endif
535 
536   PetscFunctionBegin;
537   PetscCall(VecGetArrayRead(xx, &x));
538   PetscCall(VecGetArrayPair(yy, zz, &y, &z));
539 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
540   for (i = 0; i < totalslices; i++) { /* loop over slices */
541     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
542     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
543 
544     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
545       mask  = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
546       vec_y = _mm512_mask_loadu_pd(vec_y, mask, &y[8 * i]);
547     } else {
548       vec_y = _mm512_loadu_pd(&y[8 * i]);
549     }
550     vec_y2 = _mm512_setzero_pd();
551     vec_y3 = _mm512_setzero_pd();
552     vec_y4 = _mm512_setzero_pd();
553 
554     j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */
555     switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
556     case 3:
557       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
558       acolidx += 8;
559       aval += 8;
560       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
561       acolidx += 8;
562       aval += 8;
563       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
564       acolidx += 8;
565       aval += 8;
566       j += 3;
567       break;
568     case 2:
569       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
570       acolidx += 8;
571       aval += 8;
572       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
573       acolidx += 8;
574       aval += 8;
575       j += 2;
576       break;
577     case 1:
578       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
579       acolidx += 8;
580       aval += 8;
581       j += 1;
582       break;
583     }
584   #pragma novector
585     for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
586       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
587       acolidx += 8;
588       aval += 8;
589       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
590       acolidx += 8;
591       aval += 8;
592       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
593       acolidx += 8;
594       aval += 8;
595       AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
596       acolidx += 8;
597       aval += 8;
598     }
599 
600     vec_y = _mm512_add_pd(vec_y, vec_y2);
601     vec_y = _mm512_add_pd(vec_y, vec_y3);
602     vec_y = _mm512_add_pd(vec_y, vec_y4);
603     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
604       _mm512_mask_storeu_pd(&z[8 * i], mask, vec_y);
605     } else {
606       _mm512_storeu_pd(&z[8 * i], vec_y);
607     }
608   }
609 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
610   for (i = 0; i < totalslices; i++) { /* loop over full slices */
611     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
612     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
613 
614     /* last slice may have padding rows. Don't use vectorization. */
615     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
616       for (r = 0; r < (A->rmap->n & 0x07); ++r) {
617         row        = 8 * i + r;
618         yval       = (MatScalar)0.0;
619         nnz_in_row = a->rlen[row];
620         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
621         z[row] = y[row] + yval;
622       }
623       break;
624     }
625 
626     vec_y  = _mm256_loadu_pd(y + 8 * i);
627     vec_y2 = _mm256_loadu_pd(y + 8 * i + 4);
628 
629     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
630     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
631       vec_vals  = _mm256_loadu_pd(aval);
632       vec_x_tmp = _mm_setzero_pd();
633       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
634       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
635       vec_x     = _mm256_setzero_pd();
636       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
637       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
638       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
639       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
640       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
641       aval += 4;
642 
643       vec_vals  = _mm256_loadu_pd(aval);
644       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
645       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
646       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
647       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
648       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
649       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
650       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
651       aval += 4;
652     }
653 
654     _mm256_storeu_pd(z + i * 8, vec_y);
655     _mm256_storeu_pd(z + i * 8 + 4, vec_y2);
656   }
657 #else
658   for (i = 0; i < totalslices; i++) { /* loop over slices */
659     for (j = 0; j < 8; j++) sum[j] = 0.0;
660     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
661       sum[0] += aval[j] * x[acolidx[j]];
662       sum[1] += aval[j + 1] * x[acolidx[j + 1]];
663       sum[2] += aval[j + 2] * x[acolidx[j + 2]];
664       sum[3] += aval[j + 3] * x[acolidx[j + 3]];
665       sum[4] += aval[j + 4] * x[acolidx[j + 4]];
666       sum[5] += aval[j + 5] * x[acolidx[j + 5]];
667       sum[6] += aval[j + 6] * x[acolidx[j + 6]];
668       sum[7] += aval[j + 7] * x[acolidx[j + 7]];
669     }
670     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
671       for (j = 0; j < (A->rmap->n & 0x07); j++) z[8 * i + j] = y[8 * i + j] + sum[j];
672     } else {
673       for (j = 0; j < 8; j++) z[8 * i + j] = y[8 * i + j] + sum[j];
674     }
675   }
676 #endif
677 
678   PetscCall(PetscLogFlops(2.0 * a->nz));
679   PetscCall(VecRestoreArrayRead(xx, &x));
680   PetscCall(VecRestoreArrayPair(yy, zz, &y, &z));
681   PetscFunctionReturn(0);
682 }
683 
684 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A, Vec xx, Vec zz, Vec yy)
685 {
686   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
687   PetscScalar       *y;
688   const PetscScalar *x;
689   const MatScalar   *aval    = a->val;
690   const PetscInt    *acolidx = a->colidx;
691   PetscInt           i, j, r, row, nnz_in_row, totalslices = a->totalslices;
692 
693 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
694   #pragma disjoint(*x, *y, *aval)
695 #endif
696 
697   PetscFunctionBegin;
698   if (A->symmetric == PETSC_BOOL3_TRUE) {
699     PetscCall(MatMultAdd_SeqSELL(A, xx, zz, yy));
700     PetscFunctionReturn(0);
701   }
702   if (zz != yy) PetscCall(VecCopy(zz, yy));
703   PetscCall(VecGetArrayRead(xx, &x));
704   PetscCall(VecGetArray(yy, &y));
705   for (i = 0; i < a->totalslices; i++) { /* loop over slices */
706     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
707       for (r = 0; r < (A->rmap->n & 0x07); ++r) {
708         row        = 8 * i + r;
709         nnz_in_row = a->rlen[row];
710         for (j = 0; j < nnz_in_row; ++j) y[acolidx[8 * j + r]] += aval[8 * j + r] * x[row];
711       }
712       break;
713     }
714     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
715       y[acolidx[j]] += aval[j] * x[8 * i];
716       y[acolidx[j + 1]] += aval[j + 1] * x[8 * i + 1];
717       y[acolidx[j + 2]] += aval[j + 2] * x[8 * i + 2];
718       y[acolidx[j + 3]] += aval[j + 3] * x[8 * i + 3];
719       y[acolidx[j + 4]] += aval[j + 4] * x[8 * i + 4];
720       y[acolidx[j + 5]] += aval[j + 5] * x[8 * i + 5];
721       y[acolidx[j + 6]] += aval[j + 6] * x[8 * i + 6];
722       y[acolidx[j + 7]] += aval[j + 7] * x[8 * i + 7];
723     }
724   }
725   PetscCall(PetscLogFlops(2.0 * a->sliidx[a->totalslices]));
726   PetscCall(VecRestoreArrayRead(xx, &x));
727   PetscCall(VecRestoreArray(yy, &y));
728   PetscFunctionReturn(0);
729 }
730 
731 PetscErrorCode MatMultTranspose_SeqSELL(Mat A, Vec xx, Vec yy)
732 {
733   PetscFunctionBegin;
734   if (A->symmetric == PETSC_BOOL3_TRUE) {
735     PetscCall(MatMult_SeqSELL(A, xx, yy));
736   } else {
737     PetscCall(VecSet(yy, 0.0));
738     PetscCall(MatMultTransposeAdd_SeqSELL(A, xx, yy, yy));
739   }
740   PetscFunctionReturn(0);
741 }
742 
743 /*
744      Checks for missing diagonals
745 */
746 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A, PetscBool *missing, PetscInt *d)
747 {
748   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
749   PetscInt    *diag, i;
750 
751   PetscFunctionBegin;
752   *missing = PETSC_FALSE;
753   if (A->rmap->n > 0 && !(a->colidx)) {
754     *missing = PETSC_TRUE;
755     if (d) *d = 0;
756     PetscCall(PetscInfo(A, "Matrix has no entries therefore is missing diagonal\n"));
757   } else {
758     diag = a->diag;
759     for (i = 0; i < A->rmap->n; i++) {
760       if (diag[i] == -1) {
761         *missing = PETSC_TRUE;
762         if (d) *d = i;
763         PetscCall(PetscInfo(A, "Matrix is missing diagonal number %" PetscInt_FMT "\n", i));
764         break;
765       }
766     }
767   }
768   PetscFunctionReturn(0);
769 }
770 
771 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A)
772 {
773   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
774   PetscInt     i, j, m = A->rmap->n, shift;
775 
776   PetscFunctionBegin;
777   if (!a->diag) {
778     PetscCall(PetscMalloc1(m, &a->diag));
779     a->free_diag = PETSC_TRUE;
780   }
781   for (i = 0; i < m; i++) {                      /* loop over rows */
782     shift      = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
783     a->diag[i] = -1;
784     for (j = 0; j < a->rlen[i]; j++) {
785       if (a->colidx[shift + j * 8] == i) {
786         a->diag[i] = shift + j * 8;
787         break;
788       }
789     }
790   }
791   PetscFunctionReturn(0);
792 }
793 
794 /*
795   Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways
796 */
797 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A, PetscScalar omega, PetscScalar fshift)
798 {
799   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
800   PetscInt     i, *diag, m = A->rmap->n;
801   MatScalar   *val = a->val;
802   PetscScalar *idiag, *mdiag;
803 
804   PetscFunctionBegin;
805   if (a->idiagvalid) PetscFunctionReturn(0);
806   PetscCall(MatMarkDiagonal_SeqSELL(A));
807   diag = a->diag;
808   if (!a->idiag) {
809     PetscCall(PetscMalloc3(m, &a->idiag, m, &a->mdiag, m, &a->ssor_work));
810     val = a->val;
811   }
812   mdiag = a->mdiag;
813   idiag = a->idiag;
814 
815   if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) {
816     for (i = 0; i < m; i++) {
817       mdiag[i] = val[diag[i]];
818       if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */
819         if (PetscRealPart(fshift)) {
820           PetscCall(PetscInfo(A, "Zero diagonal on row %" PetscInt_FMT "\n", i));
821           A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
822           A->factorerror_zeropivot_value = 0.0;
823           A->factorerror_zeropivot_row   = i;
824         } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Zero diagonal on row %" PetscInt_FMT, i);
825       }
826       idiag[i] = 1.0 / val[diag[i]];
827     }
828     PetscCall(PetscLogFlops(m));
829   } else {
830     for (i = 0; i < m; i++) {
831       mdiag[i] = val[diag[i]];
832       idiag[i] = omega / (fshift + val[diag[i]]);
833     }
834     PetscCall(PetscLogFlops(2.0 * m));
835   }
836   a->idiagvalid = PETSC_TRUE;
837   PetscFunctionReturn(0);
838 }
839 
840 PetscErrorCode MatZeroEntries_SeqSELL(Mat A)
841 {
842   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
843 
844   PetscFunctionBegin;
845   PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices]));
846   PetscCall(MatSeqSELLInvalidateDiagonal(A));
847   PetscFunctionReturn(0);
848 }
849 
850 PetscErrorCode MatDestroy_SeqSELL(Mat A)
851 {
852   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
853 
854   PetscFunctionBegin;
855 #if defined(PETSC_USE_LOG)
856   PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->n, A->cmap->n, a->nz);
857 #endif
858   PetscCall(MatSeqXSELLFreeSELL(A, &a->val, &a->colidx));
859   PetscCall(ISDestroy(&a->row));
860   PetscCall(ISDestroy(&a->col));
861   PetscCall(PetscFree(a->diag));
862   PetscCall(PetscFree(a->rlen));
863   PetscCall(PetscFree(a->sliidx));
864   PetscCall(PetscFree3(a->idiag, a->mdiag, a->ssor_work));
865   PetscCall(PetscFree(a->solve_work));
866   PetscCall(ISDestroy(&a->icol));
867   PetscCall(PetscFree(a->saved_values));
868   PetscCall(PetscFree2(a->getrowcols, a->getrowvals));
869 
870   PetscCall(PetscFree(A->data));
871 
872   PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL));
873   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL));
874   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL));
875   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetPreallocation_C", NULL));
876   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetArray_C", NULL));
877   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLRestoreArray_C", NULL));
878   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqaij_C", NULL));
879   PetscFunctionReturn(0);
880 }
881 
882 PetscErrorCode MatSetOption_SeqSELL(Mat A, MatOption op, PetscBool flg)
883 {
884   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
885 
886   PetscFunctionBegin;
887   switch (op) {
888   case MAT_ROW_ORIENTED:
889     a->roworiented = flg;
890     break;
891   case MAT_KEEP_NONZERO_PATTERN:
892     a->keepnonzeropattern = flg;
893     break;
894   case MAT_NEW_NONZERO_LOCATIONS:
895     a->nonew = (flg ? 0 : 1);
896     break;
897   case MAT_NEW_NONZERO_LOCATION_ERR:
898     a->nonew = (flg ? -1 : 0);
899     break;
900   case MAT_NEW_NONZERO_ALLOCATION_ERR:
901     a->nonew = (flg ? -2 : 0);
902     break;
903   case MAT_UNUSED_NONZERO_LOCATION_ERR:
904     a->nounused = (flg ? -1 : 0);
905     break;
906   case MAT_FORCE_DIAGONAL_ENTRIES:
907   case MAT_IGNORE_OFF_PROC_ENTRIES:
908   case MAT_USE_HASH_TABLE:
909   case MAT_SORTED_FULL:
910     PetscCall(PetscInfo(A, "Option %s ignored\n", MatOptions[op]));
911     break;
912   case MAT_SPD:
913   case MAT_SYMMETRIC:
914   case MAT_STRUCTURALLY_SYMMETRIC:
915   case MAT_HERMITIAN:
916   case MAT_SYMMETRY_ETERNAL:
917   case MAT_STRUCTURAL_SYMMETRY_ETERNAL:
918   case MAT_SPD_ETERNAL:
919     /* These options are handled directly by MatSetOption() */
920     break;
921   default:
922     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unknown option %d", op);
923   }
924   PetscFunctionReturn(0);
925 }
926 
927 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A, Vec v)
928 {
929   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
930   PetscInt     i, j, n, shift;
931   PetscScalar *x, zero = 0.0;
932 
933   PetscFunctionBegin;
934   PetscCall(VecGetLocalSize(v, &n));
935   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
936 
937   if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) {
938     PetscInt *diag = a->diag;
939     PetscCall(VecGetArray(v, &x));
940     for (i = 0; i < n; i++) x[i] = 1.0 / a->val[diag[i]];
941     PetscCall(VecRestoreArray(v, &x));
942     PetscFunctionReturn(0);
943   }
944 
945   PetscCall(VecSet(v, zero));
946   PetscCall(VecGetArray(v, &x));
947   for (i = 0; i < n; i++) {                 /* loop over rows */
948     shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
949     x[i]  = 0;
950     for (j = 0; j < a->rlen[i]; j++) {
951       if (a->colidx[shift + j * 8] == i) {
952         x[i] = a->val[shift + j * 8];
953         break;
954       }
955     }
956   }
957   PetscCall(VecRestoreArray(v, &x));
958   PetscFunctionReturn(0);
959 }
960 
961 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A, Vec ll, Vec rr)
962 {
963   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
964   const PetscScalar *l, *r;
965   PetscInt           i, j, m, n, row;
966 
967   PetscFunctionBegin;
968   if (ll) {
969     /* The local size is used so that VecMPI can be passed to this routine
970        by MatDiagonalScale_MPISELL */
971     PetscCall(VecGetLocalSize(ll, &m));
972     PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
973     PetscCall(VecGetArrayRead(ll, &l));
974     for (i = 0; i < a->totalslices; i++) {                  /* loop over slices */
975       if (i == a->totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
976         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) {
977           if (row < (A->rmap->n & 0x07)) a->val[j] *= l[8 * i + row];
978         }
979       } else {
980         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) a->val[j] *= l[8 * i + row];
981       }
982     }
983     PetscCall(VecRestoreArrayRead(ll, &l));
984     PetscCall(PetscLogFlops(a->nz));
985   }
986   if (rr) {
987     PetscCall(VecGetLocalSize(rr, &n));
988     PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
989     PetscCall(VecGetArrayRead(rr, &r));
990     for (i = 0; i < a->totalslices; i++) {                  /* loop over slices */
991       if (i == a->totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
992         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) {
993           if (row < (A->rmap->n & 0x07)) a->val[j] *= r[a->colidx[j]];
994         }
995       } else {
996         for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j++) a->val[j] *= r[a->colidx[j]];
997       }
998     }
999     PetscCall(VecRestoreArrayRead(rr, &r));
1000     PetscCall(PetscLogFlops(a->nz));
1001   }
1002   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1003   PetscFunctionReturn(0);
1004 }
1005 
1006 PetscErrorCode MatGetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[])
1007 {
1008   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1009   PetscInt    *cp, i, k, low, high, t, row, col, l;
1010   PetscInt     shift;
1011   MatScalar   *vp;
1012 
1013   PetscFunctionBegin;
1014   for (k = 0; k < m; k++) { /* loop over requested rows */
1015     row = im[k];
1016     if (row < 0) continue;
1017     PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
1018     shift = a->sliidx[row >> 3] + (row & 0x07); /* starting index of the row */
1019     cp    = a->colidx + shift;                  /* pointer to the row */
1020     vp    = a->val + shift;                     /* pointer to the row */
1021     for (l = 0; l < n; l++) {                   /* loop over requested columns */
1022       col = in[l];
1023       if (col < 0) continue;
1024       PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
1025       high = a->rlen[row];
1026       low  = 0; /* assume unsorted */
1027       while (high - low > 5) {
1028         t = (low + high) / 2;
1029         if (*(cp + t * 8) > col) high = t;
1030         else low = t;
1031       }
1032       for (i = low; i < high; i++) {
1033         if (*(cp + 8 * i) > col) break;
1034         if (*(cp + 8 * i) == col) {
1035           *v++ = *(vp + 8 * i);
1036           goto finished;
1037         }
1038       }
1039       *v++ = 0.0;
1040     finished:;
1041     }
1042   }
1043   PetscFunctionReturn(0);
1044 }
1045 
1046 PetscErrorCode MatView_SeqSELL_ASCII(Mat A, PetscViewer viewer)
1047 {
1048   Mat_SeqSELL      *a = (Mat_SeqSELL *)A->data;
1049   PetscInt          i, j, m = A->rmap->n, shift;
1050   const char       *name;
1051   PetscViewerFormat format;
1052 
1053   PetscFunctionBegin;
1054   PetscCall(PetscViewerGetFormat(viewer, &format));
1055   if (format == PETSC_VIEWER_ASCII_MATLAB) {
1056     PetscInt nofinalvalue = 0;
1057     /*
1058     if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) {
1059       nofinalvalue = 1;
1060     }
1061     */
1062     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1063     PetscCall(PetscViewerASCIIPrintf(viewer, "%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n", m, A->cmap->n));
1064     PetscCall(PetscViewerASCIIPrintf(viewer, "%% Nonzeros = %" PetscInt_FMT " \n", a->nz));
1065 #if defined(PETSC_USE_COMPLEX)
1066     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",4);\n", a->nz + nofinalvalue));
1067 #else
1068     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",3);\n", a->nz + nofinalvalue));
1069 #endif
1070     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = [\n"));
1071 
1072     for (i = 0; i < m; i++) {
1073       shift = a->sliidx[i >> 3] + (i & 0x07);
1074       for (j = 0; j < a->rlen[i]; j++) {
1075 #if defined(PETSC_USE_COMPLEX)
1076         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n", i + 1, a->colidx[shift + 8 * j] + 1, (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j])));
1077 #else
1078         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n", i + 1, a->colidx[shift + 8 * j] + 1, (double)a->val[shift + 8 * j]));
1079 #endif
1080       }
1081     }
1082     /*
1083     if (nofinalvalue) {
1084 #if defined(PETSC_USE_COMPLEX)
1085       PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n",m,A->cmap->n,0.,0.));
1086 #else
1087       PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n",m,A->cmap->n,0.0));
1088 #endif
1089     }
1090     */
1091     PetscCall(PetscObjectGetName((PetscObject)A, &name));
1092     PetscCall(PetscViewerASCIIPrintf(viewer, "];\n %s = spconvert(zzz);\n", name));
1093     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1094   } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) {
1095     PetscFunctionReturn(0);
1096   } else if (format == PETSC_VIEWER_ASCII_COMMON) {
1097     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1098     for (i = 0; i < m; i++) {
1099       PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1100       shift = a->sliidx[i >> 3] + (i & 0x07);
1101       for (j = 0; j < a->rlen[i]; j++) {
1102 #if defined(PETSC_USE_COMPLEX)
1103         if (PetscImaginaryPart(a->val[shift + 8 * j]) > 0.0 && PetscRealPart(a->val[shift + 8 * j]) != 0.0) {
1104           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j])));
1105         } else if (PetscImaginaryPart(a->val[shift + 8 * j]) < 0.0 && PetscRealPart(a->val[shift + 8 * j]) != 0.0) {
1106           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)-PetscImaginaryPart(a->val[shift + 8 * j])));
1107         } else if (PetscRealPart(a->val[shift + 8 * j]) != 0.0) {
1108           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j])));
1109         }
1110 #else
1111         if (a->val[shift + 8 * j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)a->val[shift + 8 * j]));
1112 #endif
1113       }
1114       PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1115     }
1116     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1117   } else if (format == PETSC_VIEWER_ASCII_DENSE) {
1118     PetscInt    cnt = 0, jcnt;
1119     PetscScalar value;
1120 #if defined(PETSC_USE_COMPLEX)
1121     PetscBool realonly = PETSC_TRUE;
1122     for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1123       if (PetscImaginaryPart(a->val[i]) != 0.0) {
1124         realonly = PETSC_FALSE;
1125         break;
1126       }
1127     }
1128 #endif
1129 
1130     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1131     for (i = 0; i < m; i++) {
1132       jcnt  = 0;
1133       shift = a->sliidx[i >> 3] + (i & 0x07);
1134       for (j = 0; j < A->cmap->n; j++) {
1135         if (jcnt < a->rlen[i] && j == a->colidx[shift + 8 * j]) {
1136           value = a->val[cnt++];
1137           jcnt++;
1138         } else {
1139           value = 0.0;
1140         }
1141 #if defined(PETSC_USE_COMPLEX)
1142         if (realonly) {
1143           PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)PetscRealPart(value)));
1144         } else {
1145           PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e+%7.5e i ", (double)PetscRealPart(value), (double)PetscImaginaryPart(value)));
1146         }
1147 #else
1148         PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)value));
1149 #endif
1150       }
1151       PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1152     }
1153     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1154   } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) {
1155     PetscInt fshift = 1;
1156     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1157 #if defined(PETSC_USE_COMPLEX)
1158     PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate complex general\n"));
1159 #else
1160     PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate real general\n"));
1161 #endif
1162     PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz));
1163     for (i = 0; i < m; i++) {
1164       shift = a->sliidx[i >> 3] + (i & 0x07);
1165       for (j = 0; j < a->rlen[i]; j++) {
1166 #if defined(PETSC_USE_COMPLEX)
1167         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n", i + fshift, a->colidx[shift + 8 * j] + fshift, (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j])));
1168 #else
1169         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g\n", i + fshift, a->colidx[shift + 8 * j] + fshift, (double)a->val[shift + 8 * j]));
1170 #endif
1171       }
1172     }
1173     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1174   } else if (format == PETSC_VIEWER_NATIVE) {
1175     for (i = 0; i < a->totalslices; i++) { /* loop over slices */
1176       PetscInt row;
1177       PetscCall(PetscViewerASCIIPrintf(viewer, "slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n", i, a->sliidx[i], a->sliidx[i + 1]));
1178       for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) {
1179 #if defined(PETSC_USE_COMPLEX)
1180         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1181           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1182         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1183           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), -(double)PetscImaginaryPart(a->val[j])));
1184         } else {
1185           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j])));
1186         }
1187 #else
1188         PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g\n", 8 * i + row, a->colidx[j], (double)a->val[j]));
1189 #endif
1190       }
1191     }
1192   } else {
1193     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1194     if (A->factortype) {
1195       for (i = 0; i < m; i++) {
1196         shift = a->sliidx[i >> 3] + (i & 0x07);
1197         PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1198         /* L part */
1199         for (j = shift; j < a->diag[i]; j += 8) {
1200 #if defined(PETSC_USE_COMPLEX)
1201           if (PetscImaginaryPart(a->val[shift + 8 * j]) > 0.0) {
1202             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1203           } else if (PetscImaginaryPart(a->val[shift + 8 * j]) < 0.0) {
1204             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j]))));
1205           } else {
1206             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j])));
1207           }
1208 #else
1209           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]));
1210 #endif
1211         }
1212         /* diagonal */
1213         j = a->diag[i];
1214 #if defined(PETSC_USE_COMPLEX)
1215         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1216           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)PetscImaginaryPart(1.0 / a->val[j])));
1217         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1218           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)(-PetscImaginaryPart(1.0 / a->val[j]))));
1219         } else {
1220           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j])));
1221         }
1222 #else
1223         PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)(1.0 / a->val[j])));
1224 #endif
1225 
1226         /* U part */
1227         for (j = a->diag[i] + 1; j < shift + 8 * a->rlen[i]; j += 8) {
1228 #if defined(PETSC_USE_COMPLEX)
1229           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1230             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1231           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1232             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j]))));
1233           } else {
1234             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j])));
1235           }
1236 #else
1237           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]));
1238 #endif
1239         }
1240         PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1241       }
1242     } else {
1243       for (i = 0; i < m; i++) {
1244         shift = a->sliidx[i >> 3] + (i & 0x07);
1245         PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1246         for (j = 0; j < a->rlen[i]; j++) {
1247 #if defined(PETSC_USE_COMPLEX)
1248           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1249             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j])));
1250           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1251             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)-PetscImaginaryPart(a->val[shift + 8 * j])));
1252           } else {
1253             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j])));
1254           }
1255 #else
1256           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)a->val[shift + 8 * j]));
1257 #endif
1258         }
1259         PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1260       }
1261     }
1262     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1263   }
1264   PetscCall(PetscViewerFlush(viewer));
1265   PetscFunctionReturn(0);
1266 }
1267 
1268 #include <petscdraw.h>
1269 PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw, void *Aa)
1270 {
1271   Mat               A = (Mat)Aa;
1272   Mat_SeqSELL      *a = (Mat_SeqSELL *)A->data;
1273   PetscInt          i, j, m = A->rmap->n, shift;
1274   int               color;
1275   PetscReal         xl, yl, xr, yr, x_l, x_r, y_l, y_r;
1276   PetscViewer       viewer;
1277   PetscViewerFormat format;
1278 
1279   PetscFunctionBegin;
1280   PetscCall(PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer));
1281   PetscCall(PetscViewerGetFormat(viewer, &format));
1282   PetscCall(PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr));
1283 
1284   /* loop over matrix elements drawing boxes */
1285 
1286   if (format != PETSC_VIEWER_DRAW_CONTOUR) {
1287     PetscDrawCollectiveBegin(draw);
1288     /* Blue for negative, Cyan for zero and  Red for positive */
1289     color = PETSC_DRAW_BLUE;
1290     for (i = 0; i < m; i++) {
1291       shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1292       y_l   = m - i - 1.0;
1293       y_r   = y_l + 1.0;
1294       for (j = 0; j < a->rlen[i]; j++) {
1295         x_l = a->colidx[shift + j * 8];
1296         x_r = x_l + 1.0;
1297         if (PetscRealPart(a->val[shift + 8 * j]) >= 0.) continue;
1298         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1299       }
1300     }
1301     color = PETSC_DRAW_CYAN;
1302     for (i = 0; i < m; i++) {
1303       shift = a->sliidx[i >> 3] + (i & 0x07);
1304       y_l   = m - i - 1.0;
1305       y_r   = y_l + 1.0;
1306       for (j = 0; j < a->rlen[i]; j++) {
1307         x_l = a->colidx[shift + j * 8];
1308         x_r = x_l + 1.0;
1309         if (a->val[shift + 8 * j] != 0.) continue;
1310         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1311       }
1312     }
1313     color = PETSC_DRAW_RED;
1314     for (i = 0; i < m; i++) {
1315       shift = a->sliidx[i >> 3] + (i & 0x07);
1316       y_l   = m - i - 1.0;
1317       y_r   = y_l + 1.0;
1318       for (j = 0; j < a->rlen[i]; j++) {
1319         x_l = a->colidx[shift + j * 8];
1320         x_r = x_l + 1.0;
1321         if (PetscRealPart(a->val[shift + 8 * j]) <= 0.) continue;
1322         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1323       }
1324     }
1325     PetscDrawCollectiveEnd(draw);
1326   } else {
1327     /* use contour shading to indicate magnitude of values */
1328     /* first determine max of all nonzero values */
1329     PetscReal minv = 0.0, maxv = 0.0;
1330     PetscInt  count = 0;
1331     PetscDraw popup;
1332     for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1333       if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]);
1334     }
1335     if (minv >= maxv) maxv = minv + PETSC_SMALL;
1336     PetscCall(PetscDrawGetPopup(draw, &popup));
1337     PetscCall(PetscDrawScalePopup(popup, minv, maxv));
1338 
1339     PetscDrawCollectiveBegin(draw);
1340     for (i = 0; i < m; i++) {
1341       shift = a->sliidx[i >> 3] + (i & 0x07);
1342       y_l   = m - i - 1.0;
1343       y_r   = y_l + 1.0;
1344       for (j = 0; j < a->rlen[i]; j++) {
1345         x_l   = a->colidx[shift + j * 8];
1346         x_r   = x_l + 1.0;
1347         color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]), minv, maxv);
1348         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1349         count++;
1350       }
1351     }
1352     PetscDrawCollectiveEnd(draw);
1353   }
1354   PetscFunctionReturn(0);
1355 }
1356 
1357 #include <petscdraw.h>
1358 PetscErrorCode MatView_SeqSELL_Draw(Mat A, PetscViewer viewer)
1359 {
1360   PetscDraw draw;
1361   PetscReal xr, yr, xl, yl, h, w;
1362   PetscBool isnull;
1363 
1364   PetscFunctionBegin;
1365   PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw));
1366   PetscCall(PetscDrawIsNull(draw, &isnull));
1367   if (isnull) PetscFunctionReturn(0);
1368 
1369   xr = A->cmap->n;
1370   yr = A->rmap->n;
1371   h  = yr / 10.0;
1372   w  = xr / 10.0;
1373   xr += w;
1374   yr += h;
1375   xl = -w;
1376   yl = -h;
1377   PetscCall(PetscDrawSetCoordinates(draw, xl, yl, xr, yr));
1378   PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer));
1379   PetscCall(PetscDrawZoom(draw, MatView_SeqSELL_Draw_Zoom, A));
1380   PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL));
1381   PetscCall(PetscDrawSave(draw));
1382   PetscFunctionReturn(0);
1383 }
1384 
1385 PetscErrorCode MatView_SeqSELL(Mat A, PetscViewer viewer)
1386 {
1387   PetscBool iascii, isbinary, isdraw;
1388 
1389   PetscFunctionBegin;
1390   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
1391   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
1392   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
1393   if (iascii) {
1394     PetscCall(MatView_SeqSELL_ASCII(A, viewer));
1395   } else if (isbinary) {
1396     /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */
1397   } else if (isdraw) PetscCall(MatView_SeqSELL_Draw(A, viewer));
1398   PetscFunctionReturn(0);
1399 }
1400 
1401 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A, MatAssemblyType mode)
1402 {
1403   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1404   PetscInt     i, shift, row_in_slice, row, nrow, *cp, lastcol, j, k;
1405   MatScalar   *vp;
1406 
1407   PetscFunctionBegin;
1408   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
1409   /* To do: compress out the unused elements */
1410   PetscCall(MatMarkDiagonal_SeqSELL(A));
1411   PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n", A->rmap->n, A->cmap->n, a->maxallocmat, a->sliidx[a->totalslices], a->nz, a->sliidx[a->totalslices] - a->nz));
1412   PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n", a->reallocs));
1413   PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rlenmax));
1414   /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */
1415   for (i = 0; i < a->totalslices; ++i) {
1416     shift = a->sliidx[i];                                      /* starting index of the slice */
1417     cp    = a->colidx + shift;                                 /* pointer to the column indices of the slice */
1418     vp    = a->val + shift;                                    /* pointer to the nonzero values of the slice */
1419     for (row_in_slice = 0; row_in_slice < 8; ++row_in_slice) { /* loop over rows in the slice */
1420       row  = 8 * i + row_in_slice;
1421       nrow = a->rlen[row]; /* number of nonzeros in row */
1422       /*
1423         Search for the nearest nonzero. Normally setting the index to zero may cause extra communication.
1424         But if the entire slice are empty, it is fine to use 0 since the index will not be loaded.
1425       */
1426       lastcol = 0;
1427       if (nrow > 0) {                                /* nonempty row */
1428         lastcol = cp[8 * (nrow - 1) + row_in_slice]; /* use the index from the last nonzero at current row */
1429       } else if (!row_in_slice) {                    /* first row of the currect slice is empty */
1430         for (j = 1; j < 8; j++) {
1431           if (a->rlen[8 * i + j]) {
1432             lastcol = cp[j];
1433             break;
1434           }
1435         }
1436       } else {
1437         if (a->sliidx[i + 1] != shift) lastcol = cp[row_in_slice - 1]; /* use the index from the previous row */
1438       }
1439 
1440       for (k = nrow; k < (a->sliidx[i + 1] - shift) / 8; ++k) {
1441         cp[8 * k + row_in_slice] = lastcol;
1442         vp[8 * k + row_in_slice] = (MatScalar)0;
1443       }
1444     }
1445   }
1446 
1447   A->info.mallocs += a->reallocs;
1448   a->reallocs = 0;
1449 
1450   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1451   PetscFunctionReturn(0);
1452 }
1453 
1454 PetscErrorCode MatGetInfo_SeqSELL(Mat A, MatInfoType flag, MatInfo *info)
1455 {
1456   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1457 
1458   PetscFunctionBegin;
1459   info->block_size   = 1.0;
1460   info->nz_allocated = a->maxallocmat;
1461   info->nz_used      = a->sliidx[a->totalslices]; /* include padding zeros */
1462   info->nz_unneeded  = (a->maxallocmat - a->sliidx[a->totalslices]);
1463   info->assemblies   = A->num_ass;
1464   info->mallocs      = A->info.mallocs;
1465   info->memory       = 0; /* REVIEW ME */
1466   if (A->factortype) {
1467     info->fill_ratio_given  = A->info.fill_ratio_given;
1468     info->fill_ratio_needed = A->info.fill_ratio_needed;
1469     info->factor_mallocs    = A->info.factor_mallocs;
1470   } else {
1471     info->fill_ratio_given  = 0;
1472     info->fill_ratio_needed = 0;
1473     info->factor_mallocs    = 0;
1474   }
1475   PetscFunctionReturn(0);
1476 }
1477 
1478 PetscErrorCode MatSetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is)
1479 {
1480   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1481   PetscInt     shift, i, k, l, low, high, t, ii, row, col, nrow;
1482   PetscInt    *cp, nonew = a->nonew, lastcol = -1;
1483   MatScalar   *vp, value;
1484 
1485   PetscFunctionBegin;
1486   for (k = 0; k < m; k++) { /* loop over added rows */
1487     row = im[k];
1488     if (row < 0) continue;
1489     PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
1490     shift = a->sliidx[row >> 3] + (row & 0x07); /* starting index of the row */
1491     cp    = a->colidx + shift;                  /* pointer to the row */
1492     vp    = a->val + shift;                     /* pointer to the row */
1493     nrow  = a->rlen[row];
1494     low   = 0;
1495     high  = nrow;
1496 
1497     for (l = 0; l < n; l++) { /* loop over added columns */
1498       col = in[l];
1499       if (col < 0) continue;
1500       PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
1501       if (a->roworiented) {
1502         value = v[l + k * n];
1503       } else {
1504         value = v[k + l * m];
1505       }
1506       if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue;
1507 
1508       /* search in this row for the specified column, i indicates the column to be set */
1509       if (col <= lastcol) low = 0;
1510       else high = nrow;
1511       lastcol = col;
1512       while (high - low > 5) {
1513         t = (low + high) / 2;
1514         if (*(cp + t * 8) > col) high = t;
1515         else low = t;
1516       }
1517       for (i = low; i < high; i++) {
1518         if (*(cp + i * 8) > col) break;
1519         if (*(cp + i * 8) == col) {
1520           if (is == ADD_VALUES) *(vp + i * 8) += value;
1521           else *(vp + i * 8) = value;
1522           low = i + 1;
1523           goto noinsert;
1524         }
1525       }
1526       if (value == 0.0 && a->ignorezeroentries) goto noinsert;
1527       if (nonew == 1) goto noinsert;
1528       PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col);
1529       /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */
1530       MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, row / 8, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar);
1531       /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */
1532       for (ii = nrow - 1; ii >= i; ii--) {
1533         *(cp + (ii + 1) * 8) = *(cp + ii * 8);
1534         *(vp + (ii + 1) * 8) = *(vp + ii * 8);
1535       }
1536       a->rlen[row]++;
1537       *(cp + i * 8) = col;
1538       *(vp + i * 8) = value;
1539       a->nz++;
1540       A->nonzerostate++;
1541       low = i + 1;
1542       high++;
1543       nrow++;
1544     noinsert:;
1545     }
1546     a->rlen[row] = nrow;
1547   }
1548   PetscFunctionReturn(0);
1549 }
1550 
1551 PetscErrorCode MatCopy_SeqSELL(Mat A, Mat B, MatStructure str)
1552 {
1553   PetscFunctionBegin;
1554   /* If the two matrices have the same copy implementation, use fast copy. */
1555   if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
1556     Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1557     Mat_SeqSELL *b = (Mat_SeqSELL *)B->data;
1558 
1559     PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices], PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Number of nonzeros in two matrices are different");
1560     PetscCall(PetscArraycpy(b->val, a->val, a->sliidx[a->totalslices]));
1561   } else {
1562     PetscCall(MatCopy_Basic(A, B, str));
1563   }
1564   PetscFunctionReturn(0);
1565 }
1566 
1567 PetscErrorCode MatSetUp_SeqSELL(Mat A)
1568 {
1569   PetscFunctionBegin;
1570   PetscCall(MatSeqSELLSetPreallocation(A, PETSC_DEFAULT, NULL));
1571   PetscFunctionReturn(0);
1572 }
1573 
1574 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A, PetscScalar *array[])
1575 {
1576   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1577 
1578   PetscFunctionBegin;
1579   *array = a->val;
1580   PetscFunctionReturn(0);
1581 }
1582 
1583 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A, PetscScalar *array[])
1584 {
1585   PetscFunctionBegin;
1586   PetscFunctionReturn(0);
1587 }
1588 
1589 PetscErrorCode MatRealPart_SeqSELL(Mat A)
1590 {
1591   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1592   PetscInt     i;
1593   MatScalar   *aval = a->val;
1594 
1595   PetscFunctionBegin;
1596   for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscRealPart(aval[i]);
1597   PetscFunctionReturn(0);
1598 }
1599 
1600 PetscErrorCode MatImaginaryPart_SeqSELL(Mat A)
1601 {
1602   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1603   PetscInt     i;
1604   MatScalar   *aval = a->val;
1605 
1606   PetscFunctionBegin;
1607   for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]);
1608   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1609   PetscFunctionReturn(0);
1610 }
1611 
1612 PetscErrorCode MatScale_SeqSELL(Mat inA, PetscScalar alpha)
1613 {
1614   Mat_SeqSELL *a      = (Mat_SeqSELL *)inA->data;
1615   MatScalar   *aval   = a->val;
1616   PetscScalar  oalpha = alpha;
1617   PetscBLASInt one    = 1, size;
1618 
1619   PetscFunctionBegin;
1620   PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices], &size));
1621   PetscCallBLAS("BLASscal", BLASscal_(&size, &oalpha, aval, &one));
1622   PetscCall(PetscLogFlops(a->nz));
1623   PetscCall(MatSeqSELLInvalidateDiagonal(inA));
1624   PetscFunctionReturn(0);
1625 }
1626 
1627 PetscErrorCode MatShift_SeqSELL(Mat Y, PetscScalar a)
1628 {
1629   Mat_SeqSELL *y = (Mat_SeqSELL *)Y->data;
1630 
1631   PetscFunctionBegin;
1632   if (!Y->preallocated || !y->nz) PetscCall(MatSeqSELLSetPreallocation(Y, 1, NULL));
1633   PetscCall(MatShift_Basic(Y, a));
1634   PetscFunctionReturn(0);
1635 }
1636 
1637 PetscErrorCode MatSOR_SeqSELL(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
1638 {
1639   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
1640   PetscScalar       *x, sum, *t;
1641   const MatScalar   *idiag = NULL, *mdiag;
1642   const PetscScalar *b, *xb;
1643   PetscInt           n, m = A->rmap->n, i, j, shift;
1644   const PetscInt    *diag;
1645 
1646   PetscFunctionBegin;
1647   its = its * lits;
1648 
1649   if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */
1650   if (!a->idiagvalid) PetscCall(MatInvertDiagonal_SeqSELL(A, omega, fshift));
1651   a->fshift = fshift;
1652   a->omega  = omega;
1653 
1654   diag  = a->diag;
1655   t     = a->ssor_work;
1656   idiag = a->idiag;
1657   mdiag = a->mdiag;
1658 
1659   PetscCall(VecGetArray(xx, &x));
1660   PetscCall(VecGetArrayRead(bb, &b));
1661   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1662   PetscCheck(flag != SOR_APPLY_UPPER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_UPPER is not implemented");
1663   PetscCheck(flag != SOR_APPLY_LOWER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_LOWER is not implemented");
1664   PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat");
1665 
1666   if (flag & SOR_ZERO_INITIAL_GUESS) {
1667     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1668       for (i = 0; i < m; i++) {
1669         shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1670         sum   = b[i];
1671         n     = (diag[i] - shift) / 8;
1672         for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]];
1673         t[i] = sum;
1674         x[i] = sum * idiag[i];
1675       }
1676       xb = t;
1677       PetscCall(PetscLogFlops(a->nz));
1678     } else xb = b;
1679     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1680       for (i = m - 1; i >= 0; i--) {
1681         shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1682         sum   = xb[i];
1683         n     = a->rlen[i] - (diag[i] - shift) / 8 - 1;
1684         for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]];
1685         if (xb == b) {
1686           x[i] = sum * idiag[i];
1687         } else {
1688           x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1689         }
1690       }
1691       PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1692     }
1693     its--;
1694   }
1695   while (its--) {
1696     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1697       for (i = 0; i < m; i++) {
1698         /* lower */
1699         shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1700         sum   = b[i];
1701         n     = (diag[i] - shift) / 8;
1702         for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]];
1703         t[i] = sum; /* save application of the lower-triangular part */
1704         /* upper */
1705         n = a->rlen[i] - (diag[i] - shift) / 8 - 1;
1706         for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]];
1707         x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1708       }
1709       xb = t;
1710       PetscCall(PetscLogFlops(2.0 * a->nz));
1711     } else xb = b;
1712     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1713       for (i = m - 1; i >= 0; i--) {
1714         shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1715         sum   = xb[i];
1716         if (xb == b) {
1717           /* whole matrix (no checkpointing available) */
1718           n = a->rlen[i];
1719           for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]];
1720           x[i] = (1. - omega) * x[i] + (sum + mdiag[i] * x[i]) * idiag[i];
1721         } else { /* lower-triangular part has been saved, so only apply upper-triangular */
1722           n = a->rlen[i] - (diag[i] - shift) / 8 - 1;
1723           for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]];
1724           x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1725         }
1726       }
1727       if (xb == b) {
1728         PetscCall(PetscLogFlops(2.0 * a->nz));
1729       } else {
1730         PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1731       }
1732     }
1733   }
1734   PetscCall(VecRestoreArray(xx, &x));
1735   PetscCall(VecRestoreArrayRead(bb, &b));
1736   PetscFunctionReturn(0);
1737 }
1738 
1739 /* -------------------------------------------------------------------*/
1740 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL,
1741                                        MatGetRow_SeqSELL,
1742                                        MatRestoreRow_SeqSELL,
1743                                        MatMult_SeqSELL,
1744                                        /* 4*/ MatMultAdd_SeqSELL,
1745                                        MatMultTranspose_SeqSELL,
1746                                        MatMultTransposeAdd_SeqSELL,
1747                                        NULL,
1748                                        NULL,
1749                                        NULL,
1750                                        /* 10*/ NULL,
1751                                        NULL,
1752                                        NULL,
1753                                        MatSOR_SeqSELL,
1754                                        NULL,
1755                                        /* 15*/ MatGetInfo_SeqSELL,
1756                                        MatEqual_SeqSELL,
1757                                        MatGetDiagonal_SeqSELL,
1758                                        MatDiagonalScale_SeqSELL,
1759                                        NULL,
1760                                        /* 20*/ NULL,
1761                                        MatAssemblyEnd_SeqSELL,
1762                                        MatSetOption_SeqSELL,
1763                                        MatZeroEntries_SeqSELL,
1764                                        /* 24*/ NULL,
1765                                        NULL,
1766                                        NULL,
1767                                        NULL,
1768                                        NULL,
1769                                        /* 29*/ MatSetUp_SeqSELL,
1770                                        NULL,
1771                                        NULL,
1772                                        NULL,
1773                                        NULL,
1774                                        /* 34*/ MatDuplicate_SeqSELL,
1775                                        NULL,
1776                                        NULL,
1777                                        NULL,
1778                                        NULL,
1779                                        /* 39*/ NULL,
1780                                        NULL,
1781                                        NULL,
1782                                        MatGetValues_SeqSELL,
1783                                        MatCopy_SeqSELL,
1784                                        /* 44*/ NULL,
1785                                        MatScale_SeqSELL,
1786                                        MatShift_SeqSELL,
1787                                        NULL,
1788                                        NULL,
1789                                        /* 49*/ NULL,
1790                                        NULL,
1791                                        NULL,
1792                                        NULL,
1793                                        NULL,
1794                                        /* 54*/ MatFDColoringCreate_SeqXAIJ,
1795                                        NULL,
1796                                        NULL,
1797                                        NULL,
1798                                        NULL,
1799                                        /* 59*/ NULL,
1800                                        MatDestroy_SeqSELL,
1801                                        MatView_SeqSELL,
1802                                        NULL,
1803                                        NULL,
1804                                        /* 64*/ NULL,
1805                                        NULL,
1806                                        NULL,
1807                                        NULL,
1808                                        NULL,
1809                                        /* 69*/ NULL,
1810                                        NULL,
1811                                        NULL,
1812                                        NULL,
1813                                        NULL,
1814                                        /* 74*/ NULL,
1815                                        MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */
1816                                        NULL,
1817                                        NULL,
1818                                        NULL,
1819                                        /* 79*/ NULL,
1820                                        NULL,
1821                                        NULL,
1822                                        NULL,
1823                                        NULL,
1824                                        /* 84*/ NULL,
1825                                        NULL,
1826                                        NULL,
1827                                        NULL,
1828                                        NULL,
1829                                        /* 89*/ NULL,
1830                                        NULL,
1831                                        NULL,
1832                                        NULL,
1833                                        NULL,
1834                                        /* 94*/ NULL,
1835                                        NULL,
1836                                        NULL,
1837                                        NULL,
1838                                        NULL,
1839                                        /* 99*/ NULL,
1840                                        NULL,
1841                                        NULL,
1842                                        MatConjugate_SeqSELL,
1843                                        NULL,
1844                                        /*104*/ NULL,
1845                                        NULL,
1846                                        NULL,
1847                                        NULL,
1848                                        NULL,
1849                                        /*109*/ NULL,
1850                                        NULL,
1851                                        NULL,
1852                                        NULL,
1853                                        MatMissingDiagonal_SeqSELL,
1854                                        /*114*/ NULL,
1855                                        NULL,
1856                                        NULL,
1857                                        NULL,
1858                                        NULL,
1859                                        /*119*/ NULL,
1860                                        NULL,
1861                                        NULL,
1862                                        NULL,
1863                                        NULL,
1864                                        /*124*/ NULL,
1865                                        NULL,
1866                                        NULL,
1867                                        NULL,
1868                                        NULL,
1869                                        /*129*/ NULL,
1870                                        NULL,
1871                                        NULL,
1872                                        NULL,
1873                                        NULL,
1874                                        /*134*/ NULL,
1875                                        NULL,
1876                                        NULL,
1877                                        NULL,
1878                                        NULL,
1879                                        /*139*/ NULL,
1880                                        NULL,
1881                                        NULL,
1882                                        MatFDColoringSetUp_SeqXAIJ,
1883                                        NULL,
1884                                        /*144*/ NULL,
1885                                        NULL,
1886                                        NULL,
1887                                        NULL,
1888                                        NULL,
1889                                        NULL,
1890                                        /*150*/ NULL};
1891 
1892 PetscErrorCode MatStoreValues_SeqSELL(Mat mat)
1893 {
1894   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1895 
1896   PetscFunctionBegin;
1897   PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1898 
1899   /* allocate space for values if not already there */
1900   if (!a->saved_values) { PetscCall(PetscMalloc1(a->sliidx[a->totalslices] + 1, &a->saved_values)); }
1901 
1902   /* copy values over */
1903   PetscCall(PetscArraycpy(a->saved_values, a->val, a->sliidx[a->totalslices]));
1904   PetscFunctionReturn(0);
1905 }
1906 
1907 PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat)
1908 {
1909   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1910 
1911   PetscFunctionBegin;
1912   PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1913   PetscCheck(a->saved_values, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatStoreValues(A);first");
1914   PetscCall(PetscArraycpy(a->val, a->saved_values, a->sliidx[a->totalslices]));
1915   PetscFunctionReturn(0);
1916 }
1917 
1918 /*@C
1919  MatSeqSELLRestoreArray - returns access to the array where the data for a `MATSEQSELL` matrix is stored obtained by `MatSeqSELLGetArray()`
1920 
1921  Not Collective
1922 
1923  Input Parameters:
1924  .  mat - a `MATSEQSELL` matrix
1925  .  array - pointer to the data
1926 
1927  Level: intermediate
1928 
1929  .seealso: `MATSEQSELL`, `MatSeqSELLGetArray()`, `MatSeqSELLRestoreArrayF90()`
1930  @*/
1931 PetscErrorCode MatSeqSELLRestoreArray(Mat A, PetscScalar **array)
1932 {
1933   PetscFunctionBegin;
1934   PetscUseMethod(A, "MatSeqSELLRestoreArray_C", (Mat, PetscScalar **), (A, array));
1935   PetscFunctionReturn(0);
1936 }
1937 
1938 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B)
1939 {
1940   Mat_SeqSELL *b;
1941   PetscMPIInt  size;
1942 
1943   PetscFunctionBegin;
1944   PetscCall(PetscCitationsRegister(citation, &cited));
1945   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
1946   PetscCheck(size <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Comm must be of size 1");
1947 
1948   PetscCall(PetscNew(&b));
1949 
1950   B->data = (void *)b;
1951 
1952   PetscCall(PetscMemcpy(B->ops, &MatOps_Values, sizeof(struct _MatOps)));
1953 
1954   b->row                = NULL;
1955   b->col                = NULL;
1956   b->icol               = NULL;
1957   b->reallocs           = 0;
1958   b->ignorezeroentries  = PETSC_FALSE;
1959   b->roworiented        = PETSC_TRUE;
1960   b->nonew              = 0;
1961   b->diag               = NULL;
1962   b->solve_work         = NULL;
1963   B->spptr              = NULL;
1964   b->saved_values       = NULL;
1965   b->idiag              = NULL;
1966   b->mdiag              = NULL;
1967   b->ssor_work          = NULL;
1968   b->omega              = 1.0;
1969   b->fshift             = 0.0;
1970   b->idiagvalid         = PETSC_FALSE;
1971   b->keepnonzeropattern = PETSC_FALSE;
1972 
1973   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELL));
1974   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetArray_C", MatSeqSELLGetArray_SeqSELL));
1975   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLRestoreArray_C", MatSeqSELLRestoreArray_SeqSELL));
1976   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqSELL));
1977   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqSELL));
1978   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetPreallocation_C", MatSeqSELLSetPreallocation_SeqSELL));
1979   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqaij_C", MatConvert_SeqSELL_SeqAIJ));
1980   PetscFunctionReturn(0);
1981 }
1982 
1983 /*
1984  Given a matrix generated with MatGetFactor() duplicates all the information in A into B
1985  */
1986 PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace)
1987 {
1988   Mat_SeqSELL *c = (Mat_SeqSELL *)C->data, *a = (Mat_SeqSELL *)A->data;
1989   PetscInt     i, m                           = A->rmap->n;
1990   PetscInt     totalslices = a->totalslices;
1991 
1992   PetscFunctionBegin;
1993   C->factortype = A->factortype;
1994   c->row        = NULL;
1995   c->col        = NULL;
1996   c->icol       = NULL;
1997   c->reallocs   = 0;
1998   C->assembled  = PETSC_TRUE;
1999 
2000   PetscCall(PetscLayoutReference(A->rmap, &C->rmap));
2001   PetscCall(PetscLayoutReference(A->cmap, &C->cmap));
2002 
2003   PetscCall(PetscMalloc1(8 * totalslices, &c->rlen));
2004   PetscCall(PetscMalloc1(totalslices + 1, &c->sliidx));
2005 
2006   for (i = 0; i < m; i++) c->rlen[i] = a->rlen[i];
2007   for (i = 0; i < totalslices + 1; i++) c->sliidx[i] = a->sliidx[i];
2008 
2009   /* allocate the matrix space */
2010   if (mallocmatspace) {
2011     PetscCall(PetscMalloc2(a->maxallocmat, &c->val, a->maxallocmat, &c->colidx));
2012 
2013     c->singlemalloc = PETSC_TRUE;
2014 
2015     if (m > 0) {
2016       PetscCall(PetscArraycpy(c->colidx, a->colidx, a->maxallocmat));
2017       if (cpvalues == MAT_COPY_VALUES) {
2018         PetscCall(PetscArraycpy(c->val, a->val, a->maxallocmat));
2019       } else {
2020         PetscCall(PetscArrayzero(c->val, a->maxallocmat));
2021       }
2022     }
2023   }
2024 
2025   c->ignorezeroentries = a->ignorezeroentries;
2026   c->roworiented       = a->roworiented;
2027   c->nonew             = a->nonew;
2028   if (a->diag) {
2029     PetscCall(PetscMalloc1(m, &c->diag));
2030     for (i = 0; i < m; i++) c->diag[i] = a->diag[i];
2031   } else c->diag = NULL;
2032 
2033   c->solve_work         = NULL;
2034   c->saved_values       = NULL;
2035   c->idiag              = NULL;
2036   c->ssor_work          = NULL;
2037   c->keepnonzeropattern = a->keepnonzeropattern;
2038   c->free_val           = PETSC_TRUE;
2039   c->free_colidx        = PETSC_TRUE;
2040 
2041   c->maxallocmat  = a->maxallocmat;
2042   c->maxallocrow  = a->maxallocrow;
2043   c->rlenmax      = a->rlenmax;
2044   c->nz           = a->nz;
2045   C->preallocated = PETSC_TRUE;
2046 
2047   c->nonzerorowcnt = a->nonzerorowcnt;
2048   C->nonzerostate  = A->nonzerostate;
2049 
2050   PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist));
2051   PetscFunctionReturn(0);
2052 }
2053 
2054 PetscErrorCode MatDuplicate_SeqSELL(Mat A, MatDuplicateOption cpvalues, Mat *B)
2055 {
2056   PetscFunctionBegin;
2057   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
2058   PetscCall(MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n));
2059   if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2060   PetscCall(MatSetType(*B, ((PetscObject)A)->type_name));
2061   PetscCall(MatDuplicateNoCreate_SeqSELL(*B, A, cpvalues, PETSC_TRUE));
2062   PetscFunctionReturn(0);
2063 }
2064 
2065 /*MC
2066    MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices,
2067    based on the sliced Ellpack format
2068 
2069    Options Database Keys:
2070 . -mat_type seqsell - sets the matrix type to "`MATSEQELL` during a call to `MatSetFromOptions()`
2071 
2072    Level: beginner
2073 
2074 .seealso: `MatCreateSeqSell()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ`
2075 M*/
2076 
2077 /*MC
2078    MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices.
2079 
2080    This matrix type is identical to `MATSEQSELL` when constructed with a single process communicator,
2081    and `MATMPISELL` otherwise.  As a result, for single process communicators,
2082   `MatSeqSELLSetPreallocation()` is supported, and similarly `MatMPISELLSetPreallocation()` is supported
2083   for communicators controlling multiple processes.  It is recommended that you call both of
2084   the above preallocation routines for simplicity.
2085 
2086    Options Database Keys:
2087 . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions()
2088 
2089   Level: beginner
2090 
2091   Notes:
2092    This format is only supported for real scalars, double precision, and 32 bit indices (the defaults).
2093 
2094    It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of
2095    non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement.
2096 
2097   Developer Notes:
2098    On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance.
2099 
2100    The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8
2101 .vb
2102                             (2 0  3 4)
2103    Consider the matrix A =  (5 0  6 0)
2104                             (0 0  7 8)
2105                             (0 0  9 9)
2106 
2107    symbolically the Ellpack format can be written as
2108 
2109         (2 3 4 |)           (0 2 3 |)
2110    v =  (5 6 0 |)  colidx = (0 2 2 |)
2111         --------            ---------
2112         (7 8 |)             (2 3 |)
2113         (9 9 |)             (2 3 |)
2114 
2115     The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case).
2116     Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and
2117     zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice.
2118 
2119     The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9)  and for colidx is (0 0 2 2 3 2 2 2 3 3)
2120 
2121 .ve
2122 
2123       See MatMult_SeqSELL() for how this format is used with the SIMD operations to achieve high performance.
2124 
2125  References:
2126 . * - Hong Zhang, Richard T. Mills, Karl Rupp, and Barry F. Smith, Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512},
2127    Proceedings of the 47th International Conference on Parallel Processing, 2018.
2128 
2129 .seealso: `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSell()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ`
2130 M*/
2131 
2132 /*@C
2133        MatCreateSeqSELL - Creates a sparse matrix in `MATSEQSELL` format.
2134 
2135  Collective on comm
2136 
2137  Input Parameters:
2138 +  comm - MPI communicator, set to `PETSC_COMM_SELF`
2139 .  m - number of rows
2140 .  n - number of columns
2141 .  rlenmax - maximum number of nonzeros in a row
2142 -  rlen - array containing the number of nonzeros in the various rows
2143  (possibly different for each row) or NULL
2144 
2145  Output Parameter:
2146 .  A - the matrix
2147 
2148  It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
2149  MatXXXXSetPreallocation() paradigm instead of this routine directly.
2150  [MatXXXXSetPreallocation() is, for example, `MatSeqSELLSetPreallocation()`]
2151 
2152  Notes:
2153  If nnz is given then nz is ignored
2154 
2155  Specify the preallocated storage with either rlenmax or rlen (not both).
2156  Set rlenmax = `PETSC_DEFAULT` and rlen = NULL for PETSc to control dynamic memory
2157  allocation.  For large problems you MUST preallocate memory or you
2158  will get TERRIBLE performance, see the users' manual chapter on matrices.
2159 
2160  Level: intermediate
2161 
2162  .seealso: `MATSEQSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATSEQSELL`, `MATMPISELL`
2163  @*/
2164 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt maxallocrow, const PetscInt rlen[], Mat *A)
2165 {
2166   PetscFunctionBegin;
2167   PetscCall(MatCreate(comm, A));
2168   PetscCall(MatSetSizes(*A, m, n, m, n));
2169   PetscCall(MatSetType(*A, MATSEQSELL));
2170   PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A, maxallocrow, rlen));
2171   PetscFunctionReturn(0);
2172 }
2173 
2174 PetscErrorCode MatEqual_SeqSELL(Mat A, Mat B, PetscBool *flg)
2175 {
2176   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data, *b = (Mat_SeqSELL *)B->data;
2177   PetscInt     totalslices = a->totalslices;
2178 
2179   PetscFunctionBegin;
2180   /* If the  matrix dimensions are not equal,or no of nonzeros */
2181   if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) || (a->nz != b->nz) || (a->rlenmax != b->rlenmax)) {
2182     *flg = PETSC_FALSE;
2183     PetscFunctionReturn(0);
2184   }
2185   /* if the a->colidx are the same */
2186   PetscCall(PetscArraycmp(a->colidx, b->colidx, a->sliidx[totalslices], flg));
2187   if (!*flg) PetscFunctionReturn(0);
2188   /* if a->val are the same */
2189   PetscCall(PetscArraycmp(a->val, b->val, a->sliidx[totalslices], flg));
2190   PetscFunctionReturn(0);
2191 }
2192 
2193 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A)
2194 {
2195   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2196 
2197   PetscFunctionBegin;
2198   a->idiagvalid = PETSC_FALSE;
2199   PetscFunctionReturn(0);
2200 }
2201 
2202 PetscErrorCode MatConjugate_SeqSELL(Mat A)
2203 {
2204 #if defined(PETSC_USE_COMPLEX)
2205   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2206   PetscInt     i;
2207   PetscScalar *val = a->val;
2208 
2209   PetscFunctionBegin;
2210   for (i = 0; i < a->sliidx[a->totalslices]; i++) val[i] = PetscConj(val[i]);
2211 #else
2212   PetscFunctionBegin;
2213 #endif
2214   PetscFunctionReturn(0);
2215 }
2216