xref: /petsc/src/mat/impls/sell/seq/sell.c (revision d2522c19e8fa9bca20aaca277941d9a63e71db6a)
1 
2 /*
3   Defines the basic matrix operations for the SELL matrix storage format.
4 */
5 #include <../src/mat/impls/sell/seq/sell.h> /*I   "petscmat.h"  I*/
6 #include <petscblaslapack.h>
7 #include <petsc/private/kernels/blocktranspose.h>
8 
9 static PetscBool  cited      = PETSC_FALSE;
10 static const char citation[] = "@inproceedings{ZhangELLPACK2018,\n"
11                                " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n"
12                                " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n"
13                                " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n"
14                                " year = 2018\n"
15                                "}\n";
16 
17 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
18 
19 #include <immintrin.h>
20 
21 #if !defined(_MM_SCALE_8)
22 #define _MM_SCALE_8 8
23 #endif
24 
25 #if defined(__AVX512F__)
26 /* these do not work
27    vec_idx  = _mm512_loadunpackhi_epi32(vec_idx,acolidx);
28    vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval);
29   */
30 #define AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
31   /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \
32   vec_idx  = _mm256_loadu_si256((__m256i const *)acolidx); \
33   vec_vals = _mm512_loadu_pd(aval); \
34   vec_x    = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); \
35   vec_y    = _mm512_fmadd_pd(vec_x, vec_vals, vec_y)
36 #elif defined(__AVX2__) && defined(__FMA__)
37 #define AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
38   vec_vals = _mm256_loadu_pd(aval); \
39   vec_idx  = _mm_loadu_si128((__m128i const *)acolidx); /* SSE2 */ \
40   vec_x    = _mm256_i32gather_pd(x, vec_idx, _MM_SCALE_8); \
41   vec_y    = _mm256_fmadd_pd(vec_x, vec_vals, vec_y)
42 #endif
43 #endif /* PETSC_HAVE_IMMINTRIN_H */
44 
45 /*@C
46  MatSeqSELLSetPreallocation - For good matrix assembly performance
47  the user should preallocate the matrix storage by setting the parameter nz
48  (or the array nnz).  By setting these parameters accurately, performance
49  during matrix assembly can be increased significantly.
50 
51  Collective
52 
53  Input Parameters:
54  +  B - The matrix
55  .  nz - number of nonzeros per row (same for all rows)
56  -  nnz - array containing the number of nonzeros in the various rows
57  (possibly different for each row) or NULL
58 
59  Notes:
60  If nnz is given then nz is ignored.
61 
62  Specify the preallocated storage with either nz or nnz (not both).
63  Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
64  allocation.  For large problems you MUST preallocate memory or you
65  will get TERRIBLE performance, see the users' manual chapter on matrices.
66 
67  You can call MatGetInfo() to get information on how effective the preallocation was;
68  for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
69  You can also run with the option -info and look for messages with the string
70  malloc in them to see if additional memory allocation was needed.
71 
72  Developers: Use nz of MAT_SKIP_ALLOCATION to not allocate any space for the matrix
73  entries or columns indices.
74 
75  The maximum number of nonzeos in any row should be as accurate as possible.
76  If it is underestimated, you will get bad performance due to reallocation
77  (MatSeqXSELLReallocateSELL).
78 
79  Level: intermediate
80 
81  .seealso: `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()`
82 
83  @*/
84 PetscErrorCode MatSeqSELLSetPreallocation(Mat B, PetscInt rlenmax, const PetscInt rlen[]) {
85   PetscFunctionBegin;
86   PetscValidHeaderSpecific(B, MAT_CLASSID, 1);
87   PetscValidType(B, 1);
88   PetscTryMethod(B, "MatSeqSELLSetPreallocation_C", (Mat, PetscInt, const PetscInt[]), (B, rlenmax, rlen));
89   PetscFunctionReturn(0);
90 }
91 
92 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B, PetscInt maxallocrow, const PetscInt rlen[]) {
93   Mat_SeqSELL *b;
94   PetscInt     i, j, totalslices;
95   PetscBool    skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE;
96 
97   PetscFunctionBegin;
98   if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE;
99   if (maxallocrow == MAT_SKIP_ALLOCATION) {
100     skipallocation = PETSC_TRUE;
101     maxallocrow    = 0;
102   }
103 
104   PetscCall(PetscLayoutSetUp(B->rmap));
105   PetscCall(PetscLayoutSetUp(B->cmap));
106 
107   /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */
108   if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5;
109   PetscCheck(maxallocrow >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "maxallocrow cannot be less than 0: value %" PetscInt_FMT, maxallocrow);
110   if (rlen) {
111     for (i = 0; i < B->rmap->n; i++) {
112       PetscCheck(rlen[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, rlen[i]);
113       PetscCheck(rlen[i] <= B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT, i, rlen[i], B->cmap->n);
114     }
115   }
116 
117   B->preallocated = PETSC_TRUE;
118 
119   b = (Mat_SeqSELL *)B->data;
120 
121   totalslices    = PetscCeilInt(B->rmap->n, 8);
122   b->totalslices = totalslices;
123   if (!skipallocation) {
124     if (B->rmap->n & 0x07) PetscCall(PetscInfo(B, "Padding rows to the SEQSELL matrix because the number of rows is not the multiple of 8 (value %" PetscInt_FMT ")\n", B->rmap->n));
125 
126     if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */
127       PetscCall(PetscMalloc1(totalslices + 1, &b->sliidx));
128       PetscCall(PetscLogObjectMemory((PetscObject)B, (totalslices + 1) * sizeof(PetscInt)));
129     }
130     if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */
131       if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10;
132       else if (maxallocrow < 0) maxallocrow = 1;
133       for (i = 0; i <= totalslices; i++) b->sliidx[i] = i * 8 * maxallocrow;
134     } else {
135       maxallocrow  = 0;
136       b->sliidx[0] = 0;
137       for (i = 1; i < totalslices; i++) {
138         b->sliidx[i] = 0;
139         for (j = 0; j < 8; j++) { b->sliidx[i] = PetscMax(b->sliidx[i], rlen[8 * (i - 1) + j]); }
140         maxallocrow = PetscMax(b->sliidx[i], maxallocrow);
141         PetscCall(PetscIntSumError(b->sliidx[i - 1], 8 * b->sliidx[i], &b->sliidx[i]));
142       }
143       /* last slice */
144       b->sliidx[totalslices] = 0;
145       for (j = (totalslices - 1) * 8; j < B->rmap->n; j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices], rlen[j]);
146       maxallocrow            = PetscMax(b->sliidx[totalslices], maxallocrow);
147       b->sliidx[totalslices] = b->sliidx[totalslices - 1] + 8 * b->sliidx[totalslices];
148     }
149 
150     /* allocate space for val, colidx, rlen */
151     /* FIXME: should B's old memory be unlogged? */
152     PetscCall(MatSeqXSELLFreeSELL(B, &b->val, &b->colidx));
153     /* FIXME: assuming an element of the bit array takes 8 bits */
154     PetscCall(PetscMalloc2(b->sliidx[totalslices], &b->val, b->sliidx[totalslices], &b->colidx));
155     PetscCall(PetscLogObjectMemory((PetscObject)B, b->sliidx[totalslices] * (sizeof(PetscScalar) + sizeof(PetscInt))));
156     /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */
157     PetscCall(PetscCalloc1(8 * totalslices, &b->rlen));
158     PetscCall(PetscLogObjectMemory((PetscObject)B, 8 * totalslices * sizeof(PetscInt)));
159 
160     b->singlemalloc = PETSC_TRUE;
161     b->free_val     = PETSC_TRUE;
162     b->free_colidx  = PETSC_TRUE;
163   } else {
164     b->free_val    = PETSC_FALSE;
165     b->free_colidx = PETSC_FALSE;
166   }
167 
168   b->nz               = 0;
169   b->maxallocrow      = maxallocrow;
170   b->rlenmax          = maxallocrow;
171   b->maxallocmat      = b->sliidx[totalslices];
172   B->info.nz_unneeded = (double)b->maxallocmat;
173   if (realalloc) PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));
174   PetscFunctionReturn(0);
175 }
176 
177 PetscErrorCode MatGetRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v) {
178   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
179   PetscInt     shift;
180 
181   PetscFunctionBegin;
182   PetscCheck(row >= 0 && row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range", row);
183   if (nz) *nz = a->rlen[row];
184   shift = a->sliidx[row >> 3] + (row & 0x07);
185   if (!a->getrowcols) { PetscCall(PetscMalloc2(a->rlenmax, &a->getrowcols, a->rlenmax, &a->getrowvals)); }
186   if (idx) {
187     PetscInt j;
188     for (j = 0; j < a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift + 8 * j];
189     *idx = a->getrowcols;
190   }
191   if (v) {
192     PetscInt j;
193     for (j = 0; j < a->rlen[row]; j++) a->getrowvals[j] = a->val[shift + 8 * j];
194     *v = a->getrowvals;
195   }
196   PetscFunctionReturn(0);
197 }
198 
199 PetscErrorCode MatRestoreRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v) {
200   PetscFunctionBegin;
201   PetscFunctionReturn(0);
202 }
203 
204 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat) {
205   Mat          B;
206   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
207   PetscInt     i;
208 
209   PetscFunctionBegin;
210   if (reuse == MAT_REUSE_MATRIX) {
211     B = *newmat;
212     PetscCall(MatZeroEntries(B));
213   } else {
214     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
215     PetscCall(MatSetSizes(B, A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N));
216     PetscCall(MatSetType(B, MATSEQAIJ));
217     PetscCall(MatSeqAIJSetPreallocation(B, 0, a->rlen));
218   }
219 
220   for (i = 0; i < A->rmap->n; i++) {
221     PetscInt     nz = 0, *cols = NULL;
222     PetscScalar *vals = NULL;
223 
224     PetscCall(MatGetRow_SeqSELL(A, i, &nz, &cols, &vals));
225     PetscCall(MatSetValues(B, 1, &i, nz, cols, vals, INSERT_VALUES));
226     PetscCall(MatRestoreRow_SeqSELL(A, i, &nz, &cols, &vals));
227   }
228 
229   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
230   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
231   B->rmap->bs = A->rmap->bs;
232 
233   if (reuse == MAT_INPLACE_MATRIX) {
234     PetscCall(MatHeaderReplace(A, &B));
235   } else {
236     *newmat = B;
237   }
238   PetscFunctionReturn(0);
239 }
240 
241 #include <../src/mat/impls/aij/seq/aij.h>
242 
243 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A, MatType newtype, MatReuse reuse, Mat *newmat) {
244   Mat                B;
245   Mat_SeqAIJ        *a  = (Mat_SeqAIJ *)A->data;
246   PetscInt          *ai = a->i, m = A->rmap->N, n = A->cmap->N, i, *rowlengths, row, ncols;
247   const PetscInt    *cols;
248   const PetscScalar *vals;
249 
250   PetscFunctionBegin;
251 
252   if (reuse == MAT_REUSE_MATRIX) {
253     B = *newmat;
254   } else {
255     if (PetscDefined(USE_DEBUG) || !a->ilen) {
256       PetscCall(PetscMalloc1(m, &rowlengths));
257       for (i = 0; i < m; i++) { rowlengths[i] = ai[i + 1] - ai[i]; }
258     }
259     if (PetscDefined(USE_DEBUG) && a->ilen) {
260       PetscBool eq;
261       PetscCall(PetscMemcmp(rowlengths, a->ilen, m * sizeof(PetscInt), &eq));
262       PetscCheck(eq, PETSC_COMM_SELF, PETSC_ERR_PLIB, "SeqAIJ ilen array incorrect");
263       PetscCall(PetscFree(rowlengths));
264       rowlengths = a->ilen;
265     } else if (a->ilen) rowlengths = a->ilen;
266     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
267     PetscCall(MatSetSizes(B, m, n, m, n));
268     PetscCall(MatSetType(B, MATSEQSELL));
269     PetscCall(MatSeqSELLSetPreallocation(B, 0, rowlengths));
270     if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths));
271   }
272 
273   for (row = 0; row < m; row++) {
274     PetscCall(MatGetRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
275     PetscCall(MatSetValues_SeqSELL(B, 1, &row, ncols, cols, vals, INSERT_VALUES));
276     PetscCall(MatRestoreRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
277   }
278   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
279   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
280   B->rmap->bs = A->rmap->bs;
281 
282   if (reuse == MAT_INPLACE_MATRIX) {
283     PetscCall(MatHeaderReplace(A, &B));
284   } else {
285     *newmat = B;
286   }
287   PetscFunctionReturn(0);
288 }
289 
290 PetscErrorCode MatMult_SeqSELL(Mat A, Vec xx, Vec yy) {
291   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
292   PetscScalar       *y;
293   const PetscScalar *x;
294   const MatScalar   *aval        = a->val;
295   PetscInt           totalslices = a->totalslices;
296   const PetscInt    *acolidx     = a->colidx;
297   PetscInt           i, j;
298 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
299   __m512d  vec_x, vec_y, vec_vals;
300   __m256i  vec_idx;
301   __mmask8 mask;
302   __m512d  vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
303   __m256i  vec_idx2, vec_idx3, vec_idx4;
304 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
305   __m128i   vec_idx;
306   __m256d   vec_x, vec_y, vec_y2, vec_vals;
307   MatScalar yval;
308   PetscInt  r, rows_left, row, nnz_in_row;
309 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
310   __m128d   vec_x_tmp;
311   __m256d   vec_x, vec_y, vec_y2, vec_vals;
312   MatScalar yval;
313   PetscInt  r, rows_left, row, nnz_in_row;
314 #else
315   PetscScalar sum[8];
316 #endif
317 
318 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
319 #pragma disjoint(*x, *y, *aval)
320 #endif
321 
322   PetscFunctionBegin;
323   PetscCall(VecGetArrayRead(xx, &x));
324   PetscCall(VecGetArray(yy, &y));
325 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
326   for (i = 0; i < totalslices; i++) { /* loop over slices */
327     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
328     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
329 
330     vec_y  = _mm512_setzero_pd();
331     vec_y2 = _mm512_setzero_pd();
332     vec_y3 = _mm512_setzero_pd();
333     vec_y4 = _mm512_setzero_pd();
334 
335     j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice columnn */
336     switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
337     case 3:
338       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
339       acolidx += 8;
340       aval += 8;
341       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
342       acolidx += 8;
343       aval += 8;
344       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
345       acolidx += 8;
346       aval += 8;
347       j += 3;
348       break;
349     case 2:
350       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
351       acolidx += 8;
352       aval += 8;
353       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
354       acolidx += 8;
355       aval += 8;
356       j += 2;
357       break;
358     case 1:
359       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
360       acolidx += 8;
361       aval += 8;
362       j += 1;
363       break;
364     }
365 #pragma novector
366     for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
367       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
368       acolidx += 8;
369       aval += 8;
370       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
371       acolidx += 8;
372       aval += 8;
373       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
374       acolidx += 8;
375       aval += 8;
376       AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
377       acolidx += 8;
378       aval += 8;
379     }
380 
381     vec_y = _mm512_add_pd(vec_y, vec_y2);
382     vec_y = _mm512_add_pd(vec_y, vec_y3);
383     vec_y = _mm512_add_pd(vec_y, vec_y4);
384     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
385       mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
386       _mm512_mask_storeu_pd(&y[8 * i], mask, vec_y);
387     } else {
388       _mm512_storeu_pd(&y[8 * i], vec_y);
389     }
390   }
391 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
392   for (i = 0; i < totalslices; i++) { /* loop over full slices */
393     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
394     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
395 
396     /* last slice may have padding rows. Don't use vectorization. */
397     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
398       rows_left = A->rmap->n - 8 * i;
399       for (r = 0; r < rows_left; ++r) {
400         yval       = (MatScalar)0;
401         row        = 8 * i + r;
402         nnz_in_row = a->rlen[row];
403         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
404         y[row] = yval;
405       }
406       break;
407     }
408 
409     vec_y  = _mm256_setzero_pd();
410     vec_y2 = _mm256_setzero_pd();
411 
412 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
413 #pragma novector
414 #pragma unroll(2)
415     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
416       AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
417       aval += 4;
418       acolidx += 4;
419       AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y2);
420       aval += 4;
421       acolidx += 4;
422     }
423 
424     _mm256_storeu_pd(y + i * 8, vec_y);
425     _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
426   }
427 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
428   for (i = 0; i < totalslices; i++) { /* loop over full slices */
429     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
430     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
431 
432     vec_y  = _mm256_setzero_pd();
433     vec_y2 = _mm256_setzero_pd();
434 
435     /* last slice may have padding rows. Don't use vectorization. */
436     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
437       rows_left = A->rmap->n - 8 * i;
438       for (r = 0; r < rows_left; ++r) {
439         yval       = (MatScalar)0;
440         row        = 8 * i + r;
441         nnz_in_row = a->rlen[row];
442         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
443         y[row] = yval;
444       }
445       break;
446     }
447 
448 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
449 #pragma novector
450 #pragma unroll(2)
451     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
452       vec_vals  = _mm256_loadu_pd(aval);
453       vec_x_tmp = _mm_setzero_pd();
454       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
455       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
456       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
457       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
458       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
459       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
460       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
461       aval += 4;
462 
463       vec_vals  = _mm256_loadu_pd(aval);
464       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
465       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
466       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
467       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
468       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
469       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
470       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
471       aval += 4;
472     }
473 
474     _mm256_storeu_pd(y + i * 8, vec_y);
475     _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
476   }
477 #else
478   for (i = 0; i < totalslices; i++) { /* loop over slices */
479     for (j = 0; j < 8; j++) sum[j] = 0.0;
480     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
481       sum[0] += aval[j] * x[acolidx[j]];
482       sum[1] += aval[j + 1] * x[acolidx[j + 1]];
483       sum[2] += aval[j + 2] * x[acolidx[j + 2]];
484       sum[3] += aval[j + 3] * x[acolidx[j + 3]];
485       sum[4] += aval[j + 4] * x[acolidx[j + 4]];
486       sum[5] += aval[j + 5] * x[acolidx[j + 5]];
487       sum[6] += aval[j + 6] * x[acolidx[j + 6]];
488       sum[7] += aval[j + 7] * x[acolidx[j + 7]];
489     }
490     if (i == totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
491       for (j = 0; j < (A->rmap->n & 0x07); j++) y[8 * i + j] = sum[j];
492     } else {
493       for (j = 0; j < 8; j++) y[8 * i + j] = sum[j];
494     }
495   }
496 #endif
497 
498   PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); /* theoretical minimal FLOPs */
499   PetscCall(VecRestoreArrayRead(xx, &x));
500   PetscCall(VecRestoreArray(yy, &y));
501   PetscFunctionReturn(0);
502 }
503 
504 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h>
505 PetscErrorCode MatMultAdd_SeqSELL(Mat A, Vec xx, Vec yy, Vec zz) {
506   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
507   PetscScalar       *y, *z;
508   const PetscScalar *x;
509   const MatScalar   *aval        = a->val;
510   PetscInt           totalslices = a->totalslices;
511   const PetscInt    *acolidx     = a->colidx;
512   PetscInt           i, j;
513 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
514   __m512d  vec_x, vec_y, vec_vals;
515   __m256i  vec_idx;
516   __mmask8 mask;
517   __m512d  vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
518   __m256i  vec_idx2, vec_idx3, vec_idx4;
519 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
520   __m128d   vec_x_tmp;
521   __m256d   vec_x, vec_y, vec_y2, vec_vals;
522   MatScalar yval;
523   PetscInt  r, row, nnz_in_row;
524 #else
525   PetscScalar sum[8];
526 #endif
527 
528 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
529 #pragma disjoint(*x, *y, *aval)
530 #endif
531 
532   PetscFunctionBegin;
533   PetscCall(VecGetArrayRead(xx, &x));
534   PetscCall(VecGetArrayPair(yy, zz, &y, &z));
535 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
536   for (i = 0; i < totalslices; i++) { /* loop over slices */
537     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
538     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
539 
540     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
541       mask  = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
542       vec_y = _mm512_mask_loadu_pd(vec_y, mask, &y[8 * i]);
543     } else {
544       vec_y = _mm512_loadu_pd(&y[8 * i]);
545     }
546     vec_y2 = _mm512_setzero_pd();
547     vec_y3 = _mm512_setzero_pd();
548     vec_y4 = _mm512_setzero_pd();
549 
550     j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice columnn */
551     switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
552     case 3:
553       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
554       acolidx += 8;
555       aval += 8;
556       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
557       acolidx += 8;
558       aval += 8;
559       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
560       acolidx += 8;
561       aval += 8;
562       j += 3;
563       break;
564     case 2:
565       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
566       acolidx += 8;
567       aval += 8;
568       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
569       acolidx += 8;
570       aval += 8;
571       j += 2;
572       break;
573     case 1:
574       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
575       acolidx += 8;
576       aval += 8;
577       j += 1;
578       break;
579     }
580 #pragma novector
581     for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
582       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
583       acolidx += 8;
584       aval += 8;
585       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
586       acolidx += 8;
587       aval += 8;
588       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
589       acolidx += 8;
590       aval += 8;
591       AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
592       acolidx += 8;
593       aval += 8;
594     }
595 
596     vec_y = _mm512_add_pd(vec_y, vec_y2);
597     vec_y = _mm512_add_pd(vec_y, vec_y3);
598     vec_y = _mm512_add_pd(vec_y, vec_y4);
599     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
600       _mm512_mask_storeu_pd(&z[8 * i], mask, vec_y);
601     } else {
602       _mm512_storeu_pd(&z[8 * i], vec_y);
603     }
604   }
605 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
606   for (i = 0; i < totalslices; i++) { /* loop over full slices */
607     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
608     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
609 
610     /* last slice may have padding rows. Don't use vectorization. */
611     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
612       for (r = 0; r < (A->rmap->n & 0x07); ++r) {
613         row        = 8 * i + r;
614         yval       = (MatScalar)0.0;
615         nnz_in_row = a->rlen[row];
616         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
617         z[row] = y[row] + yval;
618       }
619       break;
620     }
621 
622     vec_y  = _mm256_loadu_pd(y + 8 * i);
623     vec_y2 = _mm256_loadu_pd(y + 8 * i + 4);
624 
625     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
626     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
627       vec_vals  = _mm256_loadu_pd(aval);
628       vec_x_tmp = _mm_setzero_pd();
629       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
630       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
631       vec_x     = _mm256_setzero_pd();
632       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
633       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
634       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
635       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
636       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
637       aval += 4;
638 
639       vec_vals  = _mm256_loadu_pd(aval);
640       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
641       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
642       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
643       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
644       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
645       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
646       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
647       aval += 4;
648     }
649 
650     _mm256_storeu_pd(z + i * 8, vec_y);
651     _mm256_storeu_pd(z + i * 8 + 4, vec_y2);
652   }
653 #else
654   for (i = 0; i < totalslices; i++) { /* loop over slices */
655     for (j = 0; j < 8; j++) sum[j] = 0.0;
656     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
657       sum[0] += aval[j] * x[acolidx[j]];
658       sum[1] += aval[j + 1] * x[acolidx[j + 1]];
659       sum[2] += aval[j + 2] * x[acolidx[j + 2]];
660       sum[3] += aval[j + 3] * x[acolidx[j + 3]];
661       sum[4] += aval[j + 4] * x[acolidx[j + 4]];
662       sum[5] += aval[j + 5] * x[acolidx[j + 5]];
663       sum[6] += aval[j + 6] * x[acolidx[j + 6]];
664       sum[7] += aval[j + 7] * x[acolidx[j + 7]];
665     }
666     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
667       for (j = 0; j < (A->rmap->n & 0x07); j++) z[8 * i + j] = y[8 * i + j] + sum[j];
668     } else {
669       for (j = 0; j < 8; j++) z[8 * i + j] = y[8 * i + j] + sum[j];
670     }
671   }
672 #endif
673 
674   PetscCall(PetscLogFlops(2.0 * a->nz));
675   PetscCall(VecRestoreArrayRead(xx, &x));
676   PetscCall(VecRestoreArrayPair(yy, zz, &y, &z));
677   PetscFunctionReturn(0);
678 }
679 
680 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A, Vec xx, Vec zz, Vec yy) {
681   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
682   PetscScalar       *y;
683   const PetscScalar *x;
684   const MatScalar   *aval    = a->val;
685   const PetscInt    *acolidx = a->colidx;
686   PetscInt           i, j, r, row, nnz_in_row, totalslices = a->totalslices;
687 
688 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
689 #pragma disjoint(*x, *y, *aval)
690 #endif
691 
692   PetscFunctionBegin;
693   if (A->symmetric == PETSC_BOOL3_TRUE) {
694     PetscCall(MatMultAdd_SeqSELL(A, xx, zz, yy));
695     PetscFunctionReturn(0);
696   }
697   if (zz != yy) PetscCall(VecCopy(zz, yy));
698   PetscCall(VecGetArrayRead(xx, &x));
699   PetscCall(VecGetArray(yy, &y));
700   for (i = 0; i < a->totalslices; i++) { /* loop over slices */
701     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
702       for (r = 0; r < (A->rmap->n & 0x07); ++r) {
703         row        = 8 * i + r;
704         nnz_in_row = a->rlen[row];
705         for (j = 0; j < nnz_in_row; ++j) y[acolidx[8 * j + r]] += aval[8 * j + r] * x[row];
706       }
707       break;
708     }
709     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
710       y[acolidx[j]] += aval[j] * x[8 * i];
711       y[acolidx[j + 1]] += aval[j + 1] * x[8 * i + 1];
712       y[acolidx[j + 2]] += aval[j + 2] * x[8 * i + 2];
713       y[acolidx[j + 3]] += aval[j + 3] * x[8 * i + 3];
714       y[acolidx[j + 4]] += aval[j + 4] * x[8 * i + 4];
715       y[acolidx[j + 5]] += aval[j + 5] * x[8 * i + 5];
716       y[acolidx[j + 6]] += aval[j + 6] * x[8 * i + 6];
717       y[acolidx[j + 7]] += aval[j + 7] * x[8 * i + 7];
718     }
719   }
720   PetscCall(PetscLogFlops(2.0 * a->sliidx[a->totalslices]));
721   PetscCall(VecRestoreArrayRead(xx, &x));
722   PetscCall(VecRestoreArray(yy, &y));
723   PetscFunctionReturn(0);
724 }
725 
726 PetscErrorCode MatMultTranspose_SeqSELL(Mat A, Vec xx, Vec yy) {
727   PetscFunctionBegin;
728   if (A->symmetric == PETSC_BOOL3_TRUE) {
729     PetscCall(MatMult_SeqSELL(A, xx, yy));
730   } else {
731     PetscCall(VecSet(yy, 0.0));
732     PetscCall(MatMultTransposeAdd_SeqSELL(A, xx, yy, yy));
733   }
734   PetscFunctionReturn(0);
735 }
736 
737 /*
738      Checks for missing diagonals
739 */
740 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A, PetscBool *missing, PetscInt *d) {
741   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
742   PetscInt    *diag, i;
743 
744   PetscFunctionBegin;
745   *missing = PETSC_FALSE;
746   if (A->rmap->n > 0 && !(a->colidx)) {
747     *missing = PETSC_TRUE;
748     if (d) *d = 0;
749     PetscCall(PetscInfo(A, "Matrix has no entries therefore is missing diagonal\n"));
750   } else {
751     diag = a->diag;
752     for (i = 0; i < A->rmap->n; i++) {
753       if (diag[i] == -1) {
754         *missing = PETSC_TRUE;
755         if (d) *d = i;
756         PetscCall(PetscInfo(A, "Matrix is missing diagonal number %" PetscInt_FMT "\n", i));
757         break;
758       }
759     }
760   }
761   PetscFunctionReturn(0);
762 }
763 
764 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A) {
765   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
766   PetscInt     i, j, m = A->rmap->n, shift;
767 
768   PetscFunctionBegin;
769   if (!a->diag) {
770     PetscCall(PetscMalloc1(m, &a->diag));
771     PetscCall(PetscLogObjectMemory((PetscObject)A, m * sizeof(PetscInt)));
772     a->free_diag = PETSC_TRUE;
773   }
774   for (i = 0; i < m; i++) {                      /* loop over rows */
775     shift      = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
776     a->diag[i] = -1;
777     for (j = 0; j < a->rlen[i]; j++) {
778       if (a->colidx[shift + j * 8] == i) {
779         a->diag[i] = shift + j * 8;
780         break;
781       }
782     }
783   }
784   PetscFunctionReturn(0);
785 }
786 
787 /*
788   Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways
789 */
790 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A, PetscScalar omega, PetscScalar fshift) {
791   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
792   PetscInt     i, *diag, m = A->rmap->n;
793   MatScalar   *val = a->val;
794   PetscScalar *idiag, *mdiag;
795 
796   PetscFunctionBegin;
797   if (a->idiagvalid) PetscFunctionReturn(0);
798   PetscCall(MatMarkDiagonal_SeqSELL(A));
799   diag = a->diag;
800   if (!a->idiag) {
801     PetscCall(PetscMalloc3(m, &a->idiag, m, &a->mdiag, m, &a->ssor_work));
802     PetscCall(PetscLogObjectMemory((PetscObject)A, 3 * m * sizeof(PetscScalar)));
803     val = a->val;
804   }
805   mdiag = a->mdiag;
806   idiag = a->idiag;
807 
808   if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) {
809     for (i = 0; i < m; i++) {
810       mdiag[i] = val[diag[i]];
811       if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */
812         if (PetscRealPart(fshift)) {
813           PetscCall(PetscInfo(A, "Zero diagonal on row %" PetscInt_FMT "\n", i));
814           A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
815           A->factorerror_zeropivot_value = 0.0;
816           A->factorerror_zeropivot_row   = i;
817         } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Zero diagonal on row %" PetscInt_FMT, i);
818       }
819       idiag[i] = 1.0 / val[diag[i]];
820     }
821     PetscCall(PetscLogFlops(m));
822   } else {
823     for (i = 0; i < m; i++) {
824       mdiag[i] = val[diag[i]];
825       idiag[i] = omega / (fshift + val[diag[i]]);
826     }
827     PetscCall(PetscLogFlops(2.0 * m));
828   }
829   a->idiagvalid = PETSC_TRUE;
830   PetscFunctionReturn(0);
831 }
832 
833 PetscErrorCode MatZeroEntries_SeqSELL(Mat A) {
834   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
835 
836   PetscFunctionBegin;
837   PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices]));
838   PetscCall(MatSeqSELLInvalidateDiagonal(A));
839   PetscFunctionReturn(0);
840 }
841 
842 PetscErrorCode MatDestroy_SeqSELL(Mat A) {
843   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
844 
845   PetscFunctionBegin;
846 #if defined(PETSC_USE_LOG)
847   PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->n, A->cmap->n, a->nz);
848 #endif
849   PetscCall(MatSeqXSELLFreeSELL(A, &a->val, &a->colidx));
850   PetscCall(ISDestroy(&a->row));
851   PetscCall(ISDestroy(&a->col));
852   PetscCall(PetscFree(a->diag));
853   PetscCall(PetscFree(a->rlen));
854   PetscCall(PetscFree(a->sliidx));
855   PetscCall(PetscFree3(a->idiag, a->mdiag, a->ssor_work));
856   PetscCall(PetscFree(a->solve_work));
857   PetscCall(ISDestroy(&a->icol));
858   PetscCall(PetscFree(a->saved_values));
859   PetscCall(PetscFree2(a->getrowcols, a->getrowvals));
860 
861   PetscCall(PetscFree(A->data));
862 
863   PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL));
864   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL));
865   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL));
866   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetPreallocation_C", NULL));
867   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetArray_C", NULL));
868   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLRestoreArray_C", NULL));
869   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqaij_C", NULL));
870   PetscFunctionReturn(0);
871 }
872 
873 PetscErrorCode MatSetOption_SeqSELL(Mat A, MatOption op, PetscBool flg) {
874   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
875 
876   PetscFunctionBegin;
877   switch (op) {
878   case MAT_ROW_ORIENTED: a->roworiented = flg; break;
879   case MAT_KEEP_NONZERO_PATTERN: a->keepnonzeropattern = flg; break;
880   case MAT_NEW_NONZERO_LOCATIONS: a->nonew = (flg ? 0 : 1); break;
881   case MAT_NEW_NONZERO_LOCATION_ERR: a->nonew = (flg ? -1 : 0); break;
882   case MAT_NEW_NONZERO_ALLOCATION_ERR: a->nonew = (flg ? -2 : 0); break;
883   case MAT_UNUSED_NONZERO_LOCATION_ERR: a->nounused = (flg ? -1 : 0); break;
884   case MAT_FORCE_DIAGONAL_ENTRIES:
885   case MAT_IGNORE_OFF_PROC_ENTRIES:
886   case MAT_USE_HASH_TABLE:
887   case MAT_SORTED_FULL: PetscCall(PetscInfo(A, "Option %s ignored\n", MatOptions[op])); break;
888   case MAT_SPD:
889   case MAT_SYMMETRIC:
890   case MAT_STRUCTURALLY_SYMMETRIC:
891   case MAT_HERMITIAN:
892   case MAT_SYMMETRY_ETERNAL:
893   case MAT_STRUCTURAL_SYMMETRY_ETERNAL:
894   case MAT_SPD_ETERNAL:
895     /* These options are handled directly by MatSetOption() */
896     break;
897   default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unknown option %d", op);
898   }
899   PetscFunctionReturn(0);
900 }
901 
902 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A, Vec v) {
903   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
904   PetscInt     i, j, n, shift;
905   PetscScalar *x, zero = 0.0;
906 
907   PetscFunctionBegin;
908   PetscCall(VecGetLocalSize(v, &n));
909   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
910 
911   if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) {
912     PetscInt *diag = a->diag;
913     PetscCall(VecGetArray(v, &x));
914     for (i = 0; i < n; i++) x[i] = 1.0 / a->val[diag[i]];
915     PetscCall(VecRestoreArray(v, &x));
916     PetscFunctionReturn(0);
917   }
918 
919   PetscCall(VecSet(v, zero));
920   PetscCall(VecGetArray(v, &x));
921   for (i = 0; i < n; i++) {                 /* loop over rows */
922     shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
923     x[i]  = 0;
924     for (j = 0; j < a->rlen[i]; j++) {
925       if (a->colidx[shift + j * 8] == i) {
926         x[i] = a->val[shift + j * 8];
927         break;
928       }
929     }
930   }
931   PetscCall(VecRestoreArray(v, &x));
932   PetscFunctionReturn(0);
933 }
934 
935 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A, Vec ll, Vec rr) {
936   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
937   const PetscScalar *l, *r;
938   PetscInt           i, j, m, n, row;
939 
940   PetscFunctionBegin;
941   if (ll) {
942     /* The local size is used so that VecMPI can be passed to this routine
943        by MatDiagonalScale_MPISELL */
944     PetscCall(VecGetLocalSize(ll, &m));
945     PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
946     PetscCall(VecGetArrayRead(ll, &l));
947     for (i = 0; i < a->totalslices; i++) {                  /* loop over slices */
948       if (i == a->totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
949         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) {
950           if (row < (A->rmap->n & 0x07)) a->val[j] *= l[8 * i + row];
951         }
952       } else {
953         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) { a->val[j] *= l[8 * i + row]; }
954       }
955     }
956     PetscCall(VecRestoreArrayRead(ll, &l));
957     PetscCall(PetscLogFlops(a->nz));
958   }
959   if (rr) {
960     PetscCall(VecGetLocalSize(rr, &n));
961     PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
962     PetscCall(VecGetArrayRead(rr, &r));
963     for (i = 0; i < a->totalslices; i++) {                  /* loop over slices */
964       if (i == a->totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
965         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) {
966           if (row < (A->rmap->n & 0x07)) a->val[j] *= r[a->colidx[j]];
967         }
968       } else {
969         for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j++) { a->val[j] *= r[a->colidx[j]]; }
970       }
971     }
972     PetscCall(VecRestoreArrayRead(rr, &r));
973     PetscCall(PetscLogFlops(a->nz));
974   }
975   PetscCall(MatSeqSELLInvalidateDiagonal(A));
976   PetscFunctionReturn(0);
977 }
978 
979 PetscErrorCode MatGetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[]) {
980   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
981   PetscInt    *cp, i, k, low, high, t, row, col, l;
982   PetscInt     shift;
983   MatScalar   *vp;
984 
985   PetscFunctionBegin;
986   for (k = 0; k < m; k++) { /* loop over requested rows */
987     row = im[k];
988     if (row < 0) continue;
989     PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
990     shift = a->sliidx[row >> 3] + (row & 0x07); /* starting index of the row */
991     cp    = a->colidx + shift;                  /* pointer to the row */
992     vp    = a->val + shift;                     /* pointer to the row */
993     for (l = 0; l < n; l++) {                   /* loop over requested columns */
994       col = in[l];
995       if (col < 0) continue;
996       PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
997       high = a->rlen[row];
998       low  = 0; /* assume unsorted */
999       while (high - low > 5) {
1000         t = (low + high) / 2;
1001         if (*(cp + t * 8) > col) high = t;
1002         else low = t;
1003       }
1004       for (i = low; i < high; i++) {
1005         if (*(cp + 8 * i) > col) break;
1006         if (*(cp + 8 * i) == col) {
1007           *v++ = *(vp + 8 * i);
1008           goto finished;
1009         }
1010       }
1011       *v++ = 0.0;
1012     finished:;
1013     }
1014   }
1015   PetscFunctionReturn(0);
1016 }
1017 
1018 PetscErrorCode MatView_SeqSELL_ASCII(Mat A, PetscViewer viewer) {
1019   Mat_SeqSELL      *a = (Mat_SeqSELL *)A->data;
1020   PetscInt          i, j, m = A->rmap->n, shift;
1021   const char       *name;
1022   PetscViewerFormat format;
1023 
1024   PetscFunctionBegin;
1025   PetscCall(PetscViewerGetFormat(viewer, &format));
1026   if (format == PETSC_VIEWER_ASCII_MATLAB) {
1027     PetscInt nofinalvalue = 0;
1028     /*
1029     if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) {
1030       nofinalvalue = 1;
1031     }
1032     */
1033     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1034     PetscCall(PetscViewerASCIIPrintf(viewer, "%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n", m, A->cmap->n));
1035     PetscCall(PetscViewerASCIIPrintf(viewer, "%% Nonzeros = %" PetscInt_FMT " \n", a->nz));
1036 #if defined(PETSC_USE_COMPLEX)
1037     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",4);\n", a->nz + nofinalvalue));
1038 #else
1039     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",3);\n", a->nz + nofinalvalue));
1040 #endif
1041     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = [\n"));
1042 
1043     for (i = 0; i < m; i++) {
1044       shift = a->sliidx[i >> 3] + (i & 0x07);
1045       for (j = 0; j < a->rlen[i]; j++) {
1046 #if defined(PETSC_USE_COMPLEX)
1047         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n", i + 1, a->colidx[shift + 8 * j] + 1, (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j])));
1048 #else
1049         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n", i + 1, a->colidx[shift + 8 * j] + 1, (double)a->val[shift + 8 * j]));
1050 #endif
1051       }
1052     }
1053     /*
1054     if (nofinalvalue) {
1055 #if defined(PETSC_USE_COMPLEX)
1056       PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n",m,A->cmap->n,0.,0.));
1057 #else
1058       PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n",m,A->cmap->n,0.0));
1059 #endif
1060     }
1061     */
1062     PetscCall(PetscObjectGetName((PetscObject)A, &name));
1063     PetscCall(PetscViewerASCIIPrintf(viewer, "];\n %s = spconvert(zzz);\n", name));
1064     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1065   } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) {
1066     PetscFunctionReturn(0);
1067   } else if (format == PETSC_VIEWER_ASCII_COMMON) {
1068     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1069     for (i = 0; i < m; i++) {
1070       PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1071       shift = a->sliidx[i >> 3] + (i & 0x07);
1072       for (j = 0; j < a->rlen[i]; j++) {
1073 #if defined(PETSC_USE_COMPLEX)
1074         if (PetscImaginaryPart(a->val[shift + 8 * j]) > 0.0 && PetscRealPart(a->val[shift + 8 * j]) != 0.0) {
1075           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j])));
1076         } else if (PetscImaginaryPart(a->val[shift + 8 * j]) < 0.0 && PetscRealPart(a->val[shift + 8 * j]) != 0.0) {
1077           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)-PetscImaginaryPart(a->val[shift + 8 * j])));
1078         } else if (PetscRealPart(a->val[shift + 8 * j]) != 0.0) {
1079           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j])));
1080         }
1081 #else
1082         if (a->val[shift + 8 * j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)a->val[shift + 8 * j]));
1083 #endif
1084       }
1085       PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1086     }
1087     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1088   } else if (format == PETSC_VIEWER_ASCII_DENSE) {
1089     PetscInt    cnt = 0, jcnt;
1090     PetscScalar value;
1091 #if defined(PETSC_USE_COMPLEX)
1092     PetscBool realonly = PETSC_TRUE;
1093     for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1094       if (PetscImaginaryPart(a->val[i]) != 0.0) {
1095         realonly = PETSC_FALSE;
1096         break;
1097       }
1098     }
1099 #endif
1100 
1101     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1102     for (i = 0; i < m; i++) {
1103       jcnt  = 0;
1104       shift = a->sliidx[i >> 3] + (i & 0x07);
1105       for (j = 0; j < A->cmap->n; j++) {
1106         if (jcnt < a->rlen[i] && j == a->colidx[shift + 8 * j]) {
1107           value = a->val[cnt++];
1108           jcnt++;
1109         } else {
1110           value = 0.0;
1111         }
1112 #if defined(PETSC_USE_COMPLEX)
1113         if (realonly) {
1114           PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)PetscRealPart(value)));
1115         } else {
1116           PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e+%7.5e i ", (double)PetscRealPart(value), (double)PetscImaginaryPart(value)));
1117         }
1118 #else
1119         PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)value));
1120 #endif
1121       }
1122       PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1123     }
1124     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1125   } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) {
1126     PetscInt fshift = 1;
1127     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1128 #if defined(PETSC_USE_COMPLEX)
1129     PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate complex general\n"));
1130 #else
1131     PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate real general\n"));
1132 #endif
1133     PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz));
1134     for (i = 0; i < m; i++) {
1135       shift = a->sliidx[i >> 3] + (i & 0x07);
1136       for (j = 0; j < a->rlen[i]; j++) {
1137 #if defined(PETSC_USE_COMPLEX)
1138         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n", i + fshift, a->colidx[shift + 8 * j] + fshift, (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j])));
1139 #else
1140         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g\n", i + fshift, a->colidx[shift + 8 * j] + fshift, (double)a->val[shift + 8 * j]));
1141 #endif
1142       }
1143     }
1144     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1145   } else if (format == PETSC_VIEWER_NATIVE) {
1146     for (i = 0; i < a->totalslices; i++) { /* loop over slices */
1147       PetscInt row;
1148       PetscCall(PetscViewerASCIIPrintf(viewer, "slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n", i, a->sliidx[i], a->sliidx[i + 1]));
1149       for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) {
1150 #if defined(PETSC_USE_COMPLEX)
1151         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1152           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1153         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1154           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), -(double)PetscImaginaryPart(a->val[j])));
1155         } else {
1156           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j])));
1157         }
1158 #else
1159         PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g\n", 8 * i + row, a->colidx[j], (double)a->val[j]));
1160 #endif
1161       }
1162     }
1163   } else {
1164     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1165     if (A->factortype) {
1166       for (i = 0; i < m; i++) {
1167         shift = a->sliidx[i >> 3] + (i & 0x07);
1168         PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1169         /* L part */
1170         for (j = shift; j < a->diag[i]; j += 8) {
1171 #if defined(PETSC_USE_COMPLEX)
1172           if (PetscImaginaryPart(a->val[shift + 8 * j]) > 0.0) {
1173             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1174           } else if (PetscImaginaryPart(a->val[shift + 8 * j]) < 0.0) {
1175             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j]))));
1176           } else {
1177             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j])));
1178           }
1179 #else
1180           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]));
1181 #endif
1182         }
1183         /* diagonal */
1184         j = a->diag[i];
1185 #if defined(PETSC_USE_COMPLEX)
1186         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1187           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)PetscImaginaryPart(1.0 / a->val[j])));
1188         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1189           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)(-PetscImaginaryPart(1.0 / a->val[j]))));
1190         } else {
1191           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j])));
1192         }
1193 #else
1194         PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)(1.0 / a->val[j])));
1195 #endif
1196 
1197         /* U part */
1198         for (j = a->diag[i] + 1; j < shift + 8 * a->rlen[i]; j += 8) {
1199 #if defined(PETSC_USE_COMPLEX)
1200           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1201             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1202           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1203             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j]))));
1204           } else {
1205             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j])));
1206           }
1207 #else
1208           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]));
1209 #endif
1210         }
1211         PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1212       }
1213     } else {
1214       for (i = 0; i < m; i++) {
1215         shift = a->sliidx[i >> 3] + (i & 0x07);
1216         PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1217         for (j = 0; j < a->rlen[i]; j++) {
1218 #if defined(PETSC_USE_COMPLEX)
1219           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1220             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j])));
1221           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1222             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)-PetscImaginaryPart(a->val[shift + 8 * j])));
1223           } else {
1224             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j])));
1225           }
1226 #else
1227           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)a->val[shift + 8 * j]));
1228 #endif
1229         }
1230         PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1231       }
1232     }
1233     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1234   }
1235   PetscCall(PetscViewerFlush(viewer));
1236   PetscFunctionReturn(0);
1237 }
1238 
1239 #include <petscdraw.h>
1240 PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw, void *Aa) {
1241   Mat               A = (Mat)Aa;
1242   Mat_SeqSELL      *a = (Mat_SeqSELL *)A->data;
1243   PetscInt          i, j, m = A->rmap->n, shift;
1244   int               color;
1245   PetscReal         xl, yl, xr, yr, x_l, x_r, y_l, y_r;
1246   PetscViewer       viewer;
1247   PetscViewerFormat format;
1248 
1249   PetscFunctionBegin;
1250   PetscCall(PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer));
1251   PetscCall(PetscViewerGetFormat(viewer, &format));
1252   PetscCall(PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr));
1253 
1254   /* loop over matrix elements drawing boxes */
1255 
1256   if (format != PETSC_VIEWER_DRAW_CONTOUR) {
1257     PetscDrawCollectiveBegin(draw);
1258     /* Blue for negative, Cyan for zero and  Red for positive */
1259     color = PETSC_DRAW_BLUE;
1260     for (i = 0; i < m; i++) {
1261       shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1262       y_l   = m - i - 1.0;
1263       y_r   = y_l + 1.0;
1264       for (j = 0; j < a->rlen[i]; j++) {
1265         x_l = a->colidx[shift + j * 8];
1266         x_r = x_l + 1.0;
1267         if (PetscRealPart(a->val[shift + 8 * j]) >= 0.) continue;
1268         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1269       }
1270     }
1271     color = PETSC_DRAW_CYAN;
1272     for (i = 0; i < m; i++) {
1273       shift = a->sliidx[i >> 3] + (i & 0x07);
1274       y_l   = m - i - 1.0;
1275       y_r   = y_l + 1.0;
1276       for (j = 0; j < a->rlen[i]; j++) {
1277         x_l = a->colidx[shift + j * 8];
1278         x_r = x_l + 1.0;
1279         if (a->val[shift + 8 * j] != 0.) continue;
1280         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1281       }
1282     }
1283     color = PETSC_DRAW_RED;
1284     for (i = 0; i < m; i++) {
1285       shift = a->sliidx[i >> 3] + (i & 0x07);
1286       y_l   = m - i - 1.0;
1287       y_r   = y_l + 1.0;
1288       for (j = 0; j < a->rlen[i]; j++) {
1289         x_l = a->colidx[shift + j * 8];
1290         x_r = x_l + 1.0;
1291         if (PetscRealPart(a->val[shift + 8 * j]) <= 0.) continue;
1292         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1293       }
1294     }
1295     PetscDrawCollectiveEnd(draw);
1296   } else {
1297     /* use contour shading to indicate magnitude of values */
1298     /* first determine max of all nonzero values */
1299     PetscReal minv = 0.0, maxv = 0.0;
1300     PetscInt  count = 0;
1301     PetscDraw popup;
1302     for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1303       if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]);
1304     }
1305     if (minv >= maxv) maxv = minv + PETSC_SMALL;
1306     PetscCall(PetscDrawGetPopup(draw, &popup));
1307     PetscCall(PetscDrawScalePopup(popup, minv, maxv));
1308 
1309     PetscDrawCollectiveBegin(draw);
1310     for (i = 0; i < m; i++) {
1311       shift = a->sliidx[i >> 3] + (i & 0x07);
1312       y_l   = m - i - 1.0;
1313       y_r   = y_l + 1.0;
1314       for (j = 0; j < a->rlen[i]; j++) {
1315         x_l   = a->colidx[shift + j * 8];
1316         x_r   = x_l + 1.0;
1317         color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]), minv, maxv);
1318         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1319         count++;
1320       }
1321     }
1322     PetscDrawCollectiveEnd(draw);
1323   }
1324   PetscFunctionReturn(0);
1325 }
1326 
1327 #include <petscdraw.h>
1328 PetscErrorCode MatView_SeqSELL_Draw(Mat A, PetscViewer viewer) {
1329   PetscDraw draw;
1330   PetscReal xr, yr, xl, yl, h, w;
1331   PetscBool isnull;
1332 
1333   PetscFunctionBegin;
1334   PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw));
1335   PetscCall(PetscDrawIsNull(draw, &isnull));
1336   if (isnull) PetscFunctionReturn(0);
1337 
1338   xr = A->cmap->n;
1339   yr = A->rmap->n;
1340   h  = yr / 10.0;
1341   w  = xr / 10.0;
1342   xr += w;
1343   yr += h;
1344   xl = -w;
1345   yl = -h;
1346   PetscCall(PetscDrawSetCoordinates(draw, xl, yl, xr, yr));
1347   PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer));
1348   PetscCall(PetscDrawZoom(draw, MatView_SeqSELL_Draw_Zoom, A));
1349   PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL));
1350   PetscCall(PetscDrawSave(draw));
1351   PetscFunctionReturn(0);
1352 }
1353 
1354 PetscErrorCode MatView_SeqSELL(Mat A, PetscViewer viewer) {
1355   PetscBool iascii, isbinary, isdraw;
1356 
1357   PetscFunctionBegin;
1358   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
1359   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
1360   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
1361   if (iascii) {
1362     PetscCall(MatView_SeqSELL_ASCII(A, viewer));
1363   } else if (isbinary) {
1364     /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */
1365   } else if (isdraw) PetscCall(MatView_SeqSELL_Draw(A, viewer));
1366   PetscFunctionReturn(0);
1367 }
1368 
1369 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A, MatAssemblyType mode) {
1370   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1371   PetscInt     i, shift, row_in_slice, row, nrow, *cp, lastcol, j, k;
1372   MatScalar   *vp;
1373 
1374   PetscFunctionBegin;
1375   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
1376   /* To do: compress out the unused elements */
1377   PetscCall(MatMarkDiagonal_SeqSELL(A));
1378   PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n", A->rmap->n, A->cmap->n, a->maxallocmat, a->sliidx[a->totalslices], a->nz, a->sliidx[a->totalslices] - a->nz));
1379   PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n", a->reallocs));
1380   PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rlenmax));
1381   /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */
1382   for (i = 0; i < a->totalslices; ++i) {
1383     shift = a->sliidx[i];                                      /* starting index of the slice */
1384     cp    = a->colidx + shift;                                 /* pointer to the column indices of the slice */
1385     vp    = a->val + shift;                                    /* pointer to the nonzero values of the slice */
1386     for (row_in_slice = 0; row_in_slice < 8; ++row_in_slice) { /* loop over rows in the slice */
1387       row     = 8 * i + row_in_slice;
1388       nrow    = a->rlen[row]; /* number of nonzeros in row */
1389       /*
1390         Search for the nearest nonzero. Normally setting the index to zero may cause extra communication.
1391         But if the entire slice are empty, it is fine to use 0 since the index will not be loaded.
1392       */
1393       lastcol = 0;
1394       if (nrow > 0) {                                /* nonempty row */
1395         lastcol = cp[8 * (nrow - 1) + row_in_slice]; /* use the index from the last nonzero at current row */
1396       } else if (!row_in_slice) {                    /* first row of the currect slice is empty */
1397         for (j = 1; j < 8; j++) {
1398           if (a->rlen[8 * i + j]) {
1399             lastcol = cp[j];
1400             break;
1401           }
1402         }
1403       } else {
1404         if (a->sliidx[i + 1] != shift) lastcol = cp[row_in_slice - 1]; /* use the index from the previous row */
1405       }
1406 
1407       for (k = nrow; k < (a->sliidx[i + 1] - shift) / 8; ++k) {
1408         cp[8 * k + row_in_slice] = lastcol;
1409         vp[8 * k + row_in_slice] = (MatScalar)0;
1410       }
1411     }
1412   }
1413 
1414   A->info.mallocs += a->reallocs;
1415   a->reallocs = 0;
1416 
1417   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1418   PetscFunctionReturn(0);
1419 }
1420 
1421 PetscErrorCode MatGetInfo_SeqSELL(Mat A, MatInfoType flag, MatInfo *info) {
1422   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1423 
1424   PetscFunctionBegin;
1425   info->block_size   = 1.0;
1426   info->nz_allocated = a->maxallocmat;
1427   info->nz_used      = a->sliidx[a->totalslices]; /* include padding zeros */
1428   info->nz_unneeded  = (a->maxallocmat - a->sliidx[a->totalslices]);
1429   info->assemblies   = A->num_ass;
1430   info->mallocs      = A->info.mallocs;
1431   info->memory       = ((PetscObject)A)->mem;
1432   if (A->factortype) {
1433     info->fill_ratio_given  = A->info.fill_ratio_given;
1434     info->fill_ratio_needed = A->info.fill_ratio_needed;
1435     info->factor_mallocs    = A->info.factor_mallocs;
1436   } else {
1437     info->fill_ratio_given  = 0;
1438     info->fill_ratio_needed = 0;
1439     info->factor_mallocs    = 0;
1440   }
1441   PetscFunctionReturn(0);
1442 }
1443 
1444 PetscErrorCode MatSetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is) {
1445   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1446   PetscInt     shift, i, k, l, low, high, t, ii, row, col, nrow;
1447   PetscInt    *cp, nonew = a->nonew, lastcol = -1;
1448   MatScalar   *vp, value;
1449 
1450   PetscFunctionBegin;
1451   for (k = 0; k < m; k++) { /* loop over added rows */
1452     row = im[k];
1453     if (row < 0) continue;
1454     PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
1455     shift = a->sliidx[row >> 3] + (row & 0x07); /* starting index of the row */
1456     cp    = a->colidx + shift;                  /* pointer to the row */
1457     vp    = a->val + shift;                     /* pointer to the row */
1458     nrow  = a->rlen[row];
1459     low   = 0;
1460     high  = nrow;
1461 
1462     for (l = 0; l < n; l++) { /* loop over added columns */
1463       col = in[l];
1464       if (col < 0) continue;
1465       PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
1466       if (a->roworiented) {
1467         value = v[l + k * n];
1468       } else {
1469         value = v[k + l * m];
1470       }
1471       if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue;
1472 
1473       /* search in this row for the specified column, i indicates the column to be set */
1474       if (col <= lastcol) low = 0;
1475       else high = nrow;
1476       lastcol = col;
1477       while (high - low > 5) {
1478         t = (low + high) / 2;
1479         if (*(cp + t * 8) > col) high = t;
1480         else low = t;
1481       }
1482       for (i = low; i < high; i++) {
1483         if (*(cp + i * 8) > col) break;
1484         if (*(cp + i * 8) == col) {
1485           if (is == ADD_VALUES) *(vp + i * 8) += value;
1486           else *(vp + i * 8) = value;
1487           low = i + 1;
1488           goto noinsert;
1489         }
1490       }
1491       if (value == 0.0 && a->ignorezeroentries) goto noinsert;
1492       if (nonew == 1) goto noinsert;
1493       PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col);
1494       /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */
1495       MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, row / 8, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar);
1496       /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */
1497       for (ii = nrow - 1; ii >= i; ii--) {
1498         *(cp + (ii + 1) * 8) = *(cp + ii * 8);
1499         *(vp + (ii + 1) * 8) = *(vp + ii * 8);
1500       }
1501       a->rlen[row]++;
1502       *(cp + i * 8) = col;
1503       *(vp + i * 8) = value;
1504       a->nz++;
1505       A->nonzerostate++;
1506       low = i + 1;
1507       high++;
1508       nrow++;
1509     noinsert:;
1510     }
1511     a->rlen[row] = nrow;
1512   }
1513   PetscFunctionReturn(0);
1514 }
1515 
1516 PetscErrorCode MatCopy_SeqSELL(Mat A, Mat B, MatStructure str) {
1517   PetscFunctionBegin;
1518   /* If the two matrices have the same copy implementation, use fast copy. */
1519   if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
1520     Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1521     Mat_SeqSELL *b = (Mat_SeqSELL *)B->data;
1522 
1523     PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices], PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Number of nonzeros in two matrices are different");
1524     PetscCall(PetscArraycpy(b->val, a->val, a->sliidx[a->totalslices]));
1525   } else {
1526     PetscCall(MatCopy_Basic(A, B, str));
1527   }
1528   PetscFunctionReturn(0);
1529 }
1530 
1531 PetscErrorCode MatSetUp_SeqSELL(Mat A) {
1532   PetscFunctionBegin;
1533   PetscCall(MatSeqSELLSetPreallocation(A, PETSC_DEFAULT, NULL));
1534   PetscFunctionReturn(0);
1535 }
1536 
1537 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A, PetscScalar *array[]) {
1538   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1539 
1540   PetscFunctionBegin;
1541   *array = a->val;
1542   PetscFunctionReturn(0);
1543 }
1544 
1545 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A, PetscScalar *array[]) {
1546   PetscFunctionBegin;
1547   PetscFunctionReturn(0);
1548 }
1549 
1550 PetscErrorCode MatRealPart_SeqSELL(Mat A) {
1551   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1552   PetscInt     i;
1553   MatScalar   *aval = a->val;
1554 
1555   PetscFunctionBegin;
1556   for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscRealPart(aval[i]);
1557   PetscFunctionReturn(0);
1558 }
1559 
1560 PetscErrorCode MatImaginaryPart_SeqSELL(Mat A) {
1561   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1562   PetscInt     i;
1563   MatScalar   *aval = a->val;
1564 
1565   PetscFunctionBegin;
1566   for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]);
1567   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1568   PetscFunctionReturn(0);
1569 }
1570 
1571 PetscErrorCode MatScale_SeqSELL(Mat inA, PetscScalar alpha) {
1572   Mat_SeqSELL *a      = (Mat_SeqSELL *)inA->data;
1573   MatScalar   *aval   = a->val;
1574   PetscScalar  oalpha = alpha;
1575   PetscBLASInt one    = 1, size;
1576 
1577   PetscFunctionBegin;
1578   PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices], &size));
1579   PetscCallBLAS("BLASscal", BLASscal_(&size, &oalpha, aval, &one));
1580   PetscCall(PetscLogFlops(a->nz));
1581   PetscCall(MatSeqSELLInvalidateDiagonal(inA));
1582   PetscFunctionReturn(0);
1583 }
1584 
1585 PetscErrorCode MatShift_SeqSELL(Mat Y, PetscScalar a) {
1586   Mat_SeqSELL *y = (Mat_SeqSELL *)Y->data;
1587 
1588   PetscFunctionBegin;
1589   if (!Y->preallocated || !y->nz) { PetscCall(MatSeqSELLSetPreallocation(Y, 1, NULL)); }
1590   PetscCall(MatShift_Basic(Y, a));
1591   PetscFunctionReturn(0);
1592 }
1593 
1594 PetscErrorCode MatSOR_SeqSELL(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx) {
1595   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
1596   PetscScalar       *x, sum, *t;
1597   const MatScalar   *idiag = NULL, *mdiag;
1598   const PetscScalar *b, *xb;
1599   PetscInt           n, m = A->rmap->n, i, j, shift;
1600   const PetscInt    *diag;
1601 
1602   PetscFunctionBegin;
1603   its = its * lits;
1604 
1605   if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */
1606   if (!a->idiagvalid) PetscCall(MatInvertDiagonal_SeqSELL(A, omega, fshift));
1607   a->fshift = fshift;
1608   a->omega  = omega;
1609 
1610   diag  = a->diag;
1611   t     = a->ssor_work;
1612   idiag = a->idiag;
1613   mdiag = a->mdiag;
1614 
1615   PetscCall(VecGetArray(xx, &x));
1616   PetscCall(VecGetArrayRead(bb, &b));
1617   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1618   PetscCheck(flag != SOR_APPLY_UPPER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_UPPER is not implemented");
1619   PetscCheck(flag != SOR_APPLY_LOWER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_LOWER is not implemented");
1620   PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat");
1621 
1622   if (flag & SOR_ZERO_INITIAL_GUESS) {
1623     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1624       for (i = 0; i < m; i++) {
1625         shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1626         sum   = b[i];
1627         n     = (diag[i] - shift) / 8;
1628         for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]];
1629         t[i] = sum;
1630         x[i] = sum * idiag[i];
1631       }
1632       xb = t;
1633       PetscCall(PetscLogFlops(a->nz));
1634     } else xb = b;
1635     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1636       for (i = m - 1; i >= 0; i--) {
1637         shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1638         sum   = xb[i];
1639         n     = a->rlen[i] - (diag[i] - shift) / 8 - 1;
1640         for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]];
1641         if (xb == b) {
1642           x[i] = sum * idiag[i];
1643         } else {
1644           x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1645         }
1646       }
1647       PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1648     }
1649     its--;
1650   }
1651   while (its--) {
1652     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1653       for (i = 0; i < m; i++) {
1654         /* lower */
1655         shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1656         sum   = b[i];
1657         n     = (diag[i] - shift) / 8;
1658         for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]];
1659         t[i] = sum; /* save application of the lower-triangular part */
1660         /* upper */
1661         n    = a->rlen[i] - (diag[i] - shift) / 8 - 1;
1662         for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]];
1663         x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1664       }
1665       xb = t;
1666       PetscCall(PetscLogFlops(2.0 * a->nz));
1667     } else xb = b;
1668     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1669       for (i = m - 1; i >= 0; i--) {
1670         shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1671         sum   = xb[i];
1672         if (xb == b) {
1673           /* whole matrix (no checkpointing available) */
1674           n = a->rlen[i];
1675           for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]];
1676           x[i] = (1. - omega) * x[i] + (sum + mdiag[i] * x[i]) * idiag[i];
1677         } else { /* lower-triangular part has been saved, so only apply upper-triangular */
1678           n = a->rlen[i] - (diag[i] - shift) / 8 - 1;
1679           for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]];
1680           x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1681         }
1682       }
1683       if (xb == b) {
1684         PetscCall(PetscLogFlops(2.0 * a->nz));
1685       } else {
1686         PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1687       }
1688     }
1689   }
1690   PetscCall(VecRestoreArray(xx, &x));
1691   PetscCall(VecRestoreArrayRead(bb, &b));
1692   PetscFunctionReturn(0);
1693 }
1694 
1695 /* -------------------------------------------------------------------*/
1696 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL,
1697                                        MatGetRow_SeqSELL,
1698                                        MatRestoreRow_SeqSELL,
1699                                        MatMult_SeqSELL,
1700                                        /* 4*/ MatMultAdd_SeqSELL,
1701                                        MatMultTranspose_SeqSELL,
1702                                        MatMultTransposeAdd_SeqSELL,
1703                                        NULL,
1704                                        NULL,
1705                                        NULL,
1706                                        /* 10*/ NULL,
1707                                        NULL,
1708                                        NULL,
1709                                        MatSOR_SeqSELL,
1710                                        NULL,
1711                                        /* 15*/ MatGetInfo_SeqSELL,
1712                                        MatEqual_SeqSELL,
1713                                        MatGetDiagonal_SeqSELL,
1714                                        MatDiagonalScale_SeqSELL,
1715                                        NULL,
1716                                        /* 20*/ NULL,
1717                                        MatAssemblyEnd_SeqSELL,
1718                                        MatSetOption_SeqSELL,
1719                                        MatZeroEntries_SeqSELL,
1720                                        /* 24*/ NULL,
1721                                        NULL,
1722                                        NULL,
1723                                        NULL,
1724                                        NULL,
1725                                        /* 29*/ MatSetUp_SeqSELL,
1726                                        NULL,
1727                                        NULL,
1728                                        NULL,
1729                                        NULL,
1730                                        /* 34*/ MatDuplicate_SeqSELL,
1731                                        NULL,
1732                                        NULL,
1733                                        NULL,
1734                                        NULL,
1735                                        /* 39*/ NULL,
1736                                        NULL,
1737                                        NULL,
1738                                        MatGetValues_SeqSELL,
1739                                        MatCopy_SeqSELL,
1740                                        /* 44*/ NULL,
1741                                        MatScale_SeqSELL,
1742                                        MatShift_SeqSELL,
1743                                        NULL,
1744                                        NULL,
1745                                        /* 49*/ NULL,
1746                                        NULL,
1747                                        NULL,
1748                                        NULL,
1749                                        NULL,
1750                                        /* 54*/ MatFDColoringCreate_SeqXAIJ,
1751                                        NULL,
1752                                        NULL,
1753                                        NULL,
1754                                        NULL,
1755                                        /* 59*/ NULL,
1756                                        MatDestroy_SeqSELL,
1757                                        MatView_SeqSELL,
1758                                        NULL,
1759                                        NULL,
1760                                        /* 64*/ NULL,
1761                                        NULL,
1762                                        NULL,
1763                                        NULL,
1764                                        NULL,
1765                                        /* 69*/ NULL,
1766                                        NULL,
1767                                        NULL,
1768                                        NULL,
1769                                        NULL,
1770                                        /* 74*/ NULL,
1771                                        MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */
1772                                        NULL,
1773                                        NULL,
1774                                        NULL,
1775                                        /* 79*/ NULL,
1776                                        NULL,
1777                                        NULL,
1778                                        NULL,
1779                                        NULL,
1780                                        /* 84*/ NULL,
1781                                        NULL,
1782                                        NULL,
1783                                        NULL,
1784                                        NULL,
1785                                        /* 89*/ NULL,
1786                                        NULL,
1787                                        NULL,
1788                                        NULL,
1789                                        NULL,
1790                                        /* 94*/ NULL,
1791                                        NULL,
1792                                        NULL,
1793                                        NULL,
1794                                        NULL,
1795                                        /* 99*/ NULL,
1796                                        NULL,
1797                                        NULL,
1798                                        MatConjugate_SeqSELL,
1799                                        NULL,
1800                                        /*104*/ NULL,
1801                                        NULL,
1802                                        NULL,
1803                                        NULL,
1804                                        NULL,
1805                                        /*109*/ NULL,
1806                                        NULL,
1807                                        NULL,
1808                                        NULL,
1809                                        MatMissingDiagonal_SeqSELL,
1810                                        /*114*/ NULL,
1811                                        NULL,
1812                                        NULL,
1813                                        NULL,
1814                                        NULL,
1815                                        /*119*/ NULL,
1816                                        NULL,
1817                                        NULL,
1818                                        NULL,
1819                                        NULL,
1820                                        /*124*/ NULL,
1821                                        NULL,
1822                                        NULL,
1823                                        NULL,
1824                                        NULL,
1825                                        /*129*/ NULL,
1826                                        NULL,
1827                                        NULL,
1828                                        NULL,
1829                                        NULL,
1830                                        /*134*/ NULL,
1831                                        NULL,
1832                                        NULL,
1833                                        NULL,
1834                                        NULL,
1835                                        /*139*/ NULL,
1836                                        NULL,
1837                                        NULL,
1838                                        MatFDColoringSetUp_SeqXAIJ,
1839                                        NULL,
1840                                        /*144*/ NULL,
1841                                        NULL,
1842                                        NULL,
1843                                        NULL,
1844                                        NULL,
1845                                        NULL,
1846                                        /*150*/ NULL};
1847 
1848 PetscErrorCode MatStoreValues_SeqSELL(Mat mat) {
1849   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1850 
1851   PetscFunctionBegin;
1852   PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1853 
1854   /* allocate space for values if not already there */
1855   if (!a->saved_values) {
1856     PetscCall(PetscMalloc1(a->sliidx[a->totalslices] + 1, &a->saved_values));
1857     PetscCall(PetscLogObjectMemory((PetscObject)mat, (a->sliidx[a->totalslices] + 1) * sizeof(PetscScalar)));
1858   }
1859 
1860   /* copy values over */
1861   PetscCall(PetscArraycpy(a->saved_values, a->val, a->sliidx[a->totalslices]));
1862   PetscFunctionReturn(0);
1863 }
1864 
1865 PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat) {
1866   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1867 
1868   PetscFunctionBegin;
1869   PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1870   PetscCheck(a->saved_values, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatStoreValues(A);first");
1871   PetscCall(PetscArraycpy(a->val, a->saved_values, a->sliidx[a->totalslices]));
1872   PetscFunctionReturn(0);
1873 }
1874 
1875 /*@C
1876  MatSeqSELLRestoreArray - returns access to the array where the data for a MATSEQSELL matrix is stored obtained by MatSeqSELLGetArray()
1877 
1878  Not Collective
1879 
1880  Input Parameters:
1881  .  mat - a MATSEQSELL matrix
1882  .  array - pointer to the data
1883 
1884  Level: intermediate
1885 
1886  .seealso: `MatSeqSELLGetArray()`, `MatSeqSELLRestoreArrayF90()`
1887  @*/
1888 PetscErrorCode MatSeqSELLRestoreArray(Mat A, PetscScalar **array) {
1889   PetscFunctionBegin;
1890   PetscUseMethod(A, "MatSeqSELLRestoreArray_C", (Mat, PetscScalar **), (A, array));
1891   PetscFunctionReturn(0);
1892 }
1893 
1894 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B) {
1895   Mat_SeqSELL *b;
1896   PetscMPIInt  size;
1897 
1898   PetscFunctionBegin;
1899   PetscCall(PetscCitationsRegister(citation, &cited));
1900   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
1901   PetscCheck(size <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Comm must be of size 1");
1902 
1903   PetscCall(PetscNewLog(B, &b));
1904 
1905   B->data = (void *)b;
1906 
1907   PetscCall(PetscMemcpy(B->ops, &MatOps_Values, sizeof(struct _MatOps)));
1908 
1909   b->row                = NULL;
1910   b->col                = NULL;
1911   b->icol               = NULL;
1912   b->reallocs           = 0;
1913   b->ignorezeroentries  = PETSC_FALSE;
1914   b->roworiented        = PETSC_TRUE;
1915   b->nonew              = 0;
1916   b->diag               = NULL;
1917   b->solve_work         = NULL;
1918   B->spptr              = NULL;
1919   b->saved_values       = NULL;
1920   b->idiag              = NULL;
1921   b->mdiag              = NULL;
1922   b->ssor_work          = NULL;
1923   b->omega              = 1.0;
1924   b->fshift             = 0.0;
1925   b->idiagvalid         = PETSC_FALSE;
1926   b->keepnonzeropattern = PETSC_FALSE;
1927 
1928   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELL));
1929   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetArray_C", MatSeqSELLGetArray_SeqSELL));
1930   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLRestoreArray_C", MatSeqSELLRestoreArray_SeqSELL));
1931   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqSELL));
1932   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqSELL));
1933   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetPreallocation_C", MatSeqSELLSetPreallocation_SeqSELL));
1934   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqaij_C", MatConvert_SeqSELL_SeqAIJ));
1935   PetscFunctionReturn(0);
1936 }
1937 
1938 /*
1939  Given a matrix generated with MatGetFactor() duplicates all the information in A into B
1940  */
1941 PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace) {
1942   Mat_SeqSELL *c = (Mat_SeqSELL *)C->data, *a = (Mat_SeqSELL *)A->data;
1943   PetscInt     i, m                           = A->rmap->n;
1944   PetscInt     totalslices = a->totalslices;
1945 
1946   PetscFunctionBegin;
1947   C->factortype = A->factortype;
1948   c->row        = NULL;
1949   c->col        = NULL;
1950   c->icol       = NULL;
1951   c->reallocs   = 0;
1952   C->assembled  = PETSC_TRUE;
1953 
1954   PetscCall(PetscLayoutReference(A->rmap, &C->rmap));
1955   PetscCall(PetscLayoutReference(A->cmap, &C->cmap));
1956 
1957   PetscCall(PetscMalloc1(8 * totalslices, &c->rlen));
1958   PetscCall(PetscLogObjectMemory((PetscObject)C, m * sizeof(PetscInt)));
1959   PetscCall(PetscMalloc1(totalslices + 1, &c->sliidx));
1960   PetscCall(PetscLogObjectMemory((PetscObject)C, (totalslices + 1) * sizeof(PetscInt)));
1961 
1962   for (i = 0; i < m; i++) c->rlen[i] = a->rlen[i];
1963   for (i = 0; i < totalslices + 1; i++) c->sliidx[i] = a->sliidx[i];
1964 
1965   /* allocate the matrix space */
1966   if (mallocmatspace) {
1967     PetscCall(PetscMalloc2(a->maxallocmat, &c->val, a->maxallocmat, &c->colidx));
1968     PetscCall(PetscLogObjectMemory((PetscObject)C, a->maxallocmat * (sizeof(PetscScalar) + sizeof(PetscInt))));
1969 
1970     c->singlemalloc = PETSC_TRUE;
1971 
1972     if (m > 0) {
1973       PetscCall(PetscArraycpy(c->colidx, a->colidx, a->maxallocmat));
1974       if (cpvalues == MAT_COPY_VALUES) {
1975         PetscCall(PetscArraycpy(c->val, a->val, a->maxallocmat));
1976       } else {
1977         PetscCall(PetscArrayzero(c->val, a->maxallocmat));
1978       }
1979     }
1980   }
1981 
1982   c->ignorezeroentries = a->ignorezeroentries;
1983   c->roworiented       = a->roworiented;
1984   c->nonew             = a->nonew;
1985   if (a->diag) {
1986     PetscCall(PetscMalloc1(m, &c->diag));
1987     PetscCall(PetscLogObjectMemory((PetscObject)C, m * sizeof(PetscInt)));
1988     for (i = 0; i < m; i++) { c->diag[i] = a->diag[i]; }
1989   } else c->diag = NULL;
1990 
1991   c->solve_work         = NULL;
1992   c->saved_values       = NULL;
1993   c->idiag              = NULL;
1994   c->ssor_work          = NULL;
1995   c->keepnonzeropattern = a->keepnonzeropattern;
1996   c->free_val           = PETSC_TRUE;
1997   c->free_colidx        = PETSC_TRUE;
1998 
1999   c->maxallocmat  = a->maxallocmat;
2000   c->maxallocrow  = a->maxallocrow;
2001   c->rlenmax      = a->rlenmax;
2002   c->nz           = a->nz;
2003   C->preallocated = PETSC_TRUE;
2004 
2005   c->nonzerorowcnt = a->nonzerorowcnt;
2006   C->nonzerostate  = A->nonzerostate;
2007 
2008   PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist));
2009   PetscFunctionReturn(0);
2010 }
2011 
2012 PetscErrorCode MatDuplicate_SeqSELL(Mat A, MatDuplicateOption cpvalues, Mat *B) {
2013   PetscFunctionBegin;
2014   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
2015   PetscCall(MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n));
2016   if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) { PetscCall(MatSetBlockSizesFromMats(*B, A, A)); }
2017   PetscCall(MatSetType(*B, ((PetscObject)A)->type_name));
2018   PetscCall(MatDuplicateNoCreate_SeqSELL(*B, A, cpvalues, PETSC_TRUE));
2019   PetscFunctionReturn(0);
2020 }
2021 
2022 /*MC
2023    MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices,
2024    based on the sliced Ellpack format
2025 
2026    Options Database Keys:
2027 . -mat_type seqsell - sets the matrix type to "seqsell" during a call to MatSetFromOptions()
2028 
2029    Level: beginner
2030 
2031 .seealso: `MatCreateSeqSell()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ`
2032 M*/
2033 
2034 /*MC
2035    MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices.
2036 
2037    This matrix type is identical to MATSEQSELL when constructed with a single process communicator,
2038    and MATMPISELL otherwise.  As a result, for single process communicators,
2039   MatSeqSELLSetPreallocation() is supported, and similarly MatMPISELLSetPreallocation() is supported
2040   for communicators controlling multiple processes.  It is recommended that you call both of
2041   the above preallocation routines for simplicity.
2042 
2043    Options Database Keys:
2044 . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions()
2045 
2046   Level: beginner
2047 
2048   Notes:
2049    This format is only supported for real scalars, double precision, and 32 bit indices (the defaults).
2050 
2051    It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of
2052    non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement.
2053 
2054   Developer Notes:
2055    On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance.
2056 
2057    The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8
2058 .vb
2059                             (2 0  3 4)
2060    Consider the matrix A =  (5 0  6 0)
2061                             (0 0  7 8)
2062                             (0 0  9 9)
2063 
2064    symbolically the Ellpack format can be written as
2065 
2066         (2 3 4 |)           (0 2 3 |)
2067    v =  (5 6 0 |)  colidx = (0 2 2 |)
2068         --------            ---------
2069         (7 8 |)             (2 3 |)
2070         (9 9 |)             (2 3 |)
2071 
2072     The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case).
2073     Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and
2074     zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice.
2075 
2076     The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9)  and for colidx is (0 0 2 2 3 2 2 2 3 3)
2077 
2078 .ve
2079 
2080       See MatMult_SeqSELL() for how this format is used with the SIMD operations to achieve high performance.
2081 
2082  References:
2083 . * - Hong Zhang, Richard T. Mills, Karl Rupp, and Barry F. Smith, Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512},
2084    Proceedings of the 47th International Conference on Parallel Processing, 2018.
2085 
2086 .seealso: `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSell()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ`
2087 M*/
2088 
2089 /*@C
2090        MatCreateSeqSELL - Creates a sparse matrix in SELL format.
2091 
2092  Collective on comm
2093 
2094  Input Parameters:
2095 +  comm - MPI communicator, set to PETSC_COMM_SELF
2096 .  m - number of rows
2097 .  n - number of columns
2098 .  rlenmax - maximum number of nonzeros in a row
2099 -  rlen - array containing the number of nonzeros in the various rows
2100  (possibly different for each row) or NULL
2101 
2102  Output Parameter:
2103 .  A - the matrix
2104 
2105  It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
2106  MatXXXXSetPreallocation() paradigm instead of this routine directly.
2107  [MatXXXXSetPreallocation() is, for example, MatSeqSELLSetPreallocation]
2108 
2109  Notes:
2110  If nnz is given then nz is ignored
2111 
2112  Specify the preallocated storage with either rlenmax or rlen (not both).
2113  Set rlenmax=PETSC_DEFAULT and rlen=NULL for PETSc to control dynamic memory
2114  allocation.  For large problems you MUST preallocate memory or you
2115  will get TERRIBLE performance, see the users' manual chapter on matrices.
2116 
2117  Level: intermediate
2118 
2119  .seealso: `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATSEQSELL`, `MATMPISELL`
2120 
2121  @*/
2122 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt maxallocrow, const PetscInt rlen[], Mat *A) {
2123   PetscFunctionBegin;
2124   PetscCall(MatCreate(comm, A));
2125   PetscCall(MatSetSizes(*A, m, n, m, n));
2126   PetscCall(MatSetType(*A, MATSEQSELL));
2127   PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A, maxallocrow, rlen));
2128   PetscFunctionReturn(0);
2129 }
2130 
2131 PetscErrorCode MatEqual_SeqSELL(Mat A, Mat B, PetscBool *flg) {
2132   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data, *b = (Mat_SeqSELL *)B->data;
2133   PetscInt     totalslices = a->totalslices;
2134 
2135   PetscFunctionBegin;
2136   /* If the  matrix dimensions are not equal,or no of nonzeros */
2137   if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) || (a->nz != b->nz) || (a->rlenmax != b->rlenmax)) {
2138     *flg = PETSC_FALSE;
2139     PetscFunctionReturn(0);
2140   }
2141   /* if the a->colidx are the same */
2142   PetscCall(PetscArraycmp(a->colidx, b->colidx, a->sliidx[totalslices], flg));
2143   if (!*flg) PetscFunctionReturn(0);
2144   /* if a->val are the same */
2145   PetscCall(PetscArraycmp(a->val, b->val, a->sliidx[totalslices], flg));
2146   PetscFunctionReturn(0);
2147 }
2148 
2149 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A) {
2150   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2151 
2152   PetscFunctionBegin;
2153   a->idiagvalid = PETSC_FALSE;
2154   PetscFunctionReturn(0);
2155 }
2156 
2157 PetscErrorCode MatConjugate_SeqSELL(Mat A) {
2158 #if defined(PETSC_USE_COMPLEX)
2159   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2160   PetscInt     i;
2161   PetscScalar *val = a->val;
2162 
2163   PetscFunctionBegin;
2164   for (i = 0; i < a->sliidx[a->totalslices]; i++) { val[i] = PetscConj(val[i]); }
2165 #else
2166   PetscFunctionBegin;
2167 #endif
2168   PetscFunctionReturn(0);
2169 }
2170