xref: /petsc/src/mat/impls/sell/seq/sell.c (revision d9acb416d05abeed0a33bde3a81aeb2ea0364f6a)
1 /*
2   Defines the basic matrix operations for the SELL matrix storage format.
3 */
4 #include <../src/mat/impls/sell/seq/sell.h> /*I   "petscmat.h"  I*/
5 #include <petscblaslapack.h>
6 #include <petsc/private/kernels/blocktranspose.h>
7 
8 static PetscBool  cited      = PETSC_FALSE;
9 static const char citation[] = "@inproceedings{ZhangELLPACK2018,\n"
10                                " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n"
11                                " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n"
12                                " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n"
13                                " year = 2018\n"
14                                "}\n";
15 
16 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
17 
18   #include <immintrin.h>
19 
20   #if !defined(_MM_SCALE_8)
21     #define _MM_SCALE_8 8
22   #endif
23 
24   #if defined(__AVX512F__)
25     /* these do not work
26    vec_idx  = _mm512_loadunpackhi_epi32(vec_idx,acolidx);
27    vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval);
28   */
29     #define AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
30       /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \
31       vec_idx  = _mm256_loadu_si256((__m256i const *)acolidx); \
32       vec_vals = _mm512_loadu_pd(aval); \
33       vec_x    = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); \
34       vec_y    = _mm512_fmadd_pd(vec_x, vec_vals, vec_y)
35   #elif defined(__AVX2__) && defined(__FMA__)
36     #define AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
37       vec_vals = _mm256_loadu_pd(aval); \
38       vec_idx  = _mm_loadu_si128((__m128i const *)acolidx); /* SSE2 */ \
39       vec_x    = _mm256_i32gather_pd(x, vec_idx, _MM_SCALE_8); \
40       vec_y    = _mm256_fmadd_pd(vec_x, vec_vals, vec_y)
41   #endif
42 #endif /* PETSC_HAVE_IMMINTRIN_H */
43 
44 /*@C
45   MatSeqSELLSetPreallocation - For good matrix assembly performance
46   the user should preallocate the matrix storage by setting the parameter `nz`
47   (or the array `nnz`).
48 
49   Collective
50 
51   Input Parameters:
52 + B       - The `MATSEQSELL` matrix
53 . rlenmax - number of nonzeros per row (same for all rows), ignored if `rlen` is provided
54 - rlen    - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
55 
56   Level: intermediate
57 
58   Notes:
59   Specify the preallocated storage with either `rlenmax` or `rlen` (not both).
60   Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory
61   allocation.
62 
63   You can call `MatGetInfo()` to get information on how effective the preallocation was;
64   for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
65   You can also run with the option `-info` and look for messages with the string
66   malloc in them to see if additional memory allocation was needed.
67 
68   Developer Notes:
69   Use `rlenmax` of `MAT_SKIP_ALLOCATION` to not allocate any space for the matrix
70   entries or columns indices.
71 
72   The maximum number of nonzeos in any row should be as accurate as possible.
73   If it is underestimated, you will get bad performance due to reallocation
74   (`MatSeqXSELLReallocateSELL()`).
75 
76 .seealso: `Mat`, `MATSEQSELL`, `MATSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()`
77  @*/
78 PetscErrorCode MatSeqSELLSetPreallocation(Mat B, PetscInt rlenmax, const PetscInt rlen[])
79 {
80   PetscFunctionBegin;
81   PetscValidHeaderSpecific(B, MAT_CLASSID, 1);
82   PetscValidType(B, 1);
83   PetscTryMethod(B, "MatSeqSELLSetPreallocation_C", (Mat, PetscInt, const PetscInt[]), (B, rlenmax, rlen));
84   PetscFunctionReturn(PETSC_SUCCESS);
85 }
86 
87 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B, PetscInt maxallocrow, const PetscInt rlen[])
88 {
89   Mat_SeqSELL *b;
90   PetscInt     i, j, totalslices;
91 #if defined(PETSC_HAVE_CUDA)
92   PetscInt rlenmax = 0;
93 #endif
94   PetscBool skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE;
95 
96   PetscFunctionBegin;
97   if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE;
98   if (maxallocrow == MAT_SKIP_ALLOCATION) {
99     skipallocation = PETSC_TRUE;
100     maxallocrow    = 0;
101   }
102 
103   PetscCall(PetscLayoutSetUp(B->rmap));
104   PetscCall(PetscLayoutSetUp(B->cmap));
105 
106   /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */
107   if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5;
108   PetscCheck(maxallocrow >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "maxallocrow cannot be less than 0: value %" PetscInt_FMT, maxallocrow);
109   if (rlen) {
110     for (i = 0; i < B->rmap->n; i++) {
111       PetscCheck(rlen[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, rlen[i]);
112       PetscCheck(rlen[i] <= B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT, i, rlen[i], B->cmap->n);
113     }
114   }
115 
116   B->preallocated = PETSC_TRUE;
117 
118   b = (Mat_SeqSELL *)B->data;
119 
120   if (!b->sliceheight) { /* not set yet */
121 #if defined(PETSC_HAVE_CUDA)
122     b->sliceheight = 16;
123 #else
124     b->sliceheight = 8;
125 #endif
126   }
127   totalslices    = PetscCeilInt(B->rmap->n, b->sliceheight);
128   b->totalslices = totalslices;
129   if (!skipallocation) {
130     if (B->rmap->n % b->sliceheight) PetscCall(PetscInfo(B, "Padding rows to the SEQSELL matrix because the number of rows is not the multiple of the slice height (value %" PetscInt_FMT ")\n", B->rmap->n));
131 
132     if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */
133       PetscCall(PetscMalloc1(totalslices + 1, &b->sliidx));
134     }
135     if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */
136       if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10;
137       else if (maxallocrow < 0) maxallocrow = 1;
138 #if defined(PETSC_HAVE_CUDA)
139       rlenmax = maxallocrow;
140       /* Pad the slice to DEVICE_MEM_ALIGN */
141       while (b->sliceheight * maxallocrow % DEVICE_MEM_ALIGN) maxallocrow++;
142 #endif
143       for (i = 0; i <= totalslices; i++) b->sliidx[i] = b->sliceheight * i * maxallocrow;
144     } else {
145 #if defined(PETSC_HAVE_CUDA)
146       PetscInt mul = DEVICE_MEM_ALIGN / b->sliceheight;
147 #endif
148       maxallocrow  = 0;
149       b->sliidx[0] = 0;
150       for (i = 1; i < totalslices; i++) {
151         b->sliidx[i] = 0;
152         for (j = 0; j < b->sliceheight; j++) { b->sliidx[i] = PetscMax(b->sliidx[i], rlen[b->sliceheight * (i - 1) + j]); }
153 #if defined(PETSC_HAVE_CUDA)
154         rlenmax = PetscMax(b->sliidx[i], rlenmax);
155         /* Pad the slice to DEVICE_MEM_ALIGN */
156         b->sliidx[i] = ((b->sliidx[i] - 1) / mul + 1) * mul;
157 #endif
158         maxallocrow = PetscMax(b->sliidx[i], maxallocrow);
159         PetscCall(PetscIntSumError(b->sliidx[i - 1], b->sliceheight * b->sliidx[i], &b->sliidx[i]));
160       }
161       /* last slice */
162       b->sliidx[totalslices] = 0;
163       for (j = b->sliceheight * (totalslices - 1); j < B->rmap->n; j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices], rlen[j]);
164 #if defined(PETSC_HAVE_CUDA)
165       rlenmax                = PetscMax(b->sliidx[i], rlenmax);
166       b->sliidx[totalslices] = ((b->sliidx[totalslices] - 1) / mul + 1) * mul;
167 #endif
168       maxallocrow            = PetscMax(b->sliidx[totalslices], maxallocrow);
169       b->sliidx[totalslices] = b->sliidx[totalslices - 1] + b->sliceheight * b->sliidx[totalslices];
170     }
171 
172     /* allocate space for val, colidx, rlen */
173     /* FIXME: should B's old memory be unlogged? */
174     PetscCall(MatSeqXSELLFreeSELL(B, &b->val, &b->colidx));
175     /* FIXME: assuming an element of the bit array takes 8 bits */
176     PetscCall(PetscMalloc2(b->sliidx[totalslices], &b->val, b->sliidx[totalslices], &b->colidx));
177     /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */
178     PetscCall(PetscCalloc1(b->sliceheight * totalslices, &b->rlen));
179 
180     b->singlemalloc = PETSC_TRUE;
181     b->free_val     = PETSC_TRUE;
182     b->free_colidx  = PETSC_TRUE;
183   } else {
184     b->free_val    = PETSC_FALSE;
185     b->free_colidx = PETSC_FALSE;
186   }
187 
188   b->nz          = 0;
189   b->maxallocrow = maxallocrow;
190 #if defined(PETSC_HAVE_CUDA)
191   b->rlenmax = rlenmax;
192 #else
193   b->rlenmax = maxallocrow;
194 #endif
195   b->maxallocmat      = b->sliidx[totalslices];
196   B->info.nz_unneeded = (double)b->maxallocmat;
197   if (realalloc) PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));
198   PetscFunctionReturn(PETSC_SUCCESS);
199 }
200 
201 static PetscErrorCode MatGetRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
202 {
203   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
204   PetscInt     shift;
205 
206   PetscFunctionBegin;
207   PetscCheck(row >= 0 && row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range", row);
208   if (nz) *nz = a->rlen[row];
209   shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight);
210   if (!a->getrowcols) { PetscCall(PetscMalloc2(a->rlenmax, &a->getrowcols, a->rlenmax, &a->getrowvals)); }
211   if (idx) {
212     PetscInt j;
213     for (j = 0; j < a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift + a->sliceheight * j];
214     *idx = a->getrowcols;
215   }
216   if (v) {
217     PetscInt j;
218     for (j = 0; j < a->rlen[row]; j++) a->getrowvals[j] = a->val[shift + a->sliceheight * j];
219     *v = a->getrowvals;
220   }
221   PetscFunctionReturn(PETSC_SUCCESS);
222 }
223 
224 static PetscErrorCode MatRestoreRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
225 {
226   PetscFunctionBegin;
227   PetscFunctionReturn(PETSC_SUCCESS);
228 }
229 
230 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
231 {
232   Mat          B;
233   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
234   PetscInt     i;
235 
236   PetscFunctionBegin;
237   if (reuse == MAT_REUSE_MATRIX) {
238     B = *newmat;
239     PetscCall(MatZeroEntries(B));
240   } else {
241     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
242     PetscCall(MatSetSizes(B, A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N));
243     PetscCall(MatSetType(B, MATSEQAIJ));
244     PetscCall(MatSeqAIJSetPreallocation(B, 0, a->rlen));
245   }
246 
247   for (i = 0; i < A->rmap->n; i++) {
248     PetscInt     nz = 0, *cols = NULL;
249     PetscScalar *vals = NULL;
250 
251     PetscCall(MatGetRow_SeqSELL(A, i, &nz, &cols, &vals));
252     PetscCall(MatSetValues(B, 1, &i, nz, cols, vals, INSERT_VALUES));
253     PetscCall(MatRestoreRow_SeqSELL(A, i, &nz, &cols, &vals));
254   }
255 
256   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
257   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
258   B->rmap->bs = A->rmap->bs;
259 
260   if (reuse == MAT_INPLACE_MATRIX) {
261     PetscCall(MatHeaderReplace(A, &B));
262   } else {
263     *newmat = B;
264   }
265   PetscFunctionReturn(PETSC_SUCCESS);
266 }
267 
268 #include <../src/mat/impls/aij/seq/aij.h>
269 
270 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
271 {
272   Mat                B;
273   Mat_SeqAIJ        *a  = (Mat_SeqAIJ *)A->data;
274   PetscInt          *ai = a->i, m = A->rmap->N, n = A->cmap->N, i, *rowlengths, row, ncols;
275   const PetscInt    *cols;
276   const PetscScalar *vals;
277 
278   PetscFunctionBegin;
279 
280   if (reuse == MAT_REUSE_MATRIX) {
281     B = *newmat;
282   } else {
283     if (PetscDefined(USE_DEBUG) || !a->ilen) {
284       PetscCall(PetscMalloc1(m, &rowlengths));
285       for (i = 0; i < m; i++) rowlengths[i] = ai[i + 1] - ai[i];
286     }
287     if (PetscDefined(USE_DEBUG) && a->ilen) {
288       PetscBool eq;
289       PetscCall(PetscMemcmp(rowlengths, a->ilen, m * sizeof(PetscInt), &eq));
290       PetscCheck(eq, PETSC_COMM_SELF, PETSC_ERR_PLIB, "SeqAIJ ilen array incorrect");
291       PetscCall(PetscFree(rowlengths));
292       rowlengths = a->ilen;
293     } else if (a->ilen) rowlengths = a->ilen;
294     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
295     PetscCall(MatSetSizes(B, m, n, m, n));
296     PetscCall(MatSetType(B, MATSEQSELL));
297     PetscCall(MatSeqSELLSetPreallocation(B, 0, rowlengths));
298     if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths));
299   }
300 
301   for (row = 0; row < m; row++) {
302     PetscCall(MatGetRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
303     PetscCall(MatSetValues_SeqSELL(B, 1, &row, ncols, cols, vals, INSERT_VALUES));
304     PetscCall(MatRestoreRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
305   }
306   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
307   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
308   B->rmap->bs = A->rmap->bs;
309 
310   if (reuse == MAT_INPLACE_MATRIX) {
311     PetscCall(MatHeaderReplace(A, &B));
312   } else {
313     *newmat = B;
314   }
315   PetscFunctionReturn(PETSC_SUCCESS);
316 }
317 
318 PetscErrorCode MatMult_SeqSELL(Mat A, Vec xx, Vec yy)
319 {
320   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
321   PetscScalar       *y;
322   const PetscScalar *x;
323   const MatScalar   *aval        = a->val;
324   PetscInt           totalslices = a->totalslices;
325   const PetscInt    *acolidx     = a->colidx;
326   PetscInt           i, j;
327 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
328   __m512d  vec_x, vec_y, vec_vals;
329   __m256i  vec_idx;
330   __mmask8 mask;
331   __m512d  vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
332   __m256i  vec_idx2, vec_idx3, vec_idx4;
333 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
334   __m128i   vec_idx;
335   __m256d   vec_x, vec_y, vec_y2, vec_vals;
336   MatScalar yval;
337   PetscInt  r, rows_left, row, nnz_in_row;
338 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
339   __m128d   vec_x_tmp;
340   __m256d   vec_x, vec_y, vec_y2, vec_vals;
341   MatScalar yval;
342   PetscInt  r, rows_left, row, nnz_in_row;
343 #else
344   PetscInt     k, sliceheight = a->sliceheight;
345   PetscScalar *sum;
346 #endif
347 
348 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
349   #pragma disjoint(*x, *y, *aval)
350 #endif
351 
352   PetscFunctionBegin;
353   PetscCall(VecGetArrayRead(xx, &x));
354   PetscCall(VecGetArray(yy, &y));
355 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
356   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
357   for (i = 0; i < totalslices; i++) { /* loop over slices */
358     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
359     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
360 
361     vec_y  = _mm512_setzero_pd();
362     vec_y2 = _mm512_setzero_pd();
363     vec_y3 = _mm512_setzero_pd();
364     vec_y4 = _mm512_setzero_pd();
365 
366     j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */
367     switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
368     case 3:
369       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
370       acolidx += 8;
371       aval += 8;
372       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
373       acolidx += 8;
374       aval += 8;
375       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
376       acolidx += 8;
377       aval += 8;
378       j += 3;
379       break;
380     case 2:
381       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
382       acolidx += 8;
383       aval += 8;
384       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
385       acolidx += 8;
386       aval += 8;
387       j += 2;
388       break;
389     case 1:
390       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
391       acolidx += 8;
392       aval += 8;
393       j += 1;
394       break;
395     }
396   #pragma novector
397     for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
398       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
399       acolidx += 8;
400       aval += 8;
401       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
402       acolidx += 8;
403       aval += 8;
404       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
405       acolidx += 8;
406       aval += 8;
407       AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
408       acolidx += 8;
409       aval += 8;
410     }
411 
412     vec_y = _mm512_add_pd(vec_y, vec_y2);
413     vec_y = _mm512_add_pd(vec_y, vec_y3);
414     vec_y = _mm512_add_pd(vec_y, vec_y4);
415     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
416       mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
417       _mm512_mask_storeu_pd(&y[8 * i], mask, vec_y);
418     } else {
419       _mm512_storeu_pd(&y[8 * i], vec_y);
420     }
421   }
422 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
423   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
424   for (i = 0; i < totalslices; i++) { /* loop over full slices */
425     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
426     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
427 
428     /* last slice may have padding rows. Don't use vectorization. */
429     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
430       rows_left = A->rmap->n - 8 * i;
431       for (r = 0; r < rows_left; ++r) {
432         yval       = (MatScalar)0;
433         row        = 8 * i + r;
434         nnz_in_row = a->rlen[row];
435         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
436         y[row] = yval;
437       }
438       break;
439     }
440 
441     vec_y  = _mm256_setzero_pd();
442     vec_y2 = _mm256_setzero_pd();
443 
444   /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
445   #pragma novector
446   #pragma unroll(2)
447     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
448       AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
449       aval += 4;
450       acolidx += 4;
451       AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y2);
452       aval += 4;
453       acolidx += 4;
454     }
455 
456     _mm256_storeu_pd(y + i * 8, vec_y);
457     _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
458   }
459 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
460   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
461   for (i = 0; i < totalslices; i++) { /* loop over full slices */
462     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
463     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
464 
465     vec_y  = _mm256_setzero_pd();
466     vec_y2 = _mm256_setzero_pd();
467 
468     /* last slice may have padding rows. Don't use vectorization. */
469     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
470       rows_left = A->rmap->n - 8 * i;
471       for (r = 0; r < rows_left; ++r) {
472         yval       = (MatScalar)0;
473         row        = 8 * i + r;
474         nnz_in_row = a->rlen[row];
475         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
476         y[row] = yval;
477       }
478       break;
479     }
480 
481   /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
482   #pragma novector
483   #pragma unroll(2)
484     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
485       vec_vals  = _mm256_loadu_pd(aval);
486       vec_x_tmp = _mm_setzero_pd();
487       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
488       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
489       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
490       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
491       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
492       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
493       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
494       aval += 4;
495 
496       vec_vals  = _mm256_loadu_pd(aval);
497       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
498       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
499       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
500       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
501       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
502       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
503       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
504       aval += 4;
505     }
506 
507     _mm256_storeu_pd(y + i * 8, vec_y);
508     _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
509   }
510 #else
511   PetscCall(PetscMalloc1(sliceheight, &sum));
512   for (i = 0; i < totalslices; i++) { /* loop over slices */
513     for (j = 0; j < sliceheight; j++) {
514       sum[j] = 0.0;
515       for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]];
516     }
517     if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { /* if last slice has padding rows */
518       for (j = 0; j < (A->rmap->n % sliceheight); j++) y[sliceheight * i + j] = sum[j];
519     } else {
520       for (j = 0; j < sliceheight; j++) y[sliceheight * i + j] = sum[j];
521     }
522   }
523   PetscCall(PetscFree(sum));
524 #endif
525 
526   PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); /* theoretical minimal FLOPs */
527   PetscCall(VecRestoreArrayRead(xx, &x));
528   PetscCall(VecRestoreArray(yy, &y));
529   PetscFunctionReturn(PETSC_SUCCESS);
530 }
531 
532 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h>
533 PetscErrorCode MatMultAdd_SeqSELL(Mat A, Vec xx, Vec yy, Vec zz)
534 {
535   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
536   PetscScalar       *y, *z;
537   const PetscScalar *x;
538   const MatScalar   *aval        = a->val;
539   PetscInt           totalslices = a->totalslices;
540   const PetscInt    *acolidx     = a->colidx;
541   PetscInt           i, j;
542 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
543   __m512d  vec_x, vec_y, vec_vals;
544   __m256i  vec_idx;
545   __mmask8 mask = 0;
546   __m512d  vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
547   __m256i  vec_idx2, vec_idx3, vec_idx4;
548 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
549   __m128d   vec_x_tmp;
550   __m256d   vec_x, vec_y, vec_y2, vec_vals;
551   MatScalar yval;
552   PetscInt  r, row, nnz_in_row;
553 #else
554   PetscInt     k, sliceheight = a->sliceheight;
555   PetscScalar *sum;
556 #endif
557 
558 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
559   #pragma disjoint(*x, *y, *aval)
560 #endif
561 
562   PetscFunctionBegin;
563   if (!a->nz) {
564     PetscCall(VecCopy(yy, zz));
565     PetscFunctionReturn(PETSC_SUCCESS);
566   }
567   PetscCall(VecGetArrayRead(xx, &x));
568   PetscCall(VecGetArrayPair(yy, zz, &y, &z));
569 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
570   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
571   for (i = 0; i < totalslices; i++) { /* loop over slices */
572     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
573     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
574 
575     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
576       mask  = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
577       vec_y = _mm512_mask_loadu_pd(vec_y, mask, &y[8 * i]);
578     } else {
579       vec_y = _mm512_loadu_pd(&y[8 * i]);
580     }
581     vec_y2 = _mm512_setzero_pd();
582     vec_y3 = _mm512_setzero_pd();
583     vec_y4 = _mm512_setzero_pd();
584 
585     j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */
586     switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
587     case 3:
588       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
589       acolidx += 8;
590       aval += 8;
591       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
592       acolidx += 8;
593       aval += 8;
594       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
595       acolidx += 8;
596       aval += 8;
597       j += 3;
598       break;
599     case 2:
600       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
601       acolidx += 8;
602       aval += 8;
603       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
604       acolidx += 8;
605       aval += 8;
606       j += 2;
607       break;
608     case 1:
609       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
610       acolidx += 8;
611       aval += 8;
612       j += 1;
613       break;
614     }
615   #pragma novector
616     for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
617       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
618       acolidx += 8;
619       aval += 8;
620       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
621       acolidx += 8;
622       aval += 8;
623       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
624       acolidx += 8;
625       aval += 8;
626       AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
627       acolidx += 8;
628       aval += 8;
629     }
630 
631     vec_y = _mm512_add_pd(vec_y, vec_y2);
632     vec_y = _mm512_add_pd(vec_y, vec_y3);
633     vec_y = _mm512_add_pd(vec_y, vec_y4);
634     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
635       _mm512_mask_storeu_pd(&z[8 * i], mask, vec_y);
636     } else {
637       _mm512_storeu_pd(&z[8 * i], vec_y);
638     }
639   }
640 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
641   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
642   for (i = 0; i < totalslices; i++) { /* loop over full slices */
643     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
644     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
645 
646     /* last slice may have padding rows. Don't use vectorization. */
647     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
648       for (r = 0; r < (A->rmap->n & 0x07); ++r) {
649         row        = 8 * i + r;
650         yval       = (MatScalar)0.0;
651         nnz_in_row = a->rlen[row];
652         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
653         z[row] = y[row] + yval;
654       }
655       break;
656     }
657 
658     vec_y  = _mm256_loadu_pd(y + 8 * i);
659     vec_y2 = _mm256_loadu_pd(y + 8 * i + 4);
660 
661     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
662     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
663       vec_vals  = _mm256_loadu_pd(aval);
664       vec_x_tmp = _mm_setzero_pd();
665       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
666       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
667       vec_x     = _mm256_setzero_pd();
668       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
669       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
670       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
671       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
672       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
673       aval += 4;
674 
675       vec_vals  = _mm256_loadu_pd(aval);
676       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
677       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
678       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
679       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
680       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
681       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
682       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
683       aval += 4;
684     }
685 
686     _mm256_storeu_pd(z + i * 8, vec_y);
687     _mm256_storeu_pd(z + i * 8 + 4, vec_y2);
688   }
689 #else
690   PetscCall(PetscMalloc1(sliceheight, &sum));
691   for (i = 0; i < totalslices; i++) { /* loop over slices */
692     for (j = 0; j < sliceheight; j++) {
693       sum[j] = 0.0;
694       for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]];
695     }
696     if (i == totalslices - 1 && (A->rmap->n % sliceheight)) {
697       for (j = 0; j < (A->rmap->n % sliceheight); j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j];
698     } else {
699       for (j = 0; j < sliceheight; j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j];
700     }
701   }
702   PetscCall(PetscFree(sum));
703 #endif
704 
705   PetscCall(PetscLogFlops(2.0 * a->nz));
706   PetscCall(VecRestoreArrayRead(xx, &x));
707   PetscCall(VecRestoreArrayPair(yy, zz, &y, &z));
708   PetscFunctionReturn(PETSC_SUCCESS);
709 }
710 
711 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A, Vec xx, Vec zz, Vec yy)
712 {
713   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
714   PetscScalar       *y;
715   const PetscScalar *x;
716   const MatScalar   *aval    = a->val;
717   const PetscInt    *acolidx = a->colidx;
718   PetscInt           i, j, r, row, nnz_in_row, totalslices = a->totalslices, sliceheight = a->sliceheight;
719 
720 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
721   #pragma disjoint(*x, *y, *aval)
722 #endif
723 
724   PetscFunctionBegin;
725   if (A->symmetric == PETSC_BOOL3_TRUE) {
726     PetscCall(MatMultAdd_SeqSELL(A, xx, zz, yy));
727     PetscFunctionReturn(PETSC_SUCCESS);
728   }
729   if (zz != yy) PetscCall(VecCopy(zz, yy));
730 
731   if (a->nz) {
732     PetscCall(VecGetArrayRead(xx, &x));
733     PetscCall(VecGetArray(yy, &y));
734     for (i = 0; i < a->totalslices; i++) { /* loop over slices */
735       if (i == totalslices - 1 && (A->rmap->n % sliceheight)) {
736         for (r = 0; r < (A->rmap->n % sliceheight); ++r) {
737           row        = sliceheight * i + r;
738           nnz_in_row = a->rlen[row];
739           for (j = 0; j < nnz_in_row; ++j) y[acolidx[sliceheight * j + r]] += aval[sliceheight * j + r] * x[row];
740         }
741         break;
742       }
743       for (r = 0; r < sliceheight; ++r)
744         for (j = a->sliidx[i] + r; j < a->sliidx[i + 1]; j += sliceheight) y[acolidx[j]] += aval[j] * x[sliceheight * i + r];
745     }
746     PetscCall(PetscLogFlops(2.0 * a->nz));
747     PetscCall(VecRestoreArrayRead(xx, &x));
748     PetscCall(VecRestoreArray(yy, &y));
749   }
750   PetscFunctionReturn(PETSC_SUCCESS);
751 }
752 
753 PetscErrorCode MatMultTranspose_SeqSELL(Mat A, Vec xx, Vec yy)
754 {
755   PetscFunctionBegin;
756   if (A->symmetric == PETSC_BOOL3_TRUE) {
757     PetscCall(MatMult_SeqSELL(A, xx, yy));
758   } else {
759     PetscCall(VecSet(yy, 0.0));
760     PetscCall(MatMultTransposeAdd_SeqSELL(A, xx, yy, yy));
761   }
762   PetscFunctionReturn(PETSC_SUCCESS);
763 }
764 
765 /*
766      Checks for missing diagonals
767 */
768 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A, PetscBool *missing, PetscInt *d)
769 {
770   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
771   PetscInt    *diag, i;
772 
773   PetscFunctionBegin;
774   *missing = PETSC_FALSE;
775   if (A->rmap->n > 0 && !(a->colidx)) {
776     *missing = PETSC_TRUE;
777     if (d) *d = 0;
778     PetscCall(PetscInfo(A, "Matrix has no entries therefore is missing diagonal\n"));
779   } else {
780     diag = a->diag;
781     for (i = 0; i < A->rmap->n; i++) {
782       if (diag[i] == -1) {
783         *missing = PETSC_TRUE;
784         if (d) *d = i;
785         PetscCall(PetscInfo(A, "Matrix is missing diagonal number %" PetscInt_FMT "\n", i));
786         break;
787       }
788     }
789   }
790   PetscFunctionReturn(PETSC_SUCCESS);
791 }
792 
793 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A)
794 {
795   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
796   PetscInt     i, j, m = A->rmap->n, shift;
797 
798   PetscFunctionBegin;
799   if (!a->diag) {
800     PetscCall(PetscMalloc1(m, &a->diag));
801     a->free_diag = PETSC_TRUE;
802   }
803   for (i = 0; i < m; i++) {                                          /* loop over rows */
804     shift      = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
805     a->diag[i] = -1;
806     for (j = 0; j < a->rlen[i]; j++) {
807       if (a->colidx[shift + a->sliceheight * j] == i) {
808         a->diag[i] = shift + a->sliceheight * j;
809         break;
810       }
811     }
812   }
813   PetscFunctionReturn(PETSC_SUCCESS);
814 }
815 
816 /*
817   Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways
818 */
819 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A, PetscScalar omega, PetscScalar fshift)
820 {
821   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
822   PetscInt     i, *diag, m = A->rmap->n;
823   MatScalar   *val = a->val;
824   PetscScalar *idiag, *mdiag;
825 
826   PetscFunctionBegin;
827   if (a->idiagvalid) PetscFunctionReturn(PETSC_SUCCESS);
828   PetscCall(MatMarkDiagonal_SeqSELL(A));
829   diag = a->diag;
830   if (!a->idiag) {
831     PetscCall(PetscMalloc3(m, &a->idiag, m, &a->mdiag, m, &a->ssor_work));
832     val = a->val;
833   }
834   mdiag = a->mdiag;
835   idiag = a->idiag;
836 
837   if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) {
838     for (i = 0; i < m; i++) {
839       mdiag[i] = val[diag[i]];
840       if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */
841         PetscCheck(PetscRealPart(fshift), PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Zero diagonal on row %" PetscInt_FMT, i);
842         PetscCall(PetscInfo(A, "Zero diagonal on row %" PetscInt_FMT "\n", i));
843         A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
844         A->factorerror_zeropivot_value = 0.0;
845         A->factorerror_zeropivot_row   = i;
846       }
847       idiag[i] = 1.0 / val[diag[i]];
848     }
849     PetscCall(PetscLogFlops(m));
850   } else {
851     for (i = 0; i < m; i++) {
852       mdiag[i] = val[diag[i]];
853       idiag[i] = omega / (fshift + val[diag[i]]);
854     }
855     PetscCall(PetscLogFlops(2.0 * m));
856   }
857   a->idiagvalid = PETSC_TRUE;
858   PetscFunctionReturn(PETSC_SUCCESS);
859 }
860 
861 PetscErrorCode MatZeroEntries_SeqSELL(Mat A)
862 {
863   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
864 
865   PetscFunctionBegin;
866   PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices]));
867   PetscCall(MatSeqSELLInvalidateDiagonal(A));
868   PetscFunctionReturn(PETSC_SUCCESS);
869 }
870 
871 PetscErrorCode MatDestroy_SeqSELL(Mat A)
872 {
873   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
874 
875   PetscFunctionBegin;
876   PetscCall(PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->n, A->cmap->n, a->nz));
877   PetscCall(MatSeqXSELLFreeSELL(A, &a->val, &a->colidx));
878   PetscCall(ISDestroy(&a->row));
879   PetscCall(ISDestroy(&a->col));
880   PetscCall(PetscFree(a->diag));
881   PetscCall(PetscFree(a->rlen));
882   PetscCall(PetscFree(a->sliidx));
883   PetscCall(PetscFree3(a->idiag, a->mdiag, a->ssor_work));
884   PetscCall(PetscFree(a->solve_work));
885   PetscCall(ISDestroy(&a->icol));
886   PetscCall(PetscFree(a->saved_values));
887   PetscCall(PetscFree2(a->getrowcols, a->getrowvals));
888   PetscCall(PetscFree(A->data));
889 #if defined(PETSC_HAVE_CUDA)
890   PetscCall(PetscFree(a->chunk_slice_map));
891 #endif
892 
893   PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL));
894   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL));
895   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL));
896   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetPreallocation_C", NULL));
897   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetArray_C", NULL));
898   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLRestoreArray_C", NULL));
899   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqaij_C", NULL));
900 #if defined(PETSC_HAVE_CUDA)
901   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqsellcuda_C", NULL));
902 #endif
903   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetFillRatio_C", NULL));
904   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetMaxSliceWidth_C", NULL));
905   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetAvgSliceWidth_C", NULL));
906   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetVarSliceSize_C", NULL));
907   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetSliceHeight_C", NULL));
908   PetscFunctionReturn(PETSC_SUCCESS);
909 }
910 
911 PetscErrorCode MatSetOption_SeqSELL(Mat A, MatOption op, PetscBool flg)
912 {
913   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
914 
915   PetscFunctionBegin;
916   switch (op) {
917   case MAT_ROW_ORIENTED:
918     a->roworiented = flg;
919     break;
920   case MAT_KEEP_NONZERO_PATTERN:
921     a->keepnonzeropattern = flg;
922     break;
923   case MAT_NEW_NONZERO_LOCATIONS:
924     a->nonew = (flg ? 0 : 1);
925     break;
926   case MAT_NEW_NONZERO_LOCATION_ERR:
927     a->nonew = (flg ? -1 : 0);
928     break;
929   case MAT_NEW_NONZERO_ALLOCATION_ERR:
930     a->nonew = (flg ? -2 : 0);
931     break;
932   case MAT_UNUSED_NONZERO_LOCATION_ERR:
933     a->nounused = (flg ? -1 : 0);
934     break;
935   case MAT_FORCE_DIAGONAL_ENTRIES:
936   case MAT_IGNORE_OFF_PROC_ENTRIES:
937   case MAT_USE_HASH_TABLE:
938   case MAT_SORTED_FULL:
939     PetscCall(PetscInfo(A, "Option %s ignored\n", MatOptions[op]));
940     break;
941   case MAT_SPD:
942   case MAT_SYMMETRIC:
943   case MAT_STRUCTURALLY_SYMMETRIC:
944   case MAT_HERMITIAN:
945   case MAT_SYMMETRY_ETERNAL:
946   case MAT_STRUCTURAL_SYMMETRY_ETERNAL:
947   case MAT_SPD_ETERNAL:
948     /* These options are handled directly by MatSetOption() */
949     break;
950   default:
951     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unknown option %d", op);
952   }
953   PetscFunctionReturn(PETSC_SUCCESS);
954 }
955 
956 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A, Vec v)
957 {
958   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
959   PetscInt     i, j, n, shift;
960   PetscScalar *x, zero = 0.0;
961 
962   PetscFunctionBegin;
963   PetscCall(VecGetLocalSize(v, &n));
964   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
965 
966   if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) {
967     PetscInt *diag = a->diag;
968     PetscCall(VecGetArray(v, &x));
969     for (i = 0; i < n; i++) x[i] = 1.0 / a->val[diag[i]];
970     PetscCall(VecRestoreArray(v, &x));
971     PetscFunctionReturn(PETSC_SUCCESS);
972   }
973 
974   PetscCall(VecSet(v, zero));
975   PetscCall(VecGetArray(v, &x));
976   for (i = 0; i < n; i++) {                                     /* loop over rows */
977     shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
978     x[i]  = 0;
979     for (j = 0; j < a->rlen[i]; j++) {
980       if (a->colidx[shift + a->sliceheight * j] == i) {
981         x[i] = a->val[shift + a->sliceheight * j];
982         break;
983       }
984     }
985   }
986   PetscCall(VecRestoreArray(v, &x));
987   PetscFunctionReturn(PETSC_SUCCESS);
988 }
989 
990 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A, Vec ll, Vec rr)
991 {
992   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
993   const PetscScalar *l, *r;
994   PetscInt           i, j, m, n, row;
995 
996   PetscFunctionBegin;
997   if (ll) {
998     /* The local size is used so that VecMPI can be passed to this routine
999        by MatDiagonalScale_MPISELL */
1000     PetscCall(VecGetLocalSize(ll, &m));
1001     PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1002     PetscCall(VecGetArrayRead(ll, &l));
1003     for (i = 0; i < a->totalslices; i++) {                            /* loop over slices */
1004       if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */
1005         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) {
1006           if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= l[a->sliceheight * i + row];
1007         }
1008       } else {
1009         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) { a->val[j] *= l[a->sliceheight * i + row]; }
1010       }
1011     }
1012     PetscCall(VecRestoreArrayRead(ll, &l));
1013     PetscCall(PetscLogFlops(a->nz));
1014   }
1015   if (rr) {
1016     PetscCall(VecGetLocalSize(rr, &n));
1017     PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
1018     PetscCall(VecGetArrayRead(rr, &r));
1019     for (i = 0; i < a->totalslices; i++) {                            /* loop over slices */
1020       if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */
1021         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) % a->sliceheight)) {
1022           if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= r[a->colidx[j]];
1023         }
1024       } else {
1025         for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j++) a->val[j] *= r[a->colidx[j]];
1026       }
1027     }
1028     PetscCall(VecRestoreArrayRead(rr, &r));
1029     PetscCall(PetscLogFlops(a->nz));
1030   }
1031   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1032 #if defined(PETSC_HAVE_CUDA)
1033   if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU;
1034 #endif
1035   PetscFunctionReturn(PETSC_SUCCESS);
1036 }
1037 
1038 PetscErrorCode MatGetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[])
1039 {
1040   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1041   PetscInt    *cp, i, k, low, high, t, row, col, l;
1042   PetscInt     shift;
1043   MatScalar   *vp;
1044 
1045   PetscFunctionBegin;
1046   for (k = 0; k < m; k++) { /* loop over requested rows */
1047     row = im[k];
1048     if (row < 0) continue;
1049     PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
1050     shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight); /* starting index of the row */
1051     cp    = a->colidx + shift;                                        /* pointer to the row */
1052     vp    = a->val + shift;                                           /* pointer to the row */
1053     for (l = 0; l < n; l++) {                                         /* loop over requested columns */
1054       col = in[l];
1055       if (col < 0) continue;
1056       PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
1057       high = a->rlen[row];
1058       low  = 0; /* assume unsorted */
1059       while (high - low > 5) {
1060         t = (low + high) / 2;
1061         if (*(cp + a->sliceheight * t) > col) high = t;
1062         else low = t;
1063       }
1064       for (i = low; i < high; i++) {
1065         if (*(cp + a->sliceheight * i) > col) break;
1066         if (*(cp + a->sliceheight * i) == col) {
1067           *v++ = *(vp + a->sliceheight * i);
1068           goto finished;
1069         }
1070       }
1071       *v++ = 0.0;
1072     finished:;
1073     }
1074   }
1075   PetscFunctionReturn(PETSC_SUCCESS);
1076 }
1077 
1078 static PetscErrorCode MatView_SeqSELL_ASCII(Mat A, PetscViewer viewer)
1079 {
1080   Mat_SeqSELL      *a = (Mat_SeqSELL *)A->data;
1081   PetscInt          i, j, m = A->rmap->n, shift;
1082   const char       *name;
1083   PetscViewerFormat format;
1084 
1085   PetscFunctionBegin;
1086   PetscCall(PetscViewerGetFormat(viewer, &format));
1087   if (format == PETSC_VIEWER_ASCII_MATLAB) {
1088     PetscInt nofinalvalue = 0;
1089     /*
1090     if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) {
1091       nofinalvalue = 1;
1092     }
1093     */
1094     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1095     PetscCall(PetscViewerASCIIPrintf(viewer, "%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n", m, A->cmap->n));
1096     PetscCall(PetscViewerASCIIPrintf(viewer, "%% Nonzeros = %" PetscInt_FMT " \n", a->nz));
1097 #if defined(PETSC_USE_COMPLEX)
1098     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",4);\n", a->nz + nofinalvalue));
1099 #else
1100     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",3);\n", a->nz + nofinalvalue));
1101 #endif
1102     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = [\n"));
1103 
1104     for (i = 0; i < m; i++) {
1105       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1106       for (j = 0; j < a->rlen[i]; j++) {
1107 #if defined(PETSC_USE_COMPLEX)
1108         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1109 #else
1110         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)a->val[shift + a->sliceheight * j]));
1111 #endif
1112       }
1113     }
1114     /*
1115     if (nofinalvalue) {
1116 #if defined(PETSC_USE_COMPLEX)
1117       PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n",m,A->cmap->n,0.,0.));
1118 #else
1119       PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n",m,A->cmap->n,0.0));
1120 #endif
1121     }
1122     */
1123     PetscCall(PetscObjectGetName((PetscObject)A, &name));
1124     PetscCall(PetscViewerASCIIPrintf(viewer, "];\n %s = spconvert(zzz);\n", name));
1125     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1126   } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) {
1127     PetscFunctionReturn(PETSC_SUCCESS);
1128   } else if (format == PETSC_VIEWER_ASCII_COMMON) {
1129     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1130     for (i = 0; i < m; i++) {
1131       PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1132       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1133       for (j = 0; j < a->rlen[i]; j++) {
1134 #if defined(PETSC_USE_COMPLEX)
1135         if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) {
1136           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1137         } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) {
1138           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1139         } else if (PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) {
1140           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j])));
1141         }
1142 #else
1143         if (a->val[shift + a->sliceheight * j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j]));
1144 #endif
1145       }
1146       PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1147     }
1148     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1149   } else if (format == PETSC_VIEWER_ASCII_DENSE) {
1150     PetscInt    cnt = 0, jcnt;
1151     PetscScalar value;
1152 #if defined(PETSC_USE_COMPLEX)
1153     PetscBool realonly = PETSC_TRUE;
1154     for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1155       if (PetscImaginaryPart(a->val[i]) != 0.0) {
1156         realonly = PETSC_FALSE;
1157         break;
1158       }
1159     }
1160 #endif
1161 
1162     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1163     for (i = 0; i < m; i++) {
1164       jcnt  = 0;
1165       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1166       for (j = 0; j < A->cmap->n; j++) {
1167         if (jcnt < a->rlen[i] && j == a->colidx[shift + a->sliceheight * j]) {
1168           value = a->val[cnt++];
1169           jcnt++;
1170         } else {
1171           value = 0.0;
1172         }
1173 #if defined(PETSC_USE_COMPLEX)
1174         if (realonly) {
1175           PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)PetscRealPart(value)));
1176         } else {
1177           PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e+%7.5e i ", (double)PetscRealPart(value), (double)PetscImaginaryPart(value)));
1178         }
1179 #else
1180         PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)value));
1181 #endif
1182       }
1183       PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1184     }
1185     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1186   } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) {
1187     PetscInt fshift = 1;
1188     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1189 #if defined(PETSC_USE_COMPLEX)
1190     PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate complex general\n"));
1191 #else
1192     PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate real general\n"));
1193 #endif
1194     PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz));
1195     for (i = 0; i < m; i++) {
1196       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1197       for (j = 0; j < a->rlen[i]; j++) {
1198 #if defined(PETSC_USE_COMPLEX)
1199         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1200 #else
1201         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)a->val[shift + a->sliceheight * j]));
1202 #endif
1203       }
1204     }
1205     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1206   } else if (format == PETSC_VIEWER_NATIVE) {
1207     for (i = 0; i < a->totalslices; i++) { /* loop over slices */
1208       PetscInt row;
1209       PetscCall(PetscViewerASCIIPrintf(viewer, "slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n", i, a->sliidx[i], a->sliidx[i + 1]));
1210       for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) {
1211 #if defined(PETSC_USE_COMPLEX)
1212         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1213           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1214         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1215           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), -(double)PetscImaginaryPart(a->val[j])));
1216         } else {
1217           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j])));
1218         }
1219 #else
1220         PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)a->val[j]));
1221 #endif
1222       }
1223     }
1224   } else {
1225     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1226     if (A->factortype) {
1227       for (i = 0; i < m; i++) {
1228         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1229         PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1230         /* L part */
1231         for (j = shift; j < a->diag[i]; j += a->sliceheight) {
1232 #if defined(PETSC_USE_COMPLEX)
1233           if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0) {
1234             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1235           } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0) {
1236             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j]))));
1237           } else {
1238             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j])));
1239           }
1240 #else
1241           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]));
1242 #endif
1243         }
1244         /* diagonal */
1245         j = a->diag[i];
1246 #if defined(PETSC_USE_COMPLEX)
1247         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1248           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)PetscImaginaryPart(1.0 / a->val[j])));
1249         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1250           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)(-PetscImaginaryPart(1.0 / a->val[j]))));
1251         } else {
1252           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j])));
1253         }
1254 #else
1255         PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)(1.0 / a->val[j])));
1256 #endif
1257 
1258         /* U part */
1259         for (j = a->diag[i] + 1; j < shift + a->sliceheight * a->rlen[i]; j += a->sliceheight) {
1260 #if defined(PETSC_USE_COMPLEX)
1261           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1262             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1263           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1264             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j]))));
1265           } else {
1266             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j])));
1267           }
1268 #else
1269           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]));
1270 #endif
1271         }
1272         PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1273       }
1274     } else {
1275       for (i = 0; i < m; i++) {
1276         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1277         PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1278         for (j = 0; j < a->rlen[i]; j++) {
1279 #if defined(PETSC_USE_COMPLEX)
1280           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1281             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1282           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1283             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1284           } else {
1285             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j])));
1286           }
1287 #else
1288           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j]));
1289 #endif
1290         }
1291         PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1292       }
1293     }
1294     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1295   }
1296   PetscCall(PetscViewerFlush(viewer));
1297   PetscFunctionReturn(PETSC_SUCCESS);
1298 }
1299 
1300 #include <petscdraw.h>
1301 static PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw, void *Aa)
1302 {
1303   Mat               A = (Mat)Aa;
1304   Mat_SeqSELL      *a = (Mat_SeqSELL *)A->data;
1305   PetscInt          i, j, m = A->rmap->n, shift;
1306   int               color;
1307   PetscReal         xl, yl, xr, yr, x_l, x_r, y_l, y_r;
1308   PetscViewer       viewer;
1309   PetscViewerFormat format;
1310 
1311   PetscFunctionBegin;
1312   PetscCall(PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer));
1313   PetscCall(PetscViewerGetFormat(viewer, &format));
1314   PetscCall(PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr));
1315 
1316   /* loop over matrix elements drawing boxes */
1317 
1318   if (format != PETSC_VIEWER_DRAW_CONTOUR) {
1319     PetscDrawCollectiveBegin(draw);
1320     /* Blue for negative, Cyan for zero and  Red for positive */
1321     color = PETSC_DRAW_BLUE;
1322     for (i = 0; i < m; i++) {
1323       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1324       y_l   = m - i - 1.0;
1325       y_r   = y_l + 1.0;
1326       for (j = 0; j < a->rlen[i]; j++) {
1327         x_l = a->colidx[shift + a->sliceheight * j];
1328         x_r = x_l + 1.0;
1329         if (PetscRealPart(a->val[shift + a->sliceheight * j]) >= 0.) continue;
1330         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1331       }
1332     }
1333     color = PETSC_DRAW_CYAN;
1334     for (i = 0; i < m; i++) {
1335       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1336       y_l   = m - i - 1.0;
1337       y_r   = y_l + 1.0;
1338       for (j = 0; j < a->rlen[i]; j++) {
1339         x_l = a->colidx[shift + a->sliceheight * j];
1340         x_r = x_l + 1.0;
1341         if (a->val[shift + a->sliceheight * j] != 0.) continue;
1342         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1343       }
1344     }
1345     color = PETSC_DRAW_RED;
1346     for (i = 0; i < m; i++) {
1347       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1348       y_l   = m - i - 1.0;
1349       y_r   = y_l + 1.0;
1350       for (j = 0; j < a->rlen[i]; j++) {
1351         x_l = a->colidx[shift + a->sliceheight * j];
1352         x_r = x_l + 1.0;
1353         if (PetscRealPart(a->val[shift + a->sliceheight * j]) <= 0.) continue;
1354         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1355       }
1356     }
1357     PetscDrawCollectiveEnd(draw);
1358   } else {
1359     /* use contour shading to indicate magnitude of values */
1360     /* first determine max of all nonzero values */
1361     PetscReal minv = 0.0, maxv = 0.0;
1362     PetscInt  count = 0;
1363     PetscDraw popup;
1364     for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1365       if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]);
1366     }
1367     if (minv >= maxv) maxv = minv + PETSC_SMALL;
1368     PetscCall(PetscDrawGetPopup(draw, &popup));
1369     PetscCall(PetscDrawScalePopup(popup, minv, maxv));
1370 
1371     PetscDrawCollectiveBegin(draw);
1372     for (i = 0; i < m; i++) {
1373       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1374       y_l   = m - i - 1.0;
1375       y_r   = y_l + 1.0;
1376       for (j = 0; j < a->rlen[i]; j++) {
1377         x_l   = a->colidx[shift + a->sliceheight * j];
1378         x_r   = x_l + 1.0;
1379         color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]), minv, maxv);
1380         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1381         count++;
1382       }
1383     }
1384     PetscDrawCollectiveEnd(draw);
1385   }
1386   PetscFunctionReturn(PETSC_SUCCESS);
1387 }
1388 
1389 #include <petscdraw.h>
1390 static PetscErrorCode MatView_SeqSELL_Draw(Mat A, PetscViewer viewer)
1391 {
1392   PetscDraw draw;
1393   PetscReal xr, yr, xl, yl, h, w;
1394   PetscBool isnull;
1395 
1396   PetscFunctionBegin;
1397   PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw));
1398   PetscCall(PetscDrawIsNull(draw, &isnull));
1399   if (isnull) PetscFunctionReturn(PETSC_SUCCESS);
1400 
1401   xr = A->cmap->n;
1402   yr = A->rmap->n;
1403   h  = yr / 10.0;
1404   w  = xr / 10.0;
1405   xr += w;
1406   yr += h;
1407   xl = -w;
1408   yl = -h;
1409   PetscCall(PetscDrawSetCoordinates(draw, xl, yl, xr, yr));
1410   PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer));
1411   PetscCall(PetscDrawZoom(draw, MatView_SeqSELL_Draw_Zoom, A));
1412   PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL));
1413   PetscCall(PetscDrawSave(draw));
1414   PetscFunctionReturn(PETSC_SUCCESS);
1415 }
1416 
1417 PetscErrorCode MatView_SeqSELL(Mat A, PetscViewer viewer)
1418 {
1419   PetscBool iascii, isbinary, isdraw;
1420 
1421   PetscFunctionBegin;
1422   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
1423   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
1424   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
1425   if (iascii) {
1426     PetscCall(MatView_SeqSELL_ASCII(A, viewer));
1427   } else if (isbinary) {
1428     /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */
1429   } else if (isdraw) PetscCall(MatView_SeqSELL_Draw(A, viewer));
1430   PetscFunctionReturn(PETSC_SUCCESS);
1431 }
1432 
1433 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A, MatAssemblyType mode)
1434 {
1435   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1436   PetscInt     i, shift, row_in_slice, row, nrow, *cp, lastcol, j, k;
1437   MatScalar   *vp;
1438 #if defined(PETSC_HAVE_CUDA)
1439   PetscInt totalchunks = 0;
1440 #endif
1441 
1442   PetscFunctionBegin;
1443   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
1444   /* To do: compress out the unused elements */
1445   PetscCall(MatMarkDiagonal_SeqSELL(A));
1446   PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n", A->rmap->n, A->cmap->n, a->maxallocmat, a->sliidx[a->totalslices], a->nz, a->sliidx[a->totalslices] - a->nz));
1447   PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n", a->reallocs));
1448   PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rlenmax));
1449   a->nonzerorowcnt = 0;
1450   /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */
1451   for (i = 0; i < a->totalslices; ++i) {
1452     shift = a->sliidx[i];                                                   /* starting index of the slice */
1453     cp    = PetscSafePointerPlusOffset(a->colidx, shift);                   /* pointer to the column indices of the slice */
1454     vp    = PetscSafePointerPlusOffset(a->val, shift);                      /* pointer to the nonzero values of the slice */
1455     for (row_in_slice = 0; row_in_slice < a->sliceheight; ++row_in_slice) { /* loop over rows in the slice */
1456       row  = a->sliceheight * i + row_in_slice;
1457       nrow = a->rlen[row]; /* number of nonzeros in row */
1458       /*
1459         Search for the nearest nonzero. Normally setting the index to zero may cause extra communication.
1460         But if the entire slice are empty, it is fine to use 0 since the index will not be loaded.
1461       */
1462       lastcol = 0;
1463       if (nrow > 0) { /* nonempty row */
1464         a->nonzerorowcnt++;
1465         lastcol = cp[a->sliceheight * (nrow - 1) + row_in_slice]; /* use the index from the last nonzero at current row */
1466       } else if (!row_in_slice) {                                 /* first row of the correct slice is empty */
1467         for (j = 1; j < a->sliceheight; j++) {
1468           if (a->rlen[a->sliceheight * i + j]) {
1469             lastcol = cp[j];
1470             break;
1471           }
1472         }
1473       } else {
1474         if (a->sliidx[i + 1] != shift) lastcol = cp[row_in_slice - 1]; /* use the index from the previous row */
1475       }
1476 
1477       for (k = nrow; k < (a->sliidx[i + 1] - shift) / a->sliceheight; ++k) {
1478         cp[a->sliceheight * k + row_in_slice] = lastcol;
1479         vp[a->sliceheight * k + row_in_slice] = (MatScalar)0;
1480       }
1481     }
1482   }
1483 
1484   A->info.mallocs += a->reallocs;
1485   a->reallocs = 0;
1486 
1487   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1488 #if defined(PETSC_HAVE_CUDA)
1489   if (!a->chunksize && a->totalslices) {
1490     a->chunksize = 64;
1491     while (a->chunksize < 1024 && 2 * a->chunksize <= a->sliidx[a->totalslices] / a->totalslices) a->chunksize *= 2;
1492     totalchunks = 1 + (a->sliidx[a->totalslices] - 1) / a->chunksize;
1493   }
1494   if (totalchunks != a->totalchunks) {
1495     PetscCall(PetscFree(a->chunk_slice_map));
1496     PetscCall(PetscMalloc1(totalchunks, &a->chunk_slice_map));
1497     a->totalchunks = totalchunks;
1498   }
1499   j = 0;
1500   for (i = 0; i < totalchunks; i++) {
1501     while (a->sliidx[j + 1] <= i * a->chunksize && j < a->totalslices) j++;
1502     a->chunk_slice_map[i] = j;
1503   }
1504 #endif
1505   PetscFunctionReturn(PETSC_SUCCESS);
1506 }
1507 
1508 PetscErrorCode MatGetInfo_SeqSELL(Mat A, MatInfoType flag, MatInfo *info)
1509 {
1510   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1511 
1512   PetscFunctionBegin;
1513   info->block_size   = 1.0;
1514   info->nz_allocated = a->maxallocmat;
1515   info->nz_used      = a->sliidx[a->totalslices]; /* include padding zeros */
1516   info->nz_unneeded  = (a->maxallocmat - a->sliidx[a->totalslices]);
1517   info->assemblies   = A->num_ass;
1518   info->mallocs      = A->info.mallocs;
1519   info->memory       = 0; /* REVIEW ME */
1520   if (A->factortype) {
1521     info->fill_ratio_given  = A->info.fill_ratio_given;
1522     info->fill_ratio_needed = A->info.fill_ratio_needed;
1523     info->factor_mallocs    = A->info.factor_mallocs;
1524   } else {
1525     info->fill_ratio_given  = 0;
1526     info->fill_ratio_needed = 0;
1527     info->factor_mallocs    = 0;
1528   }
1529   PetscFunctionReturn(PETSC_SUCCESS);
1530 }
1531 
1532 PetscErrorCode MatSetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is)
1533 {
1534   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1535   PetscInt     shift, i, k, l, low, high, t, ii, row, col, nrow;
1536   PetscInt    *cp, nonew = a->nonew, lastcol = -1;
1537   MatScalar   *vp, value;
1538 #if defined(PETSC_HAVE_CUDA)
1539   PetscBool inserted = PETSC_FALSE;
1540   PetscInt  mul      = DEVICE_MEM_ALIGN / a->sliceheight;
1541 #endif
1542 
1543   PetscFunctionBegin;
1544   for (k = 0; k < m; k++) { /* loop over added rows */
1545     row = im[k];
1546     if (row < 0) continue;
1547     PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
1548     shift = a->sliidx[row / a->sliceheight] + row % a->sliceheight; /* starting index of the row */
1549     cp    = a->colidx + shift;                                      /* pointer to the row */
1550     vp    = a->val + shift;                                         /* pointer to the row */
1551     nrow  = a->rlen[row];
1552     low   = 0;
1553     high  = nrow;
1554 
1555     for (l = 0; l < n; l++) { /* loop over added columns */
1556       col = in[l];
1557       if (col < 0) continue;
1558       PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
1559       if (a->roworiented) {
1560         value = v[l + k * n];
1561       } else {
1562         value = v[k + l * m];
1563       }
1564       if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue;
1565 
1566       /* search in this row for the specified column, i indicates the column to be set */
1567       if (col <= lastcol) low = 0;
1568       else high = nrow;
1569       lastcol = col;
1570       while (high - low > 5) {
1571         t = (low + high) / 2;
1572         if (*(cp + a->sliceheight * t) > col) high = t;
1573         else low = t;
1574       }
1575       for (i = low; i < high; i++) {
1576         if (*(cp + a->sliceheight * i) > col) break;
1577         if (*(cp + a->sliceheight * i) == col) {
1578           if (is == ADD_VALUES) *(vp + a->sliceheight * i) += value;
1579           else *(vp + a->sliceheight * i) = value;
1580 #if defined(PETSC_HAVE_CUDA)
1581           inserted = PETSC_TRUE;
1582 #endif
1583           low = i + 1;
1584           goto noinsert;
1585         }
1586       }
1587       if (value == 0.0 && a->ignorezeroentries) goto noinsert;
1588       if (nonew == 1) goto noinsert;
1589       PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col);
1590 #if defined(PETSC_HAVE_CUDA)
1591       MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar, mul);
1592 #else
1593       /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */
1594       MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar, 1);
1595 #endif
1596       /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */
1597       for (ii = nrow - 1; ii >= i; ii--) {
1598         *(cp + a->sliceheight * (ii + 1)) = *(cp + a->sliceheight * ii);
1599         *(vp + a->sliceheight * (ii + 1)) = *(vp + a->sliceheight * ii);
1600       }
1601       a->rlen[row]++;
1602       *(cp + a->sliceheight * i) = col;
1603       *(vp + a->sliceheight * i) = value;
1604       a->nz++;
1605       A->nonzerostate++;
1606 #if defined(PETSC_HAVE_CUDA)
1607       inserted = PETSC_TRUE;
1608 #endif
1609       low = i + 1;
1610       high++;
1611       nrow++;
1612     noinsert:;
1613     }
1614     a->rlen[row] = nrow;
1615   }
1616 #if defined(PETSC_HAVE_CUDA)
1617   if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && inserted) A->offloadmask = PETSC_OFFLOAD_CPU;
1618 #endif
1619   PetscFunctionReturn(PETSC_SUCCESS);
1620 }
1621 
1622 PetscErrorCode MatCopy_SeqSELL(Mat A, Mat B, MatStructure str)
1623 {
1624   PetscFunctionBegin;
1625   /* If the two matrices have the same copy implementation, use fast copy. */
1626   if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
1627     Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1628     Mat_SeqSELL *b = (Mat_SeqSELL *)B->data;
1629 
1630     PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices], PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Number of nonzeros in two matrices are different");
1631     PetscCall(PetscArraycpy(b->val, a->val, a->sliidx[a->totalslices]));
1632   } else {
1633     PetscCall(MatCopy_Basic(A, B, str));
1634   }
1635   PetscFunctionReturn(PETSC_SUCCESS);
1636 }
1637 
1638 PetscErrorCode MatSetUp_SeqSELL(Mat A)
1639 {
1640   PetscFunctionBegin;
1641   PetscCall(MatSeqSELLSetPreallocation(A, PETSC_DEFAULT, NULL));
1642   PetscFunctionReturn(PETSC_SUCCESS);
1643 }
1644 
1645 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A, PetscScalar *array[])
1646 {
1647   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1648 
1649   PetscFunctionBegin;
1650   *array = a->val;
1651   PetscFunctionReturn(PETSC_SUCCESS);
1652 }
1653 
1654 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A, PetscScalar *array[])
1655 {
1656   PetscFunctionBegin;
1657   PetscFunctionReturn(PETSC_SUCCESS);
1658 }
1659 
1660 PetscErrorCode MatScale_SeqSELL(Mat inA, PetscScalar alpha)
1661 {
1662   Mat_SeqSELL *a      = (Mat_SeqSELL *)inA->data;
1663   MatScalar   *aval   = a->val;
1664   PetscScalar  oalpha = alpha;
1665   PetscBLASInt one    = 1, size;
1666 
1667   PetscFunctionBegin;
1668   PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices], &size));
1669   PetscCallBLAS("BLASscal", BLASscal_(&size, &oalpha, aval, &one));
1670   PetscCall(PetscLogFlops(a->nz));
1671   PetscCall(MatSeqSELLInvalidateDiagonal(inA));
1672 #if defined(PETSC_HAVE_CUDA)
1673   if (inA->offloadmask != PETSC_OFFLOAD_UNALLOCATED) inA->offloadmask = PETSC_OFFLOAD_CPU;
1674 #endif
1675   PetscFunctionReturn(PETSC_SUCCESS);
1676 }
1677 
1678 PetscErrorCode MatShift_SeqSELL(Mat Y, PetscScalar a)
1679 {
1680   Mat_SeqSELL *y = (Mat_SeqSELL *)Y->data;
1681 
1682   PetscFunctionBegin;
1683   if (!Y->preallocated || !y->nz) PetscCall(MatSeqSELLSetPreallocation(Y, 1, NULL));
1684   PetscCall(MatShift_Basic(Y, a));
1685   PetscFunctionReturn(PETSC_SUCCESS);
1686 }
1687 
1688 PetscErrorCode MatSOR_SeqSELL(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
1689 {
1690   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
1691   PetscScalar       *x, sum, *t;
1692   const MatScalar   *idiag = NULL, *mdiag;
1693   const PetscScalar *b, *xb;
1694   PetscInt           n, m = A->rmap->n, i, j, shift;
1695   const PetscInt    *diag;
1696 
1697   PetscFunctionBegin;
1698   its = its * lits;
1699 
1700   if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */
1701   if (!a->idiagvalid) PetscCall(MatInvertDiagonal_SeqSELL(A, omega, fshift));
1702   a->fshift = fshift;
1703   a->omega  = omega;
1704 
1705   diag  = a->diag;
1706   t     = a->ssor_work;
1707   idiag = a->idiag;
1708   mdiag = a->mdiag;
1709 
1710   PetscCall(VecGetArray(xx, &x));
1711   PetscCall(VecGetArrayRead(bb, &b));
1712   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1713   PetscCheck(flag != SOR_APPLY_UPPER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_UPPER is not implemented");
1714   PetscCheck(flag != SOR_APPLY_LOWER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_LOWER is not implemented");
1715   PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat");
1716 
1717   if (flag & SOR_ZERO_INITIAL_GUESS) {
1718     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1719       for (i = 0; i < m; i++) {
1720         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1721         sum   = b[i];
1722         n     = (diag[i] - shift) / a->sliceheight;
1723         for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]];
1724         t[i] = sum;
1725         x[i] = sum * idiag[i];
1726       }
1727       xb = t;
1728       PetscCall(PetscLogFlops(a->nz));
1729     } else xb = b;
1730     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1731       for (i = m - 1; i >= 0; i--) {
1732         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1733         sum   = xb[i];
1734         n     = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1;
1735         for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]];
1736         if (xb == b) {
1737           x[i] = sum * idiag[i];
1738         } else {
1739           x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1740         }
1741       }
1742       PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1743     }
1744     its--;
1745   }
1746   while (its--) {
1747     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1748       for (i = 0; i < m; i++) {
1749         /* lower */
1750         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1751         sum   = b[i];
1752         n     = (diag[i] - shift) / a->sliceheight;
1753         for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]];
1754         t[i] = sum; /* save application of the lower-triangular part */
1755         /* upper */
1756         n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1;
1757         for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]];
1758         x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1759       }
1760       xb = t;
1761       PetscCall(PetscLogFlops(2.0 * a->nz));
1762     } else xb = b;
1763     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1764       for (i = m - 1; i >= 0; i--) {
1765         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1766         sum   = xb[i];
1767         if (xb == b) {
1768           /* whole matrix (no checkpointing available) */
1769           n = a->rlen[i];
1770           for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]];
1771           x[i] = (1. - omega) * x[i] + (sum + mdiag[i] * x[i]) * idiag[i];
1772         } else { /* lower-triangular part has been saved, so only apply upper-triangular */
1773           n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1;
1774           for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]];
1775           x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1776         }
1777       }
1778       if (xb == b) {
1779         PetscCall(PetscLogFlops(2.0 * a->nz));
1780       } else {
1781         PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1782       }
1783     }
1784   }
1785   PetscCall(VecRestoreArray(xx, &x));
1786   PetscCall(VecRestoreArrayRead(bb, &b));
1787   PetscFunctionReturn(PETSC_SUCCESS);
1788 }
1789 
1790 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL,
1791                                        MatGetRow_SeqSELL,
1792                                        MatRestoreRow_SeqSELL,
1793                                        MatMult_SeqSELL,
1794                                        /* 4*/ MatMultAdd_SeqSELL,
1795                                        MatMultTranspose_SeqSELL,
1796                                        MatMultTransposeAdd_SeqSELL,
1797                                        NULL,
1798                                        NULL,
1799                                        NULL,
1800                                        /* 10*/ NULL,
1801                                        NULL,
1802                                        NULL,
1803                                        MatSOR_SeqSELL,
1804                                        NULL,
1805                                        /* 15*/ MatGetInfo_SeqSELL,
1806                                        MatEqual_SeqSELL,
1807                                        MatGetDiagonal_SeqSELL,
1808                                        MatDiagonalScale_SeqSELL,
1809                                        NULL,
1810                                        /* 20*/ NULL,
1811                                        MatAssemblyEnd_SeqSELL,
1812                                        MatSetOption_SeqSELL,
1813                                        MatZeroEntries_SeqSELL,
1814                                        /* 24*/ NULL,
1815                                        NULL,
1816                                        NULL,
1817                                        NULL,
1818                                        NULL,
1819                                        /* 29*/ MatSetUp_SeqSELL,
1820                                        NULL,
1821                                        NULL,
1822                                        NULL,
1823                                        NULL,
1824                                        /* 34*/ MatDuplicate_SeqSELL,
1825                                        NULL,
1826                                        NULL,
1827                                        NULL,
1828                                        NULL,
1829                                        /* 39*/ NULL,
1830                                        NULL,
1831                                        NULL,
1832                                        MatGetValues_SeqSELL,
1833                                        MatCopy_SeqSELL,
1834                                        /* 44*/ NULL,
1835                                        MatScale_SeqSELL,
1836                                        MatShift_SeqSELL,
1837                                        NULL,
1838                                        NULL,
1839                                        /* 49*/ NULL,
1840                                        NULL,
1841                                        NULL,
1842                                        NULL,
1843                                        NULL,
1844                                        /* 54*/ MatFDColoringCreate_SeqXAIJ,
1845                                        NULL,
1846                                        NULL,
1847                                        NULL,
1848                                        NULL,
1849                                        /* 59*/ NULL,
1850                                        MatDestroy_SeqSELL,
1851                                        MatView_SeqSELL,
1852                                        NULL,
1853                                        NULL,
1854                                        /* 64*/ NULL,
1855                                        NULL,
1856                                        NULL,
1857                                        NULL,
1858                                        NULL,
1859                                        /* 69*/ NULL,
1860                                        NULL,
1861                                        NULL,
1862                                        NULL,
1863                                        NULL,
1864                                        /* 74*/ NULL,
1865                                        MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */
1866                                        NULL,
1867                                        NULL,
1868                                        NULL,
1869                                        /* 79*/ NULL,
1870                                        NULL,
1871                                        NULL,
1872                                        NULL,
1873                                        NULL,
1874                                        /* 84*/ NULL,
1875                                        NULL,
1876                                        NULL,
1877                                        NULL,
1878                                        NULL,
1879                                        /* 89*/ NULL,
1880                                        NULL,
1881                                        NULL,
1882                                        NULL,
1883                                        NULL,
1884                                        /* 94*/ NULL,
1885                                        NULL,
1886                                        NULL,
1887                                        NULL,
1888                                        NULL,
1889                                        /* 99*/ NULL,
1890                                        NULL,
1891                                        NULL,
1892                                        MatConjugate_SeqSELL,
1893                                        NULL,
1894                                        /*104*/ NULL,
1895                                        NULL,
1896                                        NULL,
1897                                        NULL,
1898                                        NULL,
1899                                        /*109*/ NULL,
1900                                        NULL,
1901                                        NULL,
1902                                        NULL,
1903                                        MatMissingDiagonal_SeqSELL,
1904                                        /*114*/ NULL,
1905                                        NULL,
1906                                        NULL,
1907                                        NULL,
1908                                        NULL,
1909                                        /*119*/ NULL,
1910                                        NULL,
1911                                        NULL,
1912                                        NULL,
1913                                        NULL,
1914                                        /*124*/ NULL,
1915                                        NULL,
1916                                        NULL,
1917                                        NULL,
1918                                        NULL,
1919                                        /*129*/ NULL,
1920                                        NULL,
1921                                        NULL,
1922                                        NULL,
1923                                        NULL,
1924                                        /*134*/ NULL,
1925                                        NULL,
1926                                        NULL,
1927                                        NULL,
1928                                        NULL,
1929                                        /*139*/ NULL,
1930                                        NULL,
1931                                        NULL,
1932                                        MatFDColoringSetUp_SeqXAIJ,
1933                                        NULL,
1934                                        /*144*/ NULL,
1935                                        NULL,
1936                                        NULL,
1937                                        NULL,
1938                                        NULL,
1939                                        NULL,
1940                                        /*150*/ NULL,
1941                                        NULL,
1942                                        NULL};
1943 
1944 static PetscErrorCode MatStoreValues_SeqSELL(Mat mat)
1945 {
1946   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1947 
1948   PetscFunctionBegin;
1949   PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1950 
1951   /* allocate space for values if not already there */
1952   if (!a->saved_values) PetscCall(PetscMalloc1(a->sliidx[a->totalslices] + 1, &a->saved_values));
1953 
1954   /* copy values over */
1955   PetscCall(PetscArraycpy(a->saved_values, a->val, a->sliidx[a->totalslices]));
1956   PetscFunctionReturn(PETSC_SUCCESS);
1957 }
1958 
1959 static PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat)
1960 {
1961   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1962 
1963   PetscFunctionBegin;
1964   PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1965   PetscCheck(a->saved_values, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatStoreValues(A);first");
1966   PetscCall(PetscArraycpy(a->val, a->saved_values, a->sliidx[a->totalslices]));
1967   PetscFunctionReturn(PETSC_SUCCESS);
1968 }
1969 
1970 static PetscErrorCode MatSeqSELLGetFillRatio_SeqSELL(Mat mat, PetscReal *ratio)
1971 {
1972   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1973 
1974   PetscFunctionBegin;
1975   if (a->totalslices && a->sliidx[a->totalslices]) {
1976     *ratio = (PetscReal)(a->sliidx[a->totalslices] - a->nz) / a->sliidx[a->totalslices];
1977   } else {
1978     *ratio = 0.0;
1979   }
1980   PetscFunctionReturn(PETSC_SUCCESS);
1981 }
1982 
1983 static PetscErrorCode MatSeqSELLGetMaxSliceWidth_SeqSELL(Mat mat, PetscInt *slicewidth)
1984 {
1985   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1986   PetscInt     i, current_slicewidth;
1987 
1988   PetscFunctionBegin;
1989   *slicewidth = 0;
1990   for (i = 0; i < a->totalslices; i++) {
1991     current_slicewidth = (a->sliidx[i + 1] - a->sliidx[i]) / a->sliceheight;
1992     if (current_slicewidth > *slicewidth) *slicewidth = current_slicewidth;
1993   }
1994   PetscFunctionReturn(PETSC_SUCCESS);
1995 }
1996 
1997 static PetscErrorCode MatSeqSELLGetAvgSliceWidth_SeqSELL(Mat mat, PetscReal *slicewidth)
1998 {
1999   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
2000 
2001   PetscFunctionBegin;
2002   *slicewidth = 0;
2003   if (a->totalslices) { *slicewidth = (PetscReal)a->sliidx[a->totalslices] / a->sliceheight / a->totalslices; }
2004   PetscFunctionReturn(PETSC_SUCCESS);
2005 }
2006 
2007 static PetscErrorCode MatSeqSELLGetVarSliceSize_SeqSELL(Mat mat, PetscReal *variance)
2008 {
2009   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
2010   PetscReal    mean;
2011   PetscInt     i, totalslices = a->totalslices, *sliidx = a->sliidx;
2012 
2013   PetscFunctionBegin;
2014   *variance = 0;
2015   if (totalslices) {
2016     mean = (PetscReal)sliidx[totalslices] / totalslices;
2017     for (i = 1; i <= totalslices; i++) { *variance += ((PetscReal)(sliidx[i] - sliidx[i - 1]) - mean) * ((PetscReal)(sliidx[i] - sliidx[i - 1]) - mean) / totalslices; }
2018   }
2019   PetscFunctionReturn(PETSC_SUCCESS);
2020 }
2021 
2022 static PetscErrorCode MatSeqSELLSetSliceHeight_SeqSELL(Mat A, PetscInt sliceheight)
2023 {
2024   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2025 
2026   PetscFunctionBegin;
2027   if (A->preallocated) PetscFunctionReturn(PETSC_SUCCESS);
2028   PetscCheck(a->sliceheight <= 0 || a->sliceheight == sliceheight, PETSC_COMM_SELF, PETSC_ERR_SUP, "Cannot change slice height %" PetscInt_FMT " to %" PetscInt_FMT, a->sliceheight, sliceheight);
2029   a->sliceheight = sliceheight;
2030 #if defined(PETSC_HAVE_CUDA)
2031   PetscCheck(DEVICE_MEM_ALIGN % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "DEVICE_MEM_ALIGN is not divisible by the slice height %" PetscInt_FMT, sliceheight);
2032 #endif
2033   PetscFunctionReturn(PETSC_SUCCESS);
2034 }
2035 
2036 /*@C
2037   MatSeqSELLGetFillRatio - returns a ratio that indicates the irregularity of the matrix.
2038 
2039   Not Collective
2040 
2041   Input Parameter:
2042 . A - a MATSEQSELL matrix
2043 
2044   Output Parameter:
2045 . ratio - ratio of number of padded zeros to number of allocated elements
2046 
2047   Level: intermediate
2048 
2049 .seealso: `MATSEQSELL`, `MatSeqSELLGetAvgSliceWidth()`
2050 @*/
2051 PetscErrorCode MatSeqSELLGetFillRatio(Mat A, PetscReal *ratio)
2052 {
2053   PetscFunctionBegin;
2054   PetscUseMethod(A, "MatSeqSELLGetFillRatio_C", (Mat, PetscReal *), (A, ratio));
2055   PetscFunctionReturn(PETSC_SUCCESS);
2056 }
2057 
2058 /*@C
2059   MatSeqSELLGetMaxSliceWidth - returns the maximum slice width.
2060 
2061   Not Collective
2062 
2063   Input Parameter:
2064 . A - a MATSEQSELL matrix
2065 
2066   Output Parameter:
2067 . slicewidth - maximum slice width
2068 
2069   Level: intermediate
2070 
2071 .seealso: `MATSEQSELL`, `MatSeqSELLGetAvgSliceWidth()`
2072 @*/
2073 PetscErrorCode MatSeqSELLGetMaxSliceWidth(Mat A, PetscInt *slicewidth)
2074 {
2075   PetscFunctionBegin;
2076   PetscUseMethod(A, "MatSeqSELLGetMaxSliceWidth_C", (Mat, PetscInt *), (A, slicewidth));
2077   PetscFunctionReturn(PETSC_SUCCESS);
2078 }
2079 
2080 /*@C
2081   MatSeqSELLGetAvgSliceWidth - returns the average slice width.
2082 
2083   Not Collective
2084 
2085   Input Parameter:
2086 . A - a MATSEQSELL matrix
2087 
2088   Output Parameter:
2089 . slicewidth - average slice width
2090 
2091   Level: intermediate
2092 
2093 .seealso: `MATSEQSELL`, `MatSeqSELLGetMaxSliceWidth()`
2094 @*/
2095 PetscErrorCode MatSeqSELLGetAvgSliceWidth(Mat A, PetscReal *slicewidth)
2096 {
2097   PetscFunctionBegin;
2098   PetscUseMethod(A, "MatSeqSELLGetAvgSliceWidth_C", (Mat, PetscReal *), (A, slicewidth));
2099   PetscFunctionReturn(PETSC_SUCCESS);
2100 }
2101 
2102 /*@C
2103   MatSeqSELLSetSliceHeight - sets the slice height.
2104 
2105   Not Collective
2106 
2107   Input Parameters:
2108 + A           - a MATSEQSELL matrix
2109 - sliceheight - slice height
2110 
2111   Notes:
2112   You cannot change the slice height once it have been set.
2113 
2114   The slice height must be set before MatSetUp() or MatXXXSetPreallocation() is called.
2115 
2116   Level: intermediate
2117 
2118 .seealso: `MATSEQSELL`, `MatSeqSELLGetVarSliceSize()`
2119 @*/
2120 PetscErrorCode MatSeqSELLSetSliceHeight(Mat A, PetscInt sliceheight)
2121 {
2122   PetscFunctionBegin;
2123   PetscUseMethod(A, "MatSeqSELLSetSliceHeight_C", (Mat, PetscInt), (A, sliceheight));
2124   PetscFunctionReturn(PETSC_SUCCESS);
2125 }
2126 
2127 /*@C
2128   MatSeqSELLGetVarSliceSize - returns the variance of the slice size.
2129 
2130   Not Collective
2131 
2132   Input Parameter:
2133 . A - a MATSEQSELL matrix
2134 
2135   Output Parameter:
2136 . variance - variance of the slice size
2137 
2138   Level: intermediate
2139 
2140 .seealso: `MATSEQSELL`, `MatSeqSELLSetSliceHeight()`
2141 @*/
2142 PetscErrorCode MatSeqSELLGetVarSliceSize(Mat A, PetscReal *variance)
2143 {
2144   PetscFunctionBegin;
2145   PetscUseMethod(A, "MatSeqSELLGetVarSliceSize_C", (Mat, PetscReal *), (A, variance));
2146   PetscFunctionReturn(PETSC_SUCCESS);
2147 }
2148 
2149 #if defined(PETSC_HAVE_CUDA)
2150 PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat);
2151 #endif
2152 
2153 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B)
2154 {
2155   Mat_SeqSELL *b;
2156   PetscMPIInt  size;
2157 
2158   PetscFunctionBegin;
2159   PetscCall(PetscCitationsRegister(citation, &cited));
2160   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
2161   PetscCheck(size <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Comm must be of size 1");
2162 
2163   PetscCall(PetscNew(&b));
2164 
2165   B->data   = (void *)b;
2166   B->ops[0] = MatOps_Values;
2167 
2168   b->row                = NULL;
2169   b->col                = NULL;
2170   b->icol               = NULL;
2171   b->reallocs           = 0;
2172   b->ignorezeroentries  = PETSC_FALSE;
2173   b->roworiented        = PETSC_TRUE;
2174   b->nonew              = 0;
2175   b->diag               = NULL;
2176   b->solve_work         = NULL;
2177   B->spptr              = NULL;
2178   b->saved_values       = NULL;
2179   b->idiag              = NULL;
2180   b->mdiag              = NULL;
2181   b->ssor_work          = NULL;
2182   b->omega              = 1.0;
2183   b->fshift             = 0.0;
2184   b->idiagvalid         = PETSC_FALSE;
2185   b->keepnonzeropattern = PETSC_FALSE;
2186   b->sliceheight        = 0;
2187 
2188   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELL));
2189   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetArray_C", MatSeqSELLGetArray_SeqSELL));
2190   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLRestoreArray_C", MatSeqSELLRestoreArray_SeqSELL));
2191   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqSELL));
2192   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqSELL));
2193   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetPreallocation_C", MatSeqSELLSetPreallocation_SeqSELL));
2194   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqaij_C", MatConvert_SeqSELL_SeqAIJ));
2195 #if defined(PETSC_HAVE_CUDA)
2196   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqsellcuda_C", MatConvert_SeqSELL_SeqSELLCUDA));
2197 #endif
2198   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetFillRatio_C", MatSeqSELLGetFillRatio_SeqSELL));
2199   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetMaxSliceWidth_C", MatSeqSELLGetMaxSliceWidth_SeqSELL));
2200   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetAvgSliceWidth_C", MatSeqSELLGetAvgSliceWidth_SeqSELL));
2201   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetVarSliceSize_C", MatSeqSELLGetVarSliceSize_SeqSELL));
2202   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetSliceHeight_C", MatSeqSELLSetSliceHeight_SeqSELL));
2203 
2204   PetscObjectOptionsBegin((PetscObject)B);
2205   {
2206     PetscInt  newsh = -1;
2207     PetscBool flg;
2208 #if defined(PETSC_HAVE_CUDA)
2209     PetscInt chunksize = 0;
2210 #endif
2211 
2212     PetscCall(PetscOptionsInt("-mat_sell_slice_height", "Set the slice height used to store SELL matrix", "MatSELLSetSliceHeight", newsh, &newsh, &flg));
2213     if (flg) { PetscCall(MatSeqSELLSetSliceHeight(B, newsh)); }
2214 #if defined(PETSC_HAVE_CUDA)
2215     PetscCall(PetscOptionsInt("-mat_sell_chunk_size", "Set the chunksize for load-balanced CUDA kernels. Choices include 64,128,256,512,1024", NULL, chunksize, &chunksize, &flg));
2216     if (flg) {
2217       PetscCheck(chunksize >= 64 && chunksize <= 1024 && chunksize % 64 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "chunksize must be a number in {64,128,256,512,1024}: value %" PetscInt_FMT, chunksize);
2218       b->chunksize = chunksize;
2219     }
2220 #endif
2221   }
2222   PetscOptionsEnd();
2223   PetscFunctionReturn(PETSC_SUCCESS);
2224 }
2225 
2226 /*
2227  Given a matrix generated with MatGetFactor() duplicates all the information in A into B
2228  */
2229 static PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace)
2230 {
2231   Mat_SeqSELL *c = (Mat_SeqSELL *)C->data, *a = (Mat_SeqSELL *)A->data;
2232   PetscInt     i, m                           = A->rmap->n;
2233   PetscInt     totalslices = a->totalslices;
2234 
2235   PetscFunctionBegin;
2236   C->factortype = A->factortype;
2237   c->row        = NULL;
2238   c->col        = NULL;
2239   c->icol       = NULL;
2240   c->reallocs   = 0;
2241   C->assembled  = PETSC_TRUE;
2242 
2243   PetscCall(PetscLayoutReference(A->rmap, &C->rmap));
2244   PetscCall(PetscLayoutReference(A->cmap, &C->cmap));
2245 
2246   PetscCall(PetscMalloc1(a->sliceheight * totalslices, &c->rlen));
2247   PetscCall(PetscMalloc1(totalslices + 1, &c->sliidx));
2248 
2249   for (i = 0; i < m; i++) c->rlen[i] = a->rlen[i];
2250   for (i = 0; i < totalslices + 1; i++) c->sliidx[i] = a->sliidx[i];
2251 
2252   /* allocate the matrix space */
2253   if (mallocmatspace) {
2254     PetscCall(PetscMalloc2(a->maxallocmat, &c->val, a->maxallocmat, &c->colidx));
2255 
2256     c->singlemalloc = PETSC_TRUE;
2257 
2258     if (m > 0) {
2259       PetscCall(PetscArraycpy(c->colidx, a->colidx, a->maxallocmat));
2260       if (cpvalues == MAT_COPY_VALUES) {
2261         PetscCall(PetscArraycpy(c->val, a->val, a->maxallocmat));
2262       } else {
2263         PetscCall(PetscArrayzero(c->val, a->maxallocmat));
2264       }
2265     }
2266   }
2267 
2268   c->ignorezeroentries = a->ignorezeroentries;
2269   c->roworiented       = a->roworiented;
2270   c->nonew             = a->nonew;
2271   if (a->diag) {
2272     PetscCall(PetscMalloc1(m, &c->diag));
2273     for (i = 0; i < m; i++) c->diag[i] = a->diag[i];
2274   } else c->diag = NULL;
2275 
2276   c->solve_work         = NULL;
2277   c->saved_values       = NULL;
2278   c->idiag              = NULL;
2279   c->ssor_work          = NULL;
2280   c->keepnonzeropattern = a->keepnonzeropattern;
2281   c->free_val           = PETSC_TRUE;
2282   c->free_colidx        = PETSC_TRUE;
2283 
2284   c->maxallocmat  = a->maxallocmat;
2285   c->maxallocrow  = a->maxallocrow;
2286   c->rlenmax      = a->rlenmax;
2287   c->nz           = a->nz;
2288   C->preallocated = PETSC_TRUE;
2289 
2290   c->nonzerorowcnt = a->nonzerorowcnt;
2291   C->nonzerostate  = A->nonzerostate;
2292 
2293   PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist));
2294   PetscFunctionReturn(PETSC_SUCCESS);
2295 }
2296 
2297 PetscErrorCode MatDuplicate_SeqSELL(Mat A, MatDuplicateOption cpvalues, Mat *B)
2298 {
2299   PetscFunctionBegin;
2300   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
2301   PetscCall(MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n));
2302   if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2303   PetscCall(MatSetType(*B, ((PetscObject)A)->type_name));
2304   PetscCall(MatDuplicateNoCreate_SeqSELL(*B, A, cpvalues, PETSC_TRUE));
2305   PetscFunctionReturn(PETSC_SUCCESS);
2306 }
2307 
2308 /*MC
2309    MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices,
2310    based on the sliced Ellpack format, {cite}`zhangellpack2018`
2311 
2312    Options Database Key:
2313 . -mat_type seqsell - sets the matrix type to "`MATSEQELL` during a call to `MatSetFromOptions()`
2314 
2315    Level: beginner
2316 
2317 .seealso: `Mat`, `MatCreateSeqSell()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ`
2318 M*/
2319 
2320 /*MC
2321    MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices, {cite}`zhangellpack2018`
2322 
2323    This matrix type is identical to `MATSEQSELL` when constructed with a single process communicator,
2324    and `MATMPISELL` otherwise.  As a result, for single process communicators,
2325   `MatSeqSELLSetPreallocation()` is supported, and similarly `MatMPISELLSetPreallocation()` is supported
2326   for communicators controlling multiple processes.  It is recommended that you call both of
2327   the above preallocation routines for simplicity.
2328 
2329    Options Database Key:
2330 . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions()
2331 
2332   Level: beginner
2333 
2334   Notes:
2335   This format is only supported for real scalars, double precision, and 32-bit indices (the defaults).
2336 
2337   It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of
2338   non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement.
2339 
2340   Developer Notes:
2341   On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance.
2342 
2343   The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8
2344 .vb
2345                             (2 0  3 4)
2346    Consider the matrix A =  (5 0  6 0)
2347                             (0 0  7 8)
2348                             (0 0  9 9)
2349 
2350    symbolically the Ellpack format can be written as
2351 
2352         (2 3 4 |)           (0 2 3 |)
2353    v =  (5 6 0 |)  colidx = (0 2 2 |)
2354         --------            ---------
2355         (7 8 |)             (2 3 |)
2356         (9 9 |)             (2 3 |)
2357 
2358     The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case).
2359     Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and
2360     zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice.
2361 
2362     The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9)  and for colidx is (0 0 2 2 3 2 2 2 3 3)
2363 
2364 .ve
2365 
2366     See `MatMult_SeqSELL()` for how this format is used with the SIMD operations to achieve high performance.
2367 
2368 .seealso: `Mat`, `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSell()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ`
2369 M*/
2370 
2371 /*@C
2372   MatCreateSeqSELL - Creates a sparse matrix in `MATSEQSELL` format.
2373 
2374   Collective
2375 
2376   Input Parameters:
2377 + comm    - MPI communicator, set to `PETSC_COMM_SELF`
2378 . m       - number of rows
2379 . n       - number of columns
2380 . rlenmax - maximum number of nonzeros in a row, ignored if `rlen` is provided
2381 - rlen    - array containing the number of nonzeros in the various rows (possibly different for each row) or NULL
2382 
2383   Output Parameter:
2384 . A - the matrix
2385 
2386   Level: intermediate
2387 
2388   Notes:
2389   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
2390   MatXXXXSetPreallocation() paradigm instead of this routine directly.
2391   [MatXXXXSetPreallocation() is, for example, `MatSeqSELLSetPreallocation()`]
2392 
2393   Specify the preallocated storage with either `rlenmax` or `rlen` (not both).
2394   Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory
2395   allocation.
2396 
2397 .seealso: `Mat`, `MATSEQSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATMPISELL`
2398  @*/
2399 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt rlenmax, const PetscInt rlen[], Mat *A)
2400 {
2401   PetscFunctionBegin;
2402   PetscCall(MatCreate(comm, A));
2403   PetscCall(MatSetSizes(*A, m, n, m, n));
2404   PetscCall(MatSetType(*A, MATSEQSELL));
2405   PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A, rlenmax, rlen));
2406   PetscFunctionReturn(PETSC_SUCCESS);
2407 }
2408 
2409 PetscErrorCode MatEqual_SeqSELL(Mat A, Mat B, PetscBool *flg)
2410 {
2411   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data, *b = (Mat_SeqSELL *)B->data;
2412   PetscInt     totalslices = a->totalslices;
2413 
2414   PetscFunctionBegin;
2415   /* If the  matrix dimensions are not equal,or no of nonzeros */
2416   if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) || (a->nz != b->nz) || (a->rlenmax != b->rlenmax)) {
2417     *flg = PETSC_FALSE;
2418     PetscFunctionReturn(PETSC_SUCCESS);
2419   }
2420   /* if the a->colidx are the same */
2421   PetscCall(PetscArraycmp(a->colidx, b->colidx, a->sliidx[totalslices], flg));
2422   if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
2423   /* if a->val are the same */
2424   PetscCall(PetscArraycmp(a->val, b->val, a->sliidx[totalslices], flg));
2425   PetscFunctionReturn(PETSC_SUCCESS);
2426 }
2427 
2428 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A)
2429 {
2430   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2431 
2432   PetscFunctionBegin;
2433   a->idiagvalid = PETSC_FALSE;
2434   PetscFunctionReturn(PETSC_SUCCESS);
2435 }
2436 
2437 PetscErrorCode MatConjugate_SeqSELL(Mat A)
2438 {
2439 #if defined(PETSC_USE_COMPLEX)
2440   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2441   PetscInt     i;
2442   PetscScalar *val = a->val;
2443 
2444   PetscFunctionBegin;
2445   for (i = 0; i < a->sliidx[a->totalslices]; i++) { val[i] = PetscConj(val[i]); }
2446   #if defined(PETSC_HAVE_CUDA)
2447   if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU;
2448   #endif
2449 #else
2450   PetscFunctionBegin;
2451 #endif
2452   PetscFunctionReturn(PETSC_SUCCESS);
2453 }
2454