xref: /petsc/src/mat/impls/sell/seq/sell.c (revision a4e35b1925eceef64945ea472b84f2bf06a67b5e)
1 
2 /*
3   Defines the basic matrix operations for the SELL matrix storage format.
4 */
5 #include <../src/mat/impls/sell/seq/sell.h> /*I   "petscmat.h"  I*/
6 #include <petscblaslapack.h>
7 #include <petsc/private/kernels/blocktranspose.h>
8 
9 static PetscBool  cited      = PETSC_FALSE;
10 static const char citation[] = "@inproceedings{ZhangELLPACK2018,\n"
11                                " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n"
12                                " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n"
13                                " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n"
14                                " year = 2018\n"
15                                "}\n";
16 
17 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
18 
19   #include <immintrin.h>
20 
21   #if !defined(_MM_SCALE_8)
22     #define _MM_SCALE_8 8
23   #endif
24 
25   #if defined(__AVX512F__)
26     /* these do not work
27    vec_idx  = _mm512_loadunpackhi_epi32(vec_idx,acolidx);
28    vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval);
29   */
30     #define AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
31       /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \
32       vec_idx  = _mm256_loadu_si256((__m256i const *)acolidx); \
33       vec_vals = _mm512_loadu_pd(aval); \
34       vec_x    = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); \
35       vec_y    = _mm512_fmadd_pd(vec_x, vec_vals, vec_y)
36   #elif defined(__AVX2__) && defined(__FMA__)
37     #define AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
38       vec_vals = _mm256_loadu_pd(aval); \
39       vec_idx  = _mm_loadu_si128((__m128i const *)acolidx); /* SSE2 */ \
40       vec_x    = _mm256_i32gather_pd(x, vec_idx, _MM_SCALE_8); \
41       vec_y    = _mm256_fmadd_pd(vec_x, vec_vals, vec_y)
42   #endif
43 #endif /* PETSC_HAVE_IMMINTRIN_H */
44 
45 /*@C
46   MatSeqSELLSetPreallocation - For good matrix assembly performance
47   the user should preallocate the matrix storage by setting the parameter `nz`
48   (or the array `nnz`).
49 
50   Collective
51 
52   Input Parameters:
53 + B       - The `MATSEQSELL` matrix
54 . rlenmax - number of nonzeros per row (same for all rows), ignored if `rlen` is provided
55 - rlen    - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
56 
57   Level: intermediate
58 
59   Notes:
60   Specify the preallocated storage with either `rlenmax` or `rlen` (not both).
61   Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory
62   allocation.
63 
64   You can call `MatGetInfo()` to get information on how effective the preallocation was;
65   for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
66   You can also run with the option `-info` and look for messages with the string
67   malloc in them to see if additional memory allocation was needed.
68 
69   Developer Notes:
70   Use `rlenmax` of `MAT_SKIP_ALLOCATION` to not allocate any space for the matrix
71   entries or columns indices.
72 
73   The maximum number of nonzeos in any row should be as accurate as possible.
74   If it is underestimated, you will get bad performance due to reallocation
75   (`MatSeqXSELLReallocateSELL()`).
76 
77 .seealso: `Mat`, `MATSEQSELL`, `MATSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()`
78  @*/
79 PetscErrorCode MatSeqSELLSetPreallocation(Mat B, PetscInt rlenmax, const PetscInt rlen[])
80 {
81   PetscFunctionBegin;
82   PetscValidHeaderSpecific(B, MAT_CLASSID, 1);
83   PetscValidType(B, 1);
84   PetscTryMethod(B, "MatSeqSELLSetPreallocation_C", (Mat, PetscInt, const PetscInt[]), (B, rlenmax, rlen));
85   PetscFunctionReturn(PETSC_SUCCESS);
86 }
87 
88 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B, PetscInt maxallocrow, const PetscInt rlen[])
89 {
90   Mat_SeqSELL *b;
91   PetscInt     i, j, totalslices;
92 #if defined(PETSC_HAVE_CUDA)
93   PetscInt rlenmax = 0;
94 #endif
95   PetscBool skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE;
96 
97   PetscFunctionBegin;
98   if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE;
99   if (maxallocrow == MAT_SKIP_ALLOCATION) {
100     skipallocation = PETSC_TRUE;
101     maxallocrow    = 0;
102   }
103 
104   PetscCall(PetscLayoutSetUp(B->rmap));
105   PetscCall(PetscLayoutSetUp(B->cmap));
106 
107   /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */
108   if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5;
109   PetscCheck(maxallocrow >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "maxallocrow cannot be less than 0: value %" PetscInt_FMT, maxallocrow);
110   if (rlen) {
111     for (i = 0; i < B->rmap->n; i++) {
112       PetscCheck(rlen[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, rlen[i]);
113       PetscCheck(rlen[i] <= B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT, i, rlen[i], B->cmap->n);
114     }
115   }
116 
117   B->preallocated = PETSC_TRUE;
118 
119   b = (Mat_SeqSELL *)B->data;
120 
121   if (!b->sliceheight) { /* not set yet */
122 #if defined(PETSC_HAVE_CUDA)
123     b->sliceheight = 16;
124 #else
125     b->sliceheight = 8;
126 #endif
127   }
128   totalslices    = PetscCeilInt(B->rmap->n, b->sliceheight);
129   b->totalslices = totalslices;
130   if (!skipallocation) {
131     if (B->rmap->n % b->sliceheight) PetscCall(PetscInfo(B, "Padding rows to the SEQSELL matrix because the number of rows is not the multiple of the slice height (value %" PetscInt_FMT ")\n", B->rmap->n));
132 
133     if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */
134       PetscCall(PetscMalloc1(totalslices + 1, &b->sliidx));
135     }
136     if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */
137       if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10;
138       else if (maxallocrow < 0) maxallocrow = 1;
139 #if defined(PETSC_HAVE_CUDA)
140       rlenmax = maxallocrow;
141       /* Pad the slice to DEVICE_MEM_ALIGN */
142       while (b->sliceheight * maxallocrow % DEVICE_MEM_ALIGN) maxallocrow++;
143 #endif
144       for (i = 0; i <= totalslices; i++) b->sliidx[i] = b->sliceheight * i * maxallocrow;
145     } else {
146 #if defined(PETSC_HAVE_CUDA)
147       PetscInt mul = DEVICE_MEM_ALIGN / b->sliceheight;
148 #endif
149       maxallocrow  = 0;
150       b->sliidx[0] = 0;
151       for (i = 1; i < totalslices; i++) {
152         b->sliidx[i] = 0;
153         for (j = 0; j < b->sliceheight; j++) { b->sliidx[i] = PetscMax(b->sliidx[i], rlen[b->sliceheight * (i - 1) + j]); }
154 #if defined(PETSC_HAVE_CUDA)
155         rlenmax = PetscMax(b->sliidx[i], rlenmax);
156         /* Pad the slice to DEVICE_MEM_ALIGN */
157         b->sliidx[i] = ((b->sliidx[i] - 1) / mul + 1) * mul;
158 #endif
159         maxallocrow = PetscMax(b->sliidx[i], maxallocrow);
160         PetscCall(PetscIntSumError(b->sliidx[i - 1], b->sliceheight * b->sliidx[i], &b->sliidx[i]));
161       }
162       /* last slice */
163       b->sliidx[totalslices] = 0;
164       for (j = b->sliceheight * (totalslices - 1); j < B->rmap->n; j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices], rlen[j]);
165 #if defined(PETSC_HAVE_CUDA)
166       rlenmax                = PetscMax(b->sliidx[i], rlenmax);
167       b->sliidx[totalslices] = ((b->sliidx[totalslices] - 1) / mul + 1) * mul;
168 #endif
169       maxallocrow            = PetscMax(b->sliidx[totalslices], maxallocrow);
170       b->sliidx[totalslices] = b->sliidx[totalslices - 1] + b->sliceheight * b->sliidx[totalslices];
171     }
172 
173     /* allocate space for val, colidx, rlen */
174     /* FIXME: should B's old memory be unlogged? */
175     PetscCall(MatSeqXSELLFreeSELL(B, &b->val, &b->colidx));
176     /* FIXME: assuming an element of the bit array takes 8 bits */
177     PetscCall(PetscMalloc2(b->sliidx[totalslices], &b->val, b->sliidx[totalslices], &b->colidx));
178     /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */
179     PetscCall(PetscCalloc1(b->sliceheight * totalslices, &b->rlen));
180 
181     b->singlemalloc = PETSC_TRUE;
182     b->free_val     = PETSC_TRUE;
183     b->free_colidx  = PETSC_TRUE;
184   } else {
185     b->free_val    = PETSC_FALSE;
186     b->free_colidx = PETSC_FALSE;
187   }
188 
189   b->nz          = 0;
190   b->maxallocrow = maxallocrow;
191 #if defined(PETSC_HAVE_CUDA)
192   b->rlenmax = rlenmax;
193 #else
194   b->rlenmax = maxallocrow;
195 #endif
196   b->maxallocmat      = b->sliidx[totalslices];
197   B->info.nz_unneeded = (double)b->maxallocmat;
198   if (realalloc) PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));
199   PetscFunctionReturn(PETSC_SUCCESS);
200 }
201 
202 PetscErrorCode MatGetRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
203 {
204   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
205   PetscInt     shift;
206 
207   PetscFunctionBegin;
208   PetscCheck(row >= 0 && row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range", row);
209   if (nz) *nz = a->rlen[row];
210   shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight);
211   if (!a->getrowcols) { PetscCall(PetscMalloc2(a->rlenmax, &a->getrowcols, a->rlenmax, &a->getrowvals)); }
212   if (idx) {
213     PetscInt j;
214     for (j = 0; j < a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift + a->sliceheight * j];
215     *idx = a->getrowcols;
216   }
217   if (v) {
218     PetscInt j;
219     for (j = 0; j < a->rlen[row]; j++) a->getrowvals[j] = a->val[shift + a->sliceheight * j];
220     *v = a->getrowvals;
221   }
222   PetscFunctionReturn(PETSC_SUCCESS);
223 }
224 
225 PetscErrorCode MatRestoreRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
226 {
227   PetscFunctionBegin;
228   PetscFunctionReturn(PETSC_SUCCESS);
229 }
230 
231 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
232 {
233   Mat          B;
234   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
235   PetscInt     i;
236 
237   PetscFunctionBegin;
238   if (reuse == MAT_REUSE_MATRIX) {
239     B = *newmat;
240     PetscCall(MatZeroEntries(B));
241   } else {
242     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
243     PetscCall(MatSetSizes(B, A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N));
244     PetscCall(MatSetType(B, MATSEQAIJ));
245     PetscCall(MatSeqAIJSetPreallocation(B, 0, a->rlen));
246   }
247 
248   for (i = 0; i < A->rmap->n; i++) {
249     PetscInt     nz = 0, *cols = NULL;
250     PetscScalar *vals = NULL;
251 
252     PetscCall(MatGetRow_SeqSELL(A, i, &nz, &cols, &vals));
253     PetscCall(MatSetValues(B, 1, &i, nz, cols, vals, INSERT_VALUES));
254     PetscCall(MatRestoreRow_SeqSELL(A, i, &nz, &cols, &vals));
255   }
256 
257   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
258   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
259   B->rmap->bs = A->rmap->bs;
260 
261   if (reuse == MAT_INPLACE_MATRIX) {
262     PetscCall(MatHeaderReplace(A, &B));
263   } else {
264     *newmat = B;
265   }
266   PetscFunctionReturn(PETSC_SUCCESS);
267 }
268 
269 #include <../src/mat/impls/aij/seq/aij.h>
270 
271 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
272 {
273   Mat                B;
274   Mat_SeqAIJ        *a  = (Mat_SeqAIJ *)A->data;
275   PetscInt          *ai = a->i, m = A->rmap->N, n = A->cmap->N, i, *rowlengths, row, ncols;
276   const PetscInt    *cols;
277   const PetscScalar *vals;
278 
279   PetscFunctionBegin;
280 
281   if (reuse == MAT_REUSE_MATRIX) {
282     B = *newmat;
283   } else {
284     if (PetscDefined(USE_DEBUG) || !a->ilen) {
285       PetscCall(PetscMalloc1(m, &rowlengths));
286       for (i = 0; i < m; i++) rowlengths[i] = ai[i + 1] - ai[i];
287     }
288     if (PetscDefined(USE_DEBUG) && a->ilen) {
289       PetscBool eq;
290       PetscCall(PetscMemcmp(rowlengths, a->ilen, m * sizeof(PetscInt), &eq));
291       PetscCheck(eq, PETSC_COMM_SELF, PETSC_ERR_PLIB, "SeqAIJ ilen array incorrect");
292       PetscCall(PetscFree(rowlengths));
293       rowlengths = a->ilen;
294     } else if (a->ilen) rowlengths = a->ilen;
295     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
296     PetscCall(MatSetSizes(B, m, n, m, n));
297     PetscCall(MatSetType(B, MATSEQSELL));
298     PetscCall(MatSeqSELLSetPreallocation(B, 0, rowlengths));
299     if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths));
300   }
301 
302   for (row = 0; row < m; row++) {
303     PetscCall(MatGetRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
304     PetscCall(MatSetValues_SeqSELL(B, 1, &row, ncols, cols, vals, INSERT_VALUES));
305     PetscCall(MatRestoreRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
306   }
307   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
308   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
309   B->rmap->bs = A->rmap->bs;
310 
311   if (reuse == MAT_INPLACE_MATRIX) {
312     PetscCall(MatHeaderReplace(A, &B));
313   } else {
314     *newmat = B;
315   }
316   PetscFunctionReturn(PETSC_SUCCESS);
317 }
318 
319 PetscErrorCode MatMult_SeqSELL(Mat A, Vec xx, Vec yy)
320 {
321   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
322   PetscScalar       *y;
323   const PetscScalar *x;
324   const MatScalar   *aval        = a->val;
325   PetscInt           totalslices = a->totalslices;
326   const PetscInt    *acolidx     = a->colidx;
327   PetscInt           i, j;
328 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
329   __m512d  vec_x, vec_y, vec_vals;
330   __m256i  vec_idx;
331   __mmask8 mask;
332   __m512d  vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
333   __m256i  vec_idx2, vec_idx3, vec_idx4;
334 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
335   __m128i   vec_idx;
336   __m256d   vec_x, vec_y, vec_y2, vec_vals;
337   MatScalar yval;
338   PetscInt  r, rows_left, row, nnz_in_row;
339 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
340   __m128d   vec_x_tmp;
341   __m256d   vec_x, vec_y, vec_y2, vec_vals;
342   MatScalar yval;
343   PetscInt  r, rows_left, row, nnz_in_row;
344 #else
345   PetscInt     k, sliceheight = a->sliceheight;
346   PetscScalar *sum;
347 #endif
348 
349 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
350   #pragma disjoint(*x, *y, *aval)
351 #endif
352 
353   PetscFunctionBegin;
354   PetscCall(VecGetArrayRead(xx, &x));
355   PetscCall(VecGetArray(yy, &y));
356 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
357   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
358   for (i = 0; i < totalslices; i++) { /* loop over slices */
359     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
360     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
361 
362     vec_y  = _mm512_setzero_pd();
363     vec_y2 = _mm512_setzero_pd();
364     vec_y3 = _mm512_setzero_pd();
365     vec_y4 = _mm512_setzero_pd();
366 
367     j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */
368     switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
369     case 3:
370       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
371       acolidx += 8;
372       aval += 8;
373       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
374       acolidx += 8;
375       aval += 8;
376       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
377       acolidx += 8;
378       aval += 8;
379       j += 3;
380       break;
381     case 2:
382       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
383       acolidx += 8;
384       aval += 8;
385       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
386       acolidx += 8;
387       aval += 8;
388       j += 2;
389       break;
390     case 1:
391       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
392       acolidx += 8;
393       aval += 8;
394       j += 1;
395       break;
396     }
397   #pragma novector
398     for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
399       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
400       acolidx += 8;
401       aval += 8;
402       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
403       acolidx += 8;
404       aval += 8;
405       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
406       acolidx += 8;
407       aval += 8;
408       AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
409       acolidx += 8;
410       aval += 8;
411     }
412 
413     vec_y = _mm512_add_pd(vec_y, vec_y2);
414     vec_y = _mm512_add_pd(vec_y, vec_y3);
415     vec_y = _mm512_add_pd(vec_y, vec_y4);
416     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
417       mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
418       _mm512_mask_storeu_pd(&y[8 * i], mask, vec_y);
419     } else {
420       _mm512_storeu_pd(&y[8 * i], vec_y);
421     }
422   }
423 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
424   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
425   for (i = 0; i < totalslices; i++) { /* loop over full slices */
426     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
427     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
428 
429     /* last slice may have padding rows. Don't use vectorization. */
430     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
431       rows_left = A->rmap->n - 8 * i;
432       for (r = 0; r < rows_left; ++r) {
433         yval       = (MatScalar)0;
434         row        = 8 * i + r;
435         nnz_in_row = a->rlen[row];
436         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
437         y[row] = yval;
438       }
439       break;
440     }
441 
442     vec_y  = _mm256_setzero_pd();
443     vec_y2 = _mm256_setzero_pd();
444 
445   /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
446   #pragma novector
447   #pragma unroll(2)
448     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
449       AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
450       aval += 4;
451       acolidx += 4;
452       AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y2);
453       aval += 4;
454       acolidx += 4;
455     }
456 
457     _mm256_storeu_pd(y + i * 8, vec_y);
458     _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
459   }
460 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
461   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
462   for (i = 0; i < totalslices; i++) { /* loop over full slices */
463     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
464     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
465 
466     vec_y  = _mm256_setzero_pd();
467     vec_y2 = _mm256_setzero_pd();
468 
469     /* last slice may have padding rows. Don't use vectorization. */
470     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
471       rows_left = A->rmap->n - 8 * i;
472       for (r = 0; r < rows_left; ++r) {
473         yval       = (MatScalar)0;
474         row        = 8 * i + r;
475         nnz_in_row = a->rlen[row];
476         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
477         y[row] = yval;
478       }
479       break;
480     }
481 
482   /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
483   #pragma novector
484   #pragma unroll(2)
485     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
486       vec_vals  = _mm256_loadu_pd(aval);
487       vec_x_tmp = _mm_setzero_pd();
488       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
489       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
490       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
491       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
492       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
493       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
494       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
495       aval += 4;
496 
497       vec_vals  = _mm256_loadu_pd(aval);
498       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
499       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
500       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
501       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
502       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
503       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
504       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
505       aval += 4;
506     }
507 
508     _mm256_storeu_pd(y + i * 8, vec_y);
509     _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
510   }
511 #else
512   PetscCall(PetscMalloc1(sliceheight, &sum));
513   for (i = 0; i < totalslices; i++) { /* loop over slices */
514     for (j = 0; j < sliceheight; j++) {
515       sum[j] = 0.0;
516       for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]];
517     }
518     if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { /* if last slice has padding rows */
519       for (j = 0; j < (A->rmap->n % sliceheight); j++) y[sliceheight * i + j] = sum[j];
520     } else {
521       for (j = 0; j < sliceheight; j++) y[sliceheight * i + j] = sum[j];
522     }
523   }
524   PetscCall(PetscFree(sum));
525 #endif
526 
527   PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); /* theoretical minimal FLOPs */
528   PetscCall(VecRestoreArrayRead(xx, &x));
529   PetscCall(VecRestoreArray(yy, &y));
530   PetscFunctionReturn(PETSC_SUCCESS);
531 }
532 
533 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h>
534 PetscErrorCode MatMultAdd_SeqSELL(Mat A, Vec xx, Vec yy, Vec zz)
535 {
536   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
537   PetscScalar       *y, *z;
538   const PetscScalar *x;
539   const MatScalar   *aval        = a->val;
540   PetscInt           totalslices = a->totalslices;
541   const PetscInt    *acolidx     = a->colidx;
542   PetscInt           i, j;
543 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
544   __m512d  vec_x, vec_y, vec_vals;
545   __m256i  vec_idx;
546   __mmask8 mask = 0;
547   __m512d  vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
548   __m256i  vec_idx2, vec_idx3, vec_idx4;
549 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
550   __m128d   vec_x_tmp;
551   __m256d   vec_x, vec_y, vec_y2, vec_vals;
552   MatScalar yval;
553   PetscInt  r, row, nnz_in_row;
554 #else
555   PetscInt     k, sliceheight = a->sliceheight;
556   PetscScalar *sum;
557 #endif
558 
559 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
560   #pragma disjoint(*x, *y, *aval)
561 #endif
562 
563   PetscFunctionBegin;
564   if (!a->nz) {
565     PetscCall(VecCopy(yy, zz));
566     PetscFunctionReturn(PETSC_SUCCESS);
567   }
568   PetscCall(VecGetArrayRead(xx, &x));
569   PetscCall(VecGetArrayPair(yy, zz, &y, &z));
570 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
571   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
572   for (i = 0; i < totalslices; i++) { /* loop over slices */
573     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
574     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
575 
576     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
577       mask  = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
578       vec_y = _mm512_mask_loadu_pd(vec_y, mask, &y[8 * i]);
579     } else {
580       vec_y = _mm512_loadu_pd(&y[8 * i]);
581     }
582     vec_y2 = _mm512_setzero_pd();
583     vec_y3 = _mm512_setzero_pd();
584     vec_y4 = _mm512_setzero_pd();
585 
586     j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */
587     switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
588     case 3:
589       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
590       acolidx += 8;
591       aval += 8;
592       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
593       acolidx += 8;
594       aval += 8;
595       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
596       acolidx += 8;
597       aval += 8;
598       j += 3;
599       break;
600     case 2:
601       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
602       acolidx += 8;
603       aval += 8;
604       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
605       acolidx += 8;
606       aval += 8;
607       j += 2;
608       break;
609     case 1:
610       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
611       acolidx += 8;
612       aval += 8;
613       j += 1;
614       break;
615     }
616   #pragma novector
617     for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
618       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
619       acolidx += 8;
620       aval += 8;
621       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
622       acolidx += 8;
623       aval += 8;
624       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
625       acolidx += 8;
626       aval += 8;
627       AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
628       acolidx += 8;
629       aval += 8;
630     }
631 
632     vec_y = _mm512_add_pd(vec_y, vec_y2);
633     vec_y = _mm512_add_pd(vec_y, vec_y3);
634     vec_y = _mm512_add_pd(vec_y, vec_y4);
635     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
636       _mm512_mask_storeu_pd(&z[8 * i], mask, vec_y);
637     } else {
638       _mm512_storeu_pd(&z[8 * i], vec_y);
639     }
640   }
641 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
642   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
643   for (i = 0; i < totalslices; i++) { /* loop over full slices */
644     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
645     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
646 
647     /* last slice may have padding rows. Don't use vectorization. */
648     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
649       for (r = 0; r < (A->rmap->n & 0x07); ++r) {
650         row        = 8 * i + r;
651         yval       = (MatScalar)0.0;
652         nnz_in_row = a->rlen[row];
653         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
654         z[row] = y[row] + yval;
655       }
656       break;
657     }
658 
659     vec_y  = _mm256_loadu_pd(y + 8 * i);
660     vec_y2 = _mm256_loadu_pd(y + 8 * i + 4);
661 
662     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
663     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
664       vec_vals  = _mm256_loadu_pd(aval);
665       vec_x_tmp = _mm_setzero_pd();
666       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
667       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
668       vec_x     = _mm256_setzero_pd();
669       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
670       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
671       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
672       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
673       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
674       aval += 4;
675 
676       vec_vals  = _mm256_loadu_pd(aval);
677       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
678       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
679       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
680       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
681       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
682       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
683       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
684       aval += 4;
685     }
686 
687     _mm256_storeu_pd(z + i * 8, vec_y);
688     _mm256_storeu_pd(z + i * 8 + 4, vec_y2);
689   }
690 #else
691   PetscCall(PetscMalloc1(sliceheight, &sum));
692   for (i = 0; i < totalslices; i++) { /* loop over slices */
693     for (j = 0; j < sliceheight; j++) {
694       sum[j] = 0.0;
695       for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]];
696     }
697     if (i == totalslices - 1 && (A->rmap->n % sliceheight)) {
698       for (j = 0; j < (A->rmap->n % sliceheight); j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j];
699     } else {
700       for (j = 0; j < sliceheight; j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j];
701     }
702   }
703   PetscCall(PetscFree(sum));
704 #endif
705 
706   PetscCall(PetscLogFlops(2.0 * a->nz));
707   PetscCall(VecRestoreArrayRead(xx, &x));
708   PetscCall(VecRestoreArrayPair(yy, zz, &y, &z));
709   PetscFunctionReturn(PETSC_SUCCESS);
710 }
711 
712 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A, Vec xx, Vec zz, Vec yy)
713 {
714   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
715   PetscScalar       *y;
716   const PetscScalar *x;
717   const MatScalar   *aval    = a->val;
718   const PetscInt    *acolidx = a->colidx;
719   PetscInt           i, j, r, row, nnz_in_row, totalslices = a->totalslices, sliceheight = a->sliceheight;
720 
721 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
722   #pragma disjoint(*x, *y, *aval)
723 #endif
724 
725   PetscFunctionBegin;
726   if (A->symmetric == PETSC_BOOL3_TRUE) {
727     PetscCall(MatMultAdd_SeqSELL(A, xx, zz, yy));
728     PetscFunctionReturn(PETSC_SUCCESS);
729   }
730   if (zz != yy) PetscCall(VecCopy(zz, yy));
731 
732   if (a->nz) {
733     PetscCall(VecGetArrayRead(xx, &x));
734     PetscCall(VecGetArray(yy, &y));
735     for (i = 0; i < a->totalslices; i++) { /* loop over slices */
736       if (i == totalslices - 1 && (A->rmap->n % sliceheight)) {
737         for (r = 0; r < (A->rmap->n % sliceheight); ++r) {
738           row        = sliceheight * i + r;
739           nnz_in_row = a->rlen[row];
740           for (j = 0; j < nnz_in_row; ++j) y[acolidx[sliceheight * j + r]] += aval[sliceheight * j + r] * x[row];
741         }
742         break;
743       }
744       for (r = 0; r < sliceheight; ++r)
745         for (j = a->sliidx[i] + r; j < a->sliidx[i + 1]; j += sliceheight) y[acolidx[j]] += aval[j] * x[sliceheight * i + r];
746     }
747     PetscCall(PetscLogFlops(2.0 * a->nz));
748     PetscCall(VecRestoreArrayRead(xx, &x));
749     PetscCall(VecRestoreArray(yy, &y));
750   }
751   PetscFunctionReturn(PETSC_SUCCESS);
752 }
753 
754 PetscErrorCode MatMultTranspose_SeqSELL(Mat A, Vec xx, Vec yy)
755 {
756   PetscFunctionBegin;
757   if (A->symmetric == PETSC_BOOL3_TRUE) {
758     PetscCall(MatMult_SeqSELL(A, xx, yy));
759   } else {
760     PetscCall(VecSet(yy, 0.0));
761     PetscCall(MatMultTransposeAdd_SeqSELL(A, xx, yy, yy));
762   }
763   PetscFunctionReturn(PETSC_SUCCESS);
764 }
765 
766 /*
767      Checks for missing diagonals
768 */
769 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A, PetscBool *missing, PetscInt *d)
770 {
771   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
772   PetscInt    *diag, i;
773 
774   PetscFunctionBegin;
775   *missing = PETSC_FALSE;
776   if (A->rmap->n > 0 && !(a->colidx)) {
777     *missing = PETSC_TRUE;
778     if (d) *d = 0;
779     PetscCall(PetscInfo(A, "Matrix has no entries therefore is missing diagonal\n"));
780   } else {
781     diag = a->diag;
782     for (i = 0; i < A->rmap->n; i++) {
783       if (diag[i] == -1) {
784         *missing = PETSC_TRUE;
785         if (d) *d = i;
786         PetscCall(PetscInfo(A, "Matrix is missing diagonal number %" PetscInt_FMT "\n", i));
787         break;
788       }
789     }
790   }
791   PetscFunctionReturn(PETSC_SUCCESS);
792 }
793 
794 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A)
795 {
796   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
797   PetscInt     i, j, m = A->rmap->n, shift;
798 
799   PetscFunctionBegin;
800   if (!a->diag) {
801     PetscCall(PetscMalloc1(m, &a->diag));
802     a->free_diag = PETSC_TRUE;
803   }
804   for (i = 0; i < m; i++) {                                          /* loop over rows */
805     shift      = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
806     a->diag[i] = -1;
807     for (j = 0; j < a->rlen[i]; j++) {
808       if (a->colidx[shift + a->sliceheight * j] == i) {
809         a->diag[i] = shift + a->sliceheight * j;
810         break;
811       }
812     }
813   }
814   PetscFunctionReturn(PETSC_SUCCESS);
815 }
816 
817 /*
818   Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways
819 */
820 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A, PetscScalar omega, PetscScalar fshift)
821 {
822   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
823   PetscInt     i, *diag, m = A->rmap->n;
824   MatScalar   *val = a->val;
825   PetscScalar *idiag, *mdiag;
826 
827   PetscFunctionBegin;
828   if (a->idiagvalid) PetscFunctionReturn(PETSC_SUCCESS);
829   PetscCall(MatMarkDiagonal_SeqSELL(A));
830   diag = a->diag;
831   if (!a->idiag) {
832     PetscCall(PetscMalloc3(m, &a->idiag, m, &a->mdiag, m, &a->ssor_work));
833     val = a->val;
834   }
835   mdiag = a->mdiag;
836   idiag = a->idiag;
837 
838   if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) {
839     for (i = 0; i < m; i++) {
840       mdiag[i] = val[diag[i]];
841       if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */
842         PetscCheck(PetscRealPart(fshift), PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Zero diagonal on row %" PetscInt_FMT, i);
843         PetscCall(PetscInfo(A, "Zero diagonal on row %" PetscInt_FMT "\n", i));
844         A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
845         A->factorerror_zeropivot_value = 0.0;
846         A->factorerror_zeropivot_row   = i;
847       }
848       idiag[i] = 1.0 / val[diag[i]];
849     }
850     PetscCall(PetscLogFlops(m));
851   } else {
852     for (i = 0; i < m; i++) {
853       mdiag[i] = val[diag[i]];
854       idiag[i] = omega / (fshift + val[diag[i]]);
855     }
856     PetscCall(PetscLogFlops(2.0 * m));
857   }
858   a->idiagvalid = PETSC_TRUE;
859   PetscFunctionReturn(PETSC_SUCCESS);
860 }
861 
862 PetscErrorCode MatZeroEntries_SeqSELL(Mat A)
863 {
864   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
865 
866   PetscFunctionBegin;
867   PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices]));
868   PetscCall(MatSeqSELLInvalidateDiagonal(A));
869   PetscFunctionReturn(PETSC_SUCCESS);
870 }
871 
872 PetscErrorCode MatDestroy_SeqSELL(Mat A)
873 {
874   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
875 
876   PetscFunctionBegin;
877   PetscCall(PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->n, A->cmap->n, a->nz));
878   PetscCall(MatSeqXSELLFreeSELL(A, &a->val, &a->colidx));
879   PetscCall(ISDestroy(&a->row));
880   PetscCall(ISDestroy(&a->col));
881   PetscCall(PetscFree(a->diag));
882   PetscCall(PetscFree(a->rlen));
883   PetscCall(PetscFree(a->sliidx));
884   PetscCall(PetscFree3(a->idiag, a->mdiag, a->ssor_work));
885   PetscCall(PetscFree(a->solve_work));
886   PetscCall(ISDestroy(&a->icol));
887   PetscCall(PetscFree(a->saved_values));
888   PetscCall(PetscFree2(a->getrowcols, a->getrowvals));
889   PetscCall(PetscFree(A->data));
890 #if defined(PETSC_HAVE_CUDA)
891   PetscCall(PetscFree(a->chunk_slice_map));
892 #endif
893 
894   PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL));
895   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL));
896   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL));
897   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetPreallocation_C", NULL));
898   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetArray_C", NULL));
899   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLRestoreArray_C", NULL));
900   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqaij_C", NULL));
901 #if defined(PETSC_HAVE_CUDA)
902   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqsellcuda_C", NULL));
903 #endif
904   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetFillRatio_C", NULL));
905   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetMaxSliceWidth_C", NULL));
906   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetAvgSliceWidth_C", NULL));
907   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetVarSliceSize_C", NULL));
908   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetSliceHeight_C", NULL));
909   PetscFunctionReturn(PETSC_SUCCESS);
910 }
911 
912 PetscErrorCode MatSetOption_SeqSELL(Mat A, MatOption op, PetscBool flg)
913 {
914   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
915 
916   PetscFunctionBegin;
917   switch (op) {
918   case MAT_ROW_ORIENTED:
919     a->roworiented = flg;
920     break;
921   case MAT_KEEP_NONZERO_PATTERN:
922     a->keepnonzeropattern = flg;
923     break;
924   case MAT_NEW_NONZERO_LOCATIONS:
925     a->nonew = (flg ? 0 : 1);
926     break;
927   case MAT_NEW_NONZERO_LOCATION_ERR:
928     a->nonew = (flg ? -1 : 0);
929     break;
930   case MAT_NEW_NONZERO_ALLOCATION_ERR:
931     a->nonew = (flg ? -2 : 0);
932     break;
933   case MAT_UNUSED_NONZERO_LOCATION_ERR:
934     a->nounused = (flg ? -1 : 0);
935     break;
936   case MAT_FORCE_DIAGONAL_ENTRIES:
937   case MAT_IGNORE_OFF_PROC_ENTRIES:
938   case MAT_USE_HASH_TABLE:
939   case MAT_SORTED_FULL:
940     PetscCall(PetscInfo(A, "Option %s ignored\n", MatOptions[op]));
941     break;
942   case MAT_SPD:
943   case MAT_SYMMETRIC:
944   case MAT_STRUCTURALLY_SYMMETRIC:
945   case MAT_HERMITIAN:
946   case MAT_SYMMETRY_ETERNAL:
947   case MAT_STRUCTURAL_SYMMETRY_ETERNAL:
948   case MAT_SPD_ETERNAL:
949     /* These options are handled directly by MatSetOption() */
950     break;
951   default:
952     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unknown option %d", op);
953   }
954   PetscFunctionReturn(PETSC_SUCCESS);
955 }
956 
957 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A, Vec v)
958 {
959   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
960   PetscInt     i, j, n, shift;
961   PetscScalar *x, zero = 0.0;
962 
963   PetscFunctionBegin;
964   PetscCall(VecGetLocalSize(v, &n));
965   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
966 
967   if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) {
968     PetscInt *diag = a->diag;
969     PetscCall(VecGetArray(v, &x));
970     for (i = 0; i < n; i++) x[i] = 1.0 / a->val[diag[i]];
971     PetscCall(VecRestoreArray(v, &x));
972     PetscFunctionReturn(PETSC_SUCCESS);
973   }
974 
975   PetscCall(VecSet(v, zero));
976   PetscCall(VecGetArray(v, &x));
977   for (i = 0; i < n; i++) {                                     /* loop over rows */
978     shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
979     x[i]  = 0;
980     for (j = 0; j < a->rlen[i]; j++) {
981       if (a->colidx[shift + a->sliceheight * j] == i) {
982         x[i] = a->val[shift + a->sliceheight * j];
983         break;
984       }
985     }
986   }
987   PetscCall(VecRestoreArray(v, &x));
988   PetscFunctionReturn(PETSC_SUCCESS);
989 }
990 
991 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A, Vec ll, Vec rr)
992 {
993   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
994   const PetscScalar *l, *r;
995   PetscInt           i, j, m, n, row;
996 
997   PetscFunctionBegin;
998   if (ll) {
999     /* The local size is used so that VecMPI can be passed to this routine
1000        by MatDiagonalScale_MPISELL */
1001     PetscCall(VecGetLocalSize(ll, &m));
1002     PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1003     PetscCall(VecGetArrayRead(ll, &l));
1004     for (i = 0; i < a->totalslices; i++) {                            /* loop over slices */
1005       if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */
1006         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) {
1007           if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= l[a->sliceheight * i + row];
1008         }
1009       } else {
1010         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) { a->val[j] *= l[a->sliceheight * i + row]; }
1011       }
1012     }
1013     PetscCall(VecRestoreArrayRead(ll, &l));
1014     PetscCall(PetscLogFlops(a->nz));
1015   }
1016   if (rr) {
1017     PetscCall(VecGetLocalSize(rr, &n));
1018     PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
1019     PetscCall(VecGetArrayRead(rr, &r));
1020     for (i = 0; i < a->totalslices; i++) {                            /* loop over slices */
1021       if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */
1022         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) % a->sliceheight)) {
1023           if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= r[a->colidx[j]];
1024         }
1025       } else {
1026         for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j++) a->val[j] *= r[a->colidx[j]];
1027       }
1028     }
1029     PetscCall(VecRestoreArrayRead(rr, &r));
1030     PetscCall(PetscLogFlops(a->nz));
1031   }
1032   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1033 #if defined(PETSC_HAVE_CUDA)
1034   if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU;
1035 #endif
1036   PetscFunctionReturn(PETSC_SUCCESS);
1037 }
1038 
1039 PetscErrorCode MatGetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[])
1040 {
1041   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1042   PetscInt    *cp, i, k, low, high, t, row, col, l;
1043   PetscInt     shift;
1044   MatScalar   *vp;
1045 
1046   PetscFunctionBegin;
1047   for (k = 0; k < m; k++) { /* loop over requested rows */
1048     row = im[k];
1049     if (row < 0) continue;
1050     PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
1051     shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight); /* starting index of the row */
1052     cp    = a->colidx + shift;                                        /* pointer to the row */
1053     vp    = a->val + shift;                                           /* pointer to the row */
1054     for (l = 0; l < n; l++) {                                         /* loop over requested columns */
1055       col = in[l];
1056       if (col < 0) continue;
1057       PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
1058       high = a->rlen[row];
1059       low  = 0; /* assume unsorted */
1060       while (high - low > 5) {
1061         t = (low + high) / 2;
1062         if (*(cp + a->sliceheight * t) > col) high = t;
1063         else low = t;
1064       }
1065       for (i = low; i < high; i++) {
1066         if (*(cp + a->sliceheight * i) > col) break;
1067         if (*(cp + a->sliceheight * i) == col) {
1068           *v++ = *(vp + a->sliceheight * i);
1069           goto finished;
1070         }
1071       }
1072       *v++ = 0.0;
1073     finished:;
1074     }
1075   }
1076   PetscFunctionReturn(PETSC_SUCCESS);
1077 }
1078 
1079 PetscErrorCode MatView_SeqSELL_ASCII(Mat A, PetscViewer viewer)
1080 {
1081   Mat_SeqSELL      *a = (Mat_SeqSELL *)A->data;
1082   PetscInt          i, j, m = A->rmap->n, shift;
1083   const char       *name;
1084   PetscViewerFormat format;
1085 
1086   PetscFunctionBegin;
1087   PetscCall(PetscViewerGetFormat(viewer, &format));
1088   if (format == PETSC_VIEWER_ASCII_MATLAB) {
1089     PetscInt nofinalvalue = 0;
1090     /*
1091     if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) {
1092       nofinalvalue = 1;
1093     }
1094     */
1095     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1096     PetscCall(PetscViewerASCIIPrintf(viewer, "%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n", m, A->cmap->n));
1097     PetscCall(PetscViewerASCIIPrintf(viewer, "%% Nonzeros = %" PetscInt_FMT " \n", a->nz));
1098 #if defined(PETSC_USE_COMPLEX)
1099     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",4);\n", a->nz + nofinalvalue));
1100 #else
1101     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",3);\n", a->nz + nofinalvalue));
1102 #endif
1103     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = [\n"));
1104 
1105     for (i = 0; i < m; i++) {
1106       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1107       for (j = 0; j < a->rlen[i]; j++) {
1108 #if defined(PETSC_USE_COMPLEX)
1109         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1110 #else
1111         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)a->val[shift + a->sliceheight * j]));
1112 #endif
1113       }
1114     }
1115     /*
1116     if (nofinalvalue) {
1117 #if defined(PETSC_USE_COMPLEX)
1118       PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n",m,A->cmap->n,0.,0.));
1119 #else
1120       PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n",m,A->cmap->n,0.0));
1121 #endif
1122     }
1123     */
1124     PetscCall(PetscObjectGetName((PetscObject)A, &name));
1125     PetscCall(PetscViewerASCIIPrintf(viewer, "];\n %s = spconvert(zzz);\n", name));
1126     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1127   } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) {
1128     PetscFunctionReturn(PETSC_SUCCESS);
1129   } else if (format == PETSC_VIEWER_ASCII_COMMON) {
1130     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1131     for (i = 0; i < m; i++) {
1132       PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1133       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1134       for (j = 0; j < a->rlen[i]; j++) {
1135 #if defined(PETSC_USE_COMPLEX)
1136         if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) {
1137           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1138         } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) {
1139           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1140         } else if (PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) {
1141           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j])));
1142         }
1143 #else
1144         if (a->val[shift + a->sliceheight * j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j]));
1145 #endif
1146       }
1147       PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1148     }
1149     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1150   } else if (format == PETSC_VIEWER_ASCII_DENSE) {
1151     PetscInt    cnt = 0, jcnt;
1152     PetscScalar value;
1153 #if defined(PETSC_USE_COMPLEX)
1154     PetscBool realonly = PETSC_TRUE;
1155     for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1156       if (PetscImaginaryPart(a->val[i]) != 0.0) {
1157         realonly = PETSC_FALSE;
1158         break;
1159       }
1160     }
1161 #endif
1162 
1163     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1164     for (i = 0; i < m; i++) {
1165       jcnt  = 0;
1166       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1167       for (j = 0; j < A->cmap->n; j++) {
1168         if (jcnt < a->rlen[i] && j == a->colidx[shift + a->sliceheight * j]) {
1169           value = a->val[cnt++];
1170           jcnt++;
1171         } else {
1172           value = 0.0;
1173         }
1174 #if defined(PETSC_USE_COMPLEX)
1175         if (realonly) {
1176           PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)PetscRealPart(value)));
1177         } else {
1178           PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e+%7.5e i ", (double)PetscRealPart(value), (double)PetscImaginaryPart(value)));
1179         }
1180 #else
1181         PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)value));
1182 #endif
1183       }
1184       PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1185     }
1186     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1187   } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) {
1188     PetscInt fshift = 1;
1189     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1190 #if defined(PETSC_USE_COMPLEX)
1191     PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate complex general\n"));
1192 #else
1193     PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate real general\n"));
1194 #endif
1195     PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz));
1196     for (i = 0; i < m; i++) {
1197       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1198       for (j = 0; j < a->rlen[i]; j++) {
1199 #if defined(PETSC_USE_COMPLEX)
1200         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1201 #else
1202         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)a->val[shift + a->sliceheight * j]));
1203 #endif
1204       }
1205     }
1206     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1207   } else if (format == PETSC_VIEWER_NATIVE) {
1208     for (i = 0; i < a->totalslices; i++) { /* loop over slices */
1209       PetscInt row;
1210       PetscCall(PetscViewerASCIIPrintf(viewer, "slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n", i, a->sliidx[i], a->sliidx[i + 1]));
1211       for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) {
1212 #if defined(PETSC_USE_COMPLEX)
1213         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1214           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1215         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1216           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), -(double)PetscImaginaryPart(a->val[j])));
1217         } else {
1218           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j])));
1219         }
1220 #else
1221         PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)a->val[j]));
1222 #endif
1223       }
1224     }
1225   } else {
1226     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1227     if (A->factortype) {
1228       for (i = 0; i < m; i++) {
1229         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1230         PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1231         /* L part */
1232         for (j = shift; j < a->diag[i]; j += a->sliceheight) {
1233 #if defined(PETSC_USE_COMPLEX)
1234           if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0) {
1235             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1236           } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0) {
1237             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j]))));
1238           } else {
1239             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j])));
1240           }
1241 #else
1242           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]));
1243 #endif
1244         }
1245         /* diagonal */
1246         j = a->diag[i];
1247 #if defined(PETSC_USE_COMPLEX)
1248         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1249           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)PetscImaginaryPart(1.0 / a->val[j])));
1250         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1251           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)(-PetscImaginaryPart(1.0 / a->val[j]))));
1252         } else {
1253           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j])));
1254         }
1255 #else
1256         PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)(1.0 / a->val[j])));
1257 #endif
1258 
1259         /* U part */
1260         for (j = a->diag[i] + 1; j < shift + a->sliceheight * a->rlen[i]; j += a->sliceheight) {
1261 #if defined(PETSC_USE_COMPLEX)
1262           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1263             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1264           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1265             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j]))));
1266           } else {
1267             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j])));
1268           }
1269 #else
1270           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]));
1271 #endif
1272         }
1273         PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1274       }
1275     } else {
1276       for (i = 0; i < m; i++) {
1277         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1278         PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1279         for (j = 0; j < a->rlen[i]; j++) {
1280 #if defined(PETSC_USE_COMPLEX)
1281           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1282             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1283           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1284             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1285           } else {
1286             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j])));
1287           }
1288 #else
1289           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j]));
1290 #endif
1291         }
1292         PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1293       }
1294     }
1295     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1296   }
1297   PetscCall(PetscViewerFlush(viewer));
1298   PetscFunctionReturn(PETSC_SUCCESS);
1299 }
1300 
1301 #include <petscdraw.h>
1302 PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw, void *Aa)
1303 {
1304   Mat               A = (Mat)Aa;
1305   Mat_SeqSELL      *a = (Mat_SeqSELL *)A->data;
1306   PetscInt          i, j, m = A->rmap->n, shift;
1307   int               color;
1308   PetscReal         xl, yl, xr, yr, x_l, x_r, y_l, y_r;
1309   PetscViewer       viewer;
1310   PetscViewerFormat format;
1311 
1312   PetscFunctionBegin;
1313   PetscCall(PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer));
1314   PetscCall(PetscViewerGetFormat(viewer, &format));
1315   PetscCall(PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr));
1316 
1317   /* loop over matrix elements drawing boxes */
1318 
1319   if (format != PETSC_VIEWER_DRAW_CONTOUR) {
1320     PetscDrawCollectiveBegin(draw);
1321     /* Blue for negative, Cyan for zero and  Red for positive */
1322     color = PETSC_DRAW_BLUE;
1323     for (i = 0; i < m; i++) {
1324       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1325       y_l   = m - i - 1.0;
1326       y_r   = y_l + 1.0;
1327       for (j = 0; j < a->rlen[i]; j++) {
1328         x_l = a->colidx[shift + a->sliceheight * j];
1329         x_r = x_l + 1.0;
1330         if (PetscRealPart(a->val[shift + a->sliceheight * j]) >= 0.) continue;
1331         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1332       }
1333     }
1334     color = PETSC_DRAW_CYAN;
1335     for (i = 0; i < m; i++) {
1336       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1337       y_l   = m - i - 1.0;
1338       y_r   = y_l + 1.0;
1339       for (j = 0; j < a->rlen[i]; j++) {
1340         x_l = a->colidx[shift + a->sliceheight * j];
1341         x_r = x_l + 1.0;
1342         if (a->val[shift + a->sliceheight * j] != 0.) continue;
1343         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1344       }
1345     }
1346     color = PETSC_DRAW_RED;
1347     for (i = 0; i < m; i++) {
1348       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1349       y_l   = m - i - 1.0;
1350       y_r   = y_l + 1.0;
1351       for (j = 0; j < a->rlen[i]; j++) {
1352         x_l = a->colidx[shift + a->sliceheight * j];
1353         x_r = x_l + 1.0;
1354         if (PetscRealPart(a->val[shift + a->sliceheight * j]) <= 0.) continue;
1355         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1356       }
1357     }
1358     PetscDrawCollectiveEnd(draw);
1359   } else {
1360     /* use contour shading to indicate magnitude of values */
1361     /* first determine max of all nonzero values */
1362     PetscReal minv = 0.0, maxv = 0.0;
1363     PetscInt  count = 0;
1364     PetscDraw popup;
1365     for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1366       if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]);
1367     }
1368     if (minv >= maxv) maxv = minv + PETSC_SMALL;
1369     PetscCall(PetscDrawGetPopup(draw, &popup));
1370     PetscCall(PetscDrawScalePopup(popup, minv, maxv));
1371 
1372     PetscDrawCollectiveBegin(draw);
1373     for (i = 0; i < m; i++) {
1374       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1375       y_l   = m - i - 1.0;
1376       y_r   = y_l + 1.0;
1377       for (j = 0; j < a->rlen[i]; j++) {
1378         x_l   = a->colidx[shift + a->sliceheight * j];
1379         x_r   = x_l + 1.0;
1380         color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]), minv, maxv);
1381         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1382         count++;
1383       }
1384     }
1385     PetscDrawCollectiveEnd(draw);
1386   }
1387   PetscFunctionReturn(PETSC_SUCCESS);
1388 }
1389 
1390 #include <petscdraw.h>
1391 PetscErrorCode MatView_SeqSELL_Draw(Mat A, PetscViewer viewer)
1392 {
1393   PetscDraw draw;
1394   PetscReal xr, yr, xl, yl, h, w;
1395   PetscBool isnull;
1396 
1397   PetscFunctionBegin;
1398   PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw));
1399   PetscCall(PetscDrawIsNull(draw, &isnull));
1400   if (isnull) PetscFunctionReturn(PETSC_SUCCESS);
1401 
1402   xr = A->cmap->n;
1403   yr = A->rmap->n;
1404   h  = yr / 10.0;
1405   w  = xr / 10.0;
1406   xr += w;
1407   yr += h;
1408   xl = -w;
1409   yl = -h;
1410   PetscCall(PetscDrawSetCoordinates(draw, xl, yl, xr, yr));
1411   PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer));
1412   PetscCall(PetscDrawZoom(draw, MatView_SeqSELL_Draw_Zoom, A));
1413   PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL));
1414   PetscCall(PetscDrawSave(draw));
1415   PetscFunctionReturn(PETSC_SUCCESS);
1416 }
1417 
1418 PetscErrorCode MatView_SeqSELL(Mat A, PetscViewer viewer)
1419 {
1420   PetscBool iascii, isbinary, isdraw;
1421 
1422   PetscFunctionBegin;
1423   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
1424   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
1425   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
1426   if (iascii) {
1427     PetscCall(MatView_SeqSELL_ASCII(A, viewer));
1428   } else if (isbinary) {
1429     /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */
1430   } else if (isdraw) PetscCall(MatView_SeqSELL_Draw(A, viewer));
1431   PetscFunctionReturn(PETSC_SUCCESS);
1432 }
1433 
1434 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A, MatAssemblyType mode)
1435 {
1436   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1437   PetscInt     i, shift, row_in_slice, row, nrow, *cp, lastcol, j, k;
1438   MatScalar   *vp;
1439 #if defined(PETSC_HAVE_CUDA)
1440   PetscInt totalchunks = 0;
1441 #endif
1442 
1443   PetscFunctionBegin;
1444   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
1445   /* To do: compress out the unused elements */
1446   PetscCall(MatMarkDiagonal_SeqSELL(A));
1447   PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n", A->rmap->n, A->cmap->n, a->maxallocmat, a->sliidx[a->totalslices], a->nz, a->sliidx[a->totalslices] - a->nz));
1448   PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n", a->reallocs));
1449   PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rlenmax));
1450   a->nonzerorowcnt = 0;
1451   /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */
1452   for (i = 0; i < a->totalslices; ++i) {
1453     shift = a->sliidx[i];                                                   /* starting index of the slice */
1454     cp    = a->colidx + shift;                                              /* pointer to the column indices of the slice */
1455     vp    = a->val + shift;                                                 /* pointer to the nonzero values of the slice */
1456     for (row_in_slice = 0; row_in_slice < a->sliceheight; ++row_in_slice) { /* loop over rows in the slice */
1457       row  = a->sliceheight * i + row_in_slice;
1458       nrow = a->rlen[row]; /* number of nonzeros in row */
1459       /*
1460         Search for the nearest nonzero. Normally setting the index to zero may cause extra communication.
1461         But if the entire slice are empty, it is fine to use 0 since the index will not be loaded.
1462       */
1463       lastcol = 0;
1464       if (nrow > 0) { /* nonempty row */
1465         a->nonzerorowcnt++;
1466         lastcol = cp[a->sliceheight * (nrow - 1) + row_in_slice]; /* use the index from the last nonzero at current row */
1467       } else if (!row_in_slice) {                                 /* first row of the correct slice is empty */
1468         for (j = 1; j < a->sliceheight; j++) {
1469           if (a->rlen[a->sliceheight * i + j]) {
1470             lastcol = cp[j];
1471             break;
1472           }
1473         }
1474       } else {
1475         if (a->sliidx[i + 1] != shift) lastcol = cp[row_in_slice - 1]; /* use the index from the previous row */
1476       }
1477 
1478       for (k = nrow; k < (a->sliidx[i + 1] - shift) / a->sliceheight; ++k) {
1479         cp[a->sliceheight * k + row_in_slice] = lastcol;
1480         vp[a->sliceheight * k + row_in_slice] = (MatScalar)0;
1481       }
1482     }
1483   }
1484 
1485   A->info.mallocs += a->reallocs;
1486   a->reallocs = 0;
1487 
1488   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1489 #if defined(PETSC_HAVE_CUDA)
1490   if (!a->chunksize && a->totalslices) {
1491     a->chunksize = 64;
1492     while (a->chunksize < 1024 && 2 * a->chunksize <= a->sliidx[a->totalslices] / a->totalslices) a->chunksize *= 2;
1493     totalchunks = 1 + (a->sliidx[a->totalslices] - 1) / a->chunksize;
1494   }
1495   if (totalchunks != a->totalchunks) {
1496     PetscCall(PetscFree(a->chunk_slice_map));
1497     PetscCall(PetscMalloc1(totalchunks, &a->chunk_slice_map));
1498     a->totalchunks = totalchunks;
1499   }
1500   j = 0;
1501   for (i = 0; i < totalchunks; i++) {
1502     while (a->sliidx[j + 1] <= i * a->chunksize && j < a->totalslices) j++;
1503     a->chunk_slice_map[i] = j;
1504   }
1505 #endif
1506   PetscFunctionReturn(PETSC_SUCCESS);
1507 }
1508 
1509 PetscErrorCode MatGetInfo_SeqSELL(Mat A, MatInfoType flag, MatInfo *info)
1510 {
1511   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1512 
1513   PetscFunctionBegin;
1514   info->block_size   = 1.0;
1515   info->nz_allocated = a->maxallocmat;
1516   info->nz_used      = a->sliidx[a->totalslices]; /* include padding zeros */
1517   info->nz_unneeded  = (a->maxallocmat - a->sliidx[a->totalslices]);
1518   info->assemblies   = A->num_ass;
1519   info->mallocs      = A->info.mallocs;
1520   info->memory       = 0; /* REVIEW ME */
1521   if (A->factortype) {
1522     info->fill_ratio_given  = A->info.fill_ratio_given;
1523     info->fill_ratio_needed = A->info.fill_ratio_needed;
1524     info->factor_mallocs    = A->info.factor_mallocs;
1525   } else {
1526     info->fill_ratio_given  = 0;
1527     info->fill_ratio_needed = 0;
1528     info->factor_mallocs    = 0;
1529   }
1530   PetscFunctionReturn(PETSC_SUCCESS);
1531 }
1532 
1533 PetscErrorCode MatSetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is)
1534 {
1535   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1536   PetscInt     shift, i, k, l, low, high, t, ii, row, col, nrow;
1537   PetscInt    *cp, nonew = a->nonew, lastcol = -1;
1538   MatScalar   *vp, value;
1539 #if defined(PETSC_HAVE_CUDA)
1540   PetscBool inserted = PETSC_FALSE;
1541   PetscInt  mul      = DEVICE_MEM_ALIGN / a->sliceheight;
1542 #endif
1543 
1544   PetscFunctionBegin;
1545   for (k = 0; k < m; k++) { /* loop over added rows */
1546     row = im[k];
1547     if (row < 0) continue;
1548     PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
1549     shift = a->sliidx[row / a->sliceheight] + row % a->sliceheight; /* starting index of the row */
1550     cp    = a->colidx + shift;                                      /* pointer to the row */
1551     vp    = a->val + shift;                                         /* pointer to the row */
1552     nrow  = a->rlen[row];
1553     low   = 0;
1554     high  = nrow;
1555 
1556     for (l = 0; l < n; l++) { /* loop over added columns */
1557       col = in[l];
1558       if (col < 0) continue;
1559       PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
1560       if (a->roworiented) {
1561         value = v[l + k * n];
1562       } else {
1563         value = v[k + l * m];
1564       }
1565       if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue;
1566 
1567       /* search in this row for the specified column, i indicates the column to be set */
1568       if (col <= lastcol) low = 0;
1569       else high = nrow;
1570       lastcol = col;
1571       while (high - low > 5) {
1572         t = (low + high) / 2;
1573         if (*(cp + a->sliceheight * t) > col) high = t;
1574         else low = t;
1575       }
1576       for (i = low; i < high; i++) {
1577         if (*(cp + a->sliceheight * i) > col) break;
1578         if (*(cp + a->sliceheight * i) == col) {
1579           if (is == ADD_VALUES) *(vp + a->sliceheight * i) += value;
1580           else *(vp + a->sliceheight * i) = value;
1581 #if defined(PETSC_HAVE_CUDA)
1582           inserted = PETSC_TRUE;
1583 #endif
1584           low = i + 1;
1585           goto noinsert;
1586         }
1587       }
1588       if (value == 0.0 && a->ignorezeroentries) goto noinsert;
1589       if (nonew == 1) goto noinsert;
1590       PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col);
1591 #if defined(PETSC_HAVE_CUDA)
1592       MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar, mul);
1593 #else
1594       /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */
1595       MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar, 1);
1596 #endif
1597       /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */
1598       for (ii = nrow - 1; ii >= i; ii--) {
1599         *(cp + a->sliceheight * (ii + 1)) = *(cp + a->sliceheight * ii);
1600         *(vp + a->sliceheight * (ii + 1)) = *(vp + a->sliceheight * ii);
1601       }
1602       a->rlen[row]++;
1603       *(cp + a->sliceheight * i) = col;
1604       *(vp + a->sliceheight * i) = value;
1605       a->nz++;
1606       A->nonzerostate++;
1607 #if defined(PETSC_HAVE_CUDA)
1608       inserted = PETSC_TRUE;
1609 #endif
1610       low = i + 1;
1611       high++;
1612       nrow++;
1613     noinsert:;
1614     }
1615     a->rlen[row] = nrow;
1616   }
1617 #if defined(PETSC_HAVE_CUDA)
1618   if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && inserted) A->offloadmask = PETSC_OFFLOAD_CPU;
1619 #endif
1620   PetscFunctionReturn(PETSC_SUCCESS);
1621 }
1622 
1623 PetscErrorCode MatCopy_SeqSELL(Mat A, Mat B, MatStructure str)
1624 {
1625   PetscFunctionBegin;
1626   /* If the two matrices have the same copy implementation, use fast copy. */
1627   if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
1628     Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1629     Mat_SeqSELL *b = (Mat_SeqSELL *)B->data;
1630 
1631     PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices], PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Number of nonzeros in two matrices are different");
1632     PetscCall(PetscArraycpy(b->val, a->val, a->sliidx[a->totalslices]));
1633   } else {
1634     PetscCall(MatCopy_Basic(A, B, str));
1635   }
1636   PetscFunctionReturn(PETSC_SUCCESS);
1637 }
1638 
1639 PetscErrorCode MatSetUp_SeqSELL(Mat A)
1640 {
1641   PetscFunctionBegin;
1642   PetscCall(MatSeqSELLSetPreallocation(A, PETSC_DEFAULT, NULL));
1643   PetscFunctionReturn(PETSC_SUCCESS);
1644 }
1645 
1646 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A, PetscScalar *array[])
1647 {
1648   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1649 
1650   PetscFunctionBegin;
1651   *array = a->val;
1652   PetscFunctionReturn(PETSC_SUCCESS);
1653 }
1654 
1655 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A, PetscScalar *array[])
1656 {
1657   PetscFunctionBegin;
1658   PetscFunctionReturn(PETSC_SUCCESS);
1659 }
1660 
1661 PetscErrorCode MatRealPart_SeqSELL(Mat A)
1662 {
1663   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1664   PetscInt     i;
1665   MatScalar   *aval = a->val;
1666 
1667   PetscFunctionBegin;
1668   for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscRealPart(aval[i]);
1669 #if defined(PETSC_HAVE_CUDA)
1670   if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU;
1671 #endif
1672   PetscFunctionReturn(PETSC_SUCCESS);
1673 }
1674 
1675 PetscErrorCode MatImaginaryPart_SeqSELL(Mat A)
1676 {
1677   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1678   PetscInt     i;
1679   MatScalar   *aval = a->val;
1680 
1681   PetscFunctionBegin;
1682   for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]);
1683   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1684 #if defined(PETSC_HAVE_CUDA)
1685   if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU;
1686 #endif
1687   PetscFunctionReturn(PETSC_SUCCESS);
1688 }
1689 
1690 PetscErrorCode MatScale_SeqSELL(Mat inA, PetscScalar alpha)
1691 {
1692   Mat_SeqSELL *a      = (Mat_SeqSELL *)inA->data;
1693   MatScalar   *aval   = a->val;
1694   PetscScalar  oalpha = alpha;
1695   PetscBLASInt one    = 1, size;
1696 
1697   PetscFunctionBegin;
1698   PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices], &size));
1699   PetscCallBLAS("BLASscal", BLASscal_(&size, &oalpha, aval, &one));
1700   PetscCall(PetscLogFlops(a->nz));
1701   PetscCall(MatSeqSELLInvalidateDiagonal(inA));
1702 #if defined(PETSC_HAVE_CUDA)
1703   if (inA->offloadmask != PETSC_OFFLOAD_UNALLOCATED) inA->offloadmask = PETSC_OFFLOAD_CPU;
1704 #endif
1705   PetscFunctionReturn(PETSC_SUCCESS);
1706 }
1707 
1708 PetscErrorCode MatShift_SeqSELL(Mat Y, PetscScalar a)
1709 {
1710   Mat_SeqSELL *y = (Mat_SeqSELL *)Y->data;
1711 
1712   PetscFunctionBegin;
1713   if (!Y->preallocated || !y->nz) PetscCall(MatSeqSELLSetPreallocation(Y, 1, NULL));
1714   PetscCall(MatShift_Basic(Y, a));
1715   PetscFunctionReturn(PETSC_SUCCESS);
1716 }
1717 
1718 PetscErrorCode MatSOR_SeqSELL(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
1719 {
1720   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
1721   PetscScalar       *x, sum, *t;
1722   const MatScalar   *idiag = NULL, *mdiag;
1723   const PetscScalar *b, *xb;
1724   PetscInt           n, m = A->rmap->n, i, j, shift;
1725   const PetscInt    *diag;
1726 
1727   PetscFunctionBegin;
1728   its = its * lits;
1729 
1730   if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */
1731   if (!a->idiagvalid) PetscCall(MatInvertDiagonal_SeqSELL(A, omega, fshift));
1732   a->fshift = fshift;
1733   a->omega  = omega;
1734 
1735   diag  = a->diag;
1736   t     = a->ssor_work;
1737   idiag = a->idiag;
1738   mdiag = a->mdiag;
1739 
1740   PetscCall(VecGetArray(xx, &x));
1741   PetscCall(VecGetArrayRead(bb, &b));
1742   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1743   PetscCheck(flag != SOR_APPLY_UPPER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_UPPER is not implemented");
1744   PetscCheck(flag != SOR_APPLY_LOWER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_LOWER is not implemented");
1745   PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat");
1746 
1747   if (flag & SOR_ZERO_INITIAL_GUESS) {
1748     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1749       for (i = 0; i < m; i++) {
1750         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1751         sum   = b[i];
1752         n     = (diag[i] - shift) / a->sliceheight;
1753         for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]];
1754         t[i] = sum;
1755         x[i] = sum * idiag[i];
1756       }
1757       xb = t;
1758       PetscCall(PetscLogFlops(a->nz));
1759     } else xb = b;
1760     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1761       for (i = m - 1; i >= 0; i--) {
1762         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1763         sum   = xb[i];
1764         n     = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1;
1765         for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]];
1766         if (xb == b) {
1767           x[i] = sum * idiag[i];
1768         } else {
1769           x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1770         }
1771       }
1772       PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1773     }
1774     its--;
1775   }
1776   while (its--) {
1777     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1778       for (i = 0; i < m; i++) {
1779         /* lower */
1780         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1781         sum   = b[i];
1782         n     = (diag[i] - shift) / a->sliceheight;
1783         for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]];
1784         t[i] = sum; /* save application of the lower-triangular part */
1785         /* upper */
1786         n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1;
1787         for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]];
1788         x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1789       }
1790       xb = t;
1791       PetscCall(PetscLogFlops(2.0 * a->nz));
1792     } else xb = b;
1793     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1794       for (i = m - 1; i >= 0; i--) {
1795         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1796         sum   = xb[i];
1797         if (xb == b) {
1798           /* whole matrix (no checkpointing available) */
1799           n = a->rlen[i];
1800           for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]];
1801           x[i] = (1. - omega) * x[i] + (sum + mdiag[i] * x[i]) * idiag[i];
1802         } else { /* lower-triangular part has been saved, so only apply upper-triangular */
1803           n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1;
1804           for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]];
1805           x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1806         }
1807       }
1808       if (xb == b) {
1809         PetscCall(PetscLogFlops(2.0 * a->nz));
1810       } else {
1811         PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1812       }
1813     }
1814   }
1815   PetscCall(VecRestoreArray(xx, &x));
1816   PetscCall(VecRestoreArrayRead(bb, &b));
1817   PetscFunctionReturn(PETSC_SUCCESS);
1818 }
1819 
1820 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL,
1821                                        MatGetRow_SeqSELL,
1822                                        MatRestoreRow_SeqSELL,
1823                                        MatMult_SeqSELL,
1824                                        /* 4*/ MatMultAdd_SeqSELL,
1825                                        MatMultTranspose_SeqSELL,
1826                                        MatMultTransposeAdd_SeqSELL,
1827                                        NULL,
1828                                        NULL,
1829                                        NULL,
1830                                        /* 10*/ NULL,
1831                                        NULL,
1832                                        NULL,
1833                                        MatSOR_SeqSELL,
1834                                        NULL,
1835                                        /* 15*/ MatGetInfo_SeqSELL,
1836                                        MatEqual_SeqSELL,
1837                                        MatGetDiagonal_SeqSELL,
1838                                        MatDiagonalScale_SeqSELL,
1839                                        NULL,
1840                                        /* 20*/ NULL,
1841                                        MatAssemblyEnd_SeqSELL,
1842                                        MatSetOption_SeqSELL,
1843                                        MatZeroEntries_SeqSELL,
1844                                        /* 24*/ NULL,
1845                                        NULL,
1846                                        NULL,
1847                                        NULL,
1848                                        NULL,
1849                                        /* 29*/ MatSetUp_SeqSELL,
1850                                        NULL,
1851                                        NULL,
1852                                        NULL,
1853                                        NULL,
1854                                        /* 34*/ MatDuplicate_SeqSELL,
1855                                        NULL,
1856                                        NULL,
1857                                        NULL,
1858                                        NULL,
1859                                        /* 39*/ NULL,
1860                                        NULL,
1861                                        NULL,
1862                                        MatGetValues_SeqSELL,
1863                                        MatCopy_SeqSELL,
1864                                        /* 44*/ NULL,
1865                                        MatScale_SeqSELL,
1866                                        MatShift_SeqSELL,
1867                                        NULL,
1868                                        NULL,
1869                                        /* 49*/ NULL,
1870                                        NULL,
1871                                        NULL,
1872                                        NULL,
1873                                        NULL,
1874                                        /* 54*/ MatFDColoringCreate_SeqXAIJ,
1875                                        NULL,
1876                                        NULL,
1877                                        NULL,
1878                                        NULL,
1879                                        /* 59*/ NULL,
1880                                        MatDestroy_SeqSELL,
1881                                        MatView_SeqSELL,
1882                                        NULL,
1883                                        NULL,
1884                                        /* 64*/ NULL,
1885                                        NULL,
1886                                        NULL,
1887                                        NULL,
1888                                        NULL,
1889                                        /* 69*/ NULL,
1890                                        NULL,
1891                                        NULL,
1892                                        NULL,
1893                                        NULL,
1894                                        /* 74*/ NULL,
1895                                        MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */
1896                                        NULL,
1897                                        NULL,
1898                                        NULL,
1899                                        /* 79*/ NULL,
1900                                        NULL,
1901                                        NULL,
1902                                        NULL,
1903                                        NULL,
1904                                        /* 84*/ NULL,
1905                                        NULL,
1906                                        NULL,
1907                                        NULL,
1908                                        NULL,
1909                                        /* 89*/ NULL,
1910                                        NULL,
1911                                        NULL,
1912                                        NULL,
1913                                        NULL,
1914                                        /* 94*/ NULL,
1915                                        NULL,
1916                                        NULL,
1917                                        NULL,
1918                                        NULL,
1919                                        /* 99*/ NULL,
1920                                        NULL,
1921                                        NULL,
1922                                        MatConjugate_SeqSELL,
1923                                        NULL,
1924                                        /*104*/ NULL,
1925                                        NULL,
1926                                        NULL,
1927                                        NULL,
1928                                        NULL,
1929                                        /*109*/ NULL,
1930                                        NULL,
1931                                        NULL,
1932                                        NULL,
1933                                        MatMissingDiagonal_SeqSELL,
1934                                        /*114*/ NULL,
1935                                        NULL,
1936                                        NULL,
1937                                        NULL,
1938                                        NULL,
1939                                        /*119*/ NULL,
1940                                        NULL,
1941                                        NULL,
1942                                        NULL,
1943                                        NULL,
1944                                        /*124*/ NULL,
1945                                        NULL,
1946                                        NULL,
1947                                        NULL,
1948                                        NULL,
1949                                        /*129*/ NULL,
1950                                        NULL,
1951                                        NULL,
1952                                        NULL,
1953                                        NULL,
1954                                        /*134*/ NULL,
1955                                        NULL,
1956                                        NULL,
1957                                        NULL,
1958                                        NULL,
1959                                        /*139*/ NULL,
1960                                        NULL,
1961                                        NULL,
1962                                        MatFDColoringSetUp_SeqXAIJ,
1963                                        NULL,
1964                                        /*144*/ NULL,
1965                                        NULL,
1966                                        NULL,
1967                                        NULL,
1968                                        NULL,
1969                                        NULL,
1970                                        /*150*/ NULL,
1971                                        NULL};
1972 
1973 PetscErrorCode MatStoreValues_SeqSELL(Mat mat)
1974 {
1975   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1976 
1977   PetscFunctionBegin;
1978   PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1979 
1980   /* allocate space for values if not already there */
1981   if (!a->saved_values) PetscCall(PetscMalloc1(a->sliidx[a->totalslices] + 1, &a->saved_values));
1982 
1983   /* copy values over */
1984   PetscCall(PetscArraycpy(a->saved_values, a->val, a->sliidx[a->totalslices]));
1985   PetscFunctionReturn(PETSC_SUCCESS);
1986 }
1987 
1988 PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat)
1989 {
1990   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1991 
1992   PetscFunctionBegin;
1993   PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1994   PetscCheck(a->saved_values, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatStoreValues(A);first");
1995   PetscCall(PetscArraycpy(a->val, a->saved_values, a->sliidx[a->totalslices]));
1996   PetscFunctionReturn(PETSC_SUCCESS);
1997 }
1998 
1999 PetscErrorCode MatSeqSELLGetFillRatio_SeqSELL(Mat mat, PetscReal *ratio)
2000 {
2001   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
2002 
2003   PetscFunctionBegin;
2004   if (a->totalslices && a->sliidx[a->totalslices]) {
2005     *ratio = (PetscReal)(a->sliidx[a->totalslices] - a->nz) / a->sliidx[a->totalslices];
2006   } else {
2007     *ratio = 0.0;
2008   }
2009   PetscFunctionReturn(PETSC_SUCCESS);
2010 }
2011 
2012 PetscErrorCode MatSeqSELLGetMaxSliceWidth_SeqSELL(Mat mat, PetscInt *slicewidth)
2013 {
2014   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
2015   PetscInt     i, current_slicewidth;
2016 
2017   PetscFunctionBegin;
2018   *slicewidth = 0;
2019   for (i = 0; i < a->totalslices; i++) {
2020     current_slicewidth = (a->sliidx[i + 1] - a->sliidx[i]) / a->sliceheight;
2021     if (current_slicewidth > *slicewidth) *slicewidth = current_slicewidth;
2022   }
2023   PetscFunctionReturn(PETSC_SUCCESS);
2024 }
2025 
2026 PetscErrorCode MatSeqSELLGetAvgSliceWidth_SeqSELL(Mat mat, PetscReal *slicewidth)
2027 {
2028   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
2029 
2030   PetscFunctionBegin;
2031   *slicewidth = 0;
2032   if (a->totalslices) { *slicewidth = (PetscReal)a->sliidx[a->totalslices] / a->sliceheight / a->totalslices; }
2033   PetscFunctionReturn(PETSC_SUCCESS);
2034 }
2035 
2036 PetscErrorCode MatSeqSELLGetVarSliceSize_SeqSELL(Mat mat, PetscReal *variance)
2037 {
2038   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
2039   PetscReal    mean;
2040   PetscInt     i, totalslices = a->totalslices, *sliidx = a->sliidx;
2041 
2042   PetscFunctionBegin;
2043   *variance = 0;
2044   if (totalslices) {
2045     mean = (PetscReal)sliidx[totalslices] / totalslices;
2046     for (i = 1; i <= totalslices; i++) { *variance += ((PetscReal)(sliidx[i] - sliidx[i - 1]) - mean) * ((PetscReal)(sliidx[i] - sliidx[i - 1]) - mean) / totalslices; }
2047   }
2048   PetscFunctionReturn(PETSC_SUCCESS);
2049 }
2050 
2051 PetscErrorCode MatSeqSELLSetSliceHeight_SeqSELL(Mat A, PetscInt sliceheight)
2052 {
2053   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2054 
2055   PetscFunctionBegin;
2056   if (A->preallocated) PetscFunctionReturn(PETSC_SUCCESS);
2057   PetscCheck(a->sliceheight <= 0 || a->sliceheight == sliceheight, PETSC_COMM_SELF, PETSC_ERR_SUP, "Cannot change slice height %" PetscInt_FMT " to %" PetscInt_FMT, a->sliceheight, sliceheight);
2058   a->sliceheight = sliceheight;
2059 #if defined(PETSC_HAVE_CUDA)
2060   PetscCheck(DEVICE_MEM_ALIGN % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "DEVICE_MEM_ALIGN is not divisible by the slice height %" PetscInt_FMT, sliceheight);
2061 #endif
2062   PetscFunctionReturn(PETSC_SUCCESS);
2063 }
2064 
2065 /*@C
2066   MatSeqSELLRestoreArray - returns access to the array where the data for a `MATSEQSELL` matrix is stored obtained by `MatSeqSELLGetArray()`
2067 
2068   Not Collective
2069 
2070   Input Parameters:
2071 + A     - a `MATSEQSELL` matrix
2072 - array - pointer to the data
2073 
2074   Level: intermediate
2075 
2076 .seealso: `Mat`, `MATSEQSELL`, `MatSeqSELLGetArray()`, `MatSeqSELLRestoreArrayF90()`
2077 @*/
2078 PetscErrorCode MatSeqSELLRestoreArray(Mat A, PetscScalar **array)
2079 {
2080   PetscFunctionBegin;
2081   PetscUseMethod(A, "MatSeqSELLRestoreArray_C", (Mat, PetscScalar **), (A, array));
2082   PetscFunctionReturn(PETSC_SUCCESS);
2083 }
2084 
2085 /*@C
2086   MatSeqSELLGetFillRatio - returns a ratio that indicates the irregularity of the matrix.
2087 
2088   Not Collective
2089 
2090   Input Parameter:
2091 . A - a MATSEQSELL matrix
2092 
2093   Output Parameter:
2094 . ratio - ratio of number of padded zeros to number of allocated elements
2095 
2096   Level: intermediate
2097 
2098 .seealso: `MATSEQSELL`, `MatSeqSELLGetAvgSliceWidth()`
2099 @*/
2100 PetscErrorCode MatSeqSELLGetFillRatio(Mat A, PetscReal *ratio)
2101 {
2102   PetscFunctionBegin;
2103   PetscUseMethod(A, "MatSeqSELLGetFillRatio_C", (Mat, PetscReal *), (A, ratio));
2104   PetscFunctionReturn(PETSC_SUCCESS);
2105 }
2106 
2107 /*@C
2108   MatSeqSELLGetMaxSliceWidth - returns the maximum slice width.
2109 
2110   Not Collective
2111 
2112   Input Parameter:
2113 . A - a MATSEQSELL matrix
2114 
2115   Output Parameter:
2116 . slicewidth - maximum slice width
2117 
2118   Level: intermediate
2119 
2120 .seealso: `MATSEQSELL`, `MatSeqSELLGetAvgSliceWidth()`
2121 @*/
2122 PetscErrorCode MatSeqSELLGetMaxSliceWidth(Mat A, PetscInt *slicewidth)
2123 {
2124   PetscFunctionBegin;
2125   PetscUseMethod(A, "MatSeqSELLGetMaxSliceWidth_C", (Mat, PetscInt *), (A, slicewidth));
2126   PetscFunctionReturn(PETSC_SUCCESS);
2127 }
2128 
2129 /*@C
2130   MatSeqSELLGetAvgSliceWidth - returns the average slice width.
2131 
2132   Not Collective
2133 
2134   Input Parameter:
2135 . A - a MATSEQSELL matrix
2136 
2137   Output Parameter:
2138 . slicewidth - average slice width
2139 
2140   Level: intermediate
2141 
2142 .seealso: `MATSEQSELL`, `MatSeqSELLGetMaxSliceWidth()`
2143 @*/
2144 PetscErrorCode MatSeqSELLGetAvgSliceWidth(Mat A, PetscReal *slicewidth)
2145 {
2146   PetscFunctionBegin;
2147   PetscUseMethod(A, "MatSeqSELLGetAvgSliceWidth_C", (Mat, PetscReal *), (A, slicewidth));
2148   PetscFunctionReturn(PETSC_SUCCESS);
2149 }
2150 
2151 /*@C
2152   MatSeqSELLSetSliceHeight - sets the slice height.
2153 
2154   Not Collective
2155 
2156   Input Parameters:
2157 + A           - a MATSEQSELL matrix
2158 - sliceheight - slice height
2159 
2160   Notes:
2161   You cannot change the slice height once it have been set.
2162 
2163   The slice height must be set before MatSetUp() or MatXXXSetPreallocation() is called.
2164 
2165   Level: intermediate
2166 
2167 .seealso: `MATSEQSELL`, `MatSeqSELLGetVarSliceSize()`
2168 @*/
2169 PetscErrorCode MatSeqSELLSetSliceHeight(Mat A, PetscInt sliceheight)
2170 {
2171   PetscFunctionBegin;
2172   PetscUseMethod(A, "MatSeqSELLSetSliceHeight_C", (Mat, PetscInt), (A, sliceheight));
2173   PetscFunctionReturn(PETSC_SUCCESS);
2174 }
2175 
2176 /*@C
2177   MatSeqSELLGetVarSliceSize - returns the variance of the slice size.
2178 
2179   Not Collective
2180 
2181   Input Parameter:
2182 . A - a MATSEQSELL matrix
2183 
2184   Output Parameter:
2185 . variance - variance of the slice size
2186 
2187   Level: intermediate
2188 
2189 .seealso: `MATSEQSELL`, `MatSeqSELLSetSliceHeight()`
2190 @*/
2191 PetscErrorCode MatSeqSELLGetVarSliceSize(Mat A, PetscReal *variance)
2192 {
2193   PetscFunctionBegin;
2194   PetscUseMethod(A, "MatSeqSELLGetVarSliceSize_C", (Mat, PetscReal *), (A, variance));
2195   PetscFunctionReturn(PETSC_SUCCESS);
2196 }
2197 
2198 #if defined(PETSC_HAVE_CUDA)
2199 PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat);
2200 #endif
2201 
2202 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B)
2203 {
2204   Mat_SeqSELL *b;
2205   PetscMPIInt  size;
2206 
2207   PetscFunctionBegin;
2208   PetscCall(PetscCitationsRegister(citation, &cited));
2209   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
2210   PetscCheck(size <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Comm must be of size 1");
2211 
2212   PetscCall(PetscNew(&b));
2213 
2214   B->data   = (void *)b;
2215   B->ops[0] = MatOps_Values;
2216 
2217   b->row                = NULL;
2218   b->col                = NULL;
2219   b->icol               = NULL;
2220   b->reallocs           = 0;
2221   b->ignorezeroentries  = PETSC_FALSE;
2222   b->roworiented        = PETSC_TRUE;
2223   b->nonew              = 0;
2224   b->diag               = NULL;
2225   b->solve_work         = NULL;
2226   B->spptr              = NULL;
2227   b->saved_values       = NULL;
2228   b->idiag              = NULL;
2229   b->mdiag              = NULL;
2230   b->ssor_work          = NULL;
2231   b->omega              = 1.0;
2232   b->fshift             = 0.0;
2233   b->idiagvalid         = PETSC_FALSE;
2234   b->keepnonzeropattern = PETSC_FALSE;
2235   b->sliceheight        = 0;
2236 
2237   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELL));
2238   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetArray_C", MatSeqSELLGetArray_SeqSELL));
2239   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLRestoreArray_C", MatSeqSELLRestoreArray_SeqSELL));
2240   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqSELL));
2241   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqSELL));
2242   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetPreallocation_C", MatSeqSELLSetPreallocation_SeqSELL));
2243   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqaij_C", MatConvert_SeqSELL_SeqAIJ));
2244 #if defined(PETSC_HAVE_CUDA)
2245   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqsellcuda_C", MatConvert_SeqSELL_SeqSELLCUDA));
2246 #endif
2247   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetFillRatio_C", MatSeqSELLGetFillRatio_SeqSELL));
2248   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetMaxSliceWidth_C", MatSeqSELLGetMaxSliceWidth_SeqSELL));
2249   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetAvgSliceWidth_C", MatSeqSELLGetAvgSliceWidth_SeqSELL));
2250   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetVarSliceSize_C", MatSeqSELLGetVarSliceSize_SeqSELL));
2251   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetSliceHeight_C", MatSeqSELLSetSliceHeight_SeqSELL));
2252 
2253   PetscObjectOptionsBegin((PetscObject)B);
2254   {
2255     PetscInt  newsh = -1;
2256     PetscBool flg;
2257 #if defined(PETSC_HAVE_CUDA)
2258     PetscInt chunksize = 0;
2259 #endif
2260 
2261     PetscCall(PetscOptionsInt("-mat_sell_slice_height", "Set the slice height used to store SELL matrix", "MatSELLSetSliceHeight", newsh, &newsh, &flg));
2262     if (flg) { PetscCall(MatSeqSELLSetSliceHeight(B, newsh)); }
2263 #if defined(PETSC_HAVE_CUDA)
2264     PetscCall(PetscOptionsInt("-mat_sell_chunk_size", "Set the chunksize for load-balanced CUDA kernels. Choices include 64,128,256,512,1024", NULL, chunksize, &chunksize, &flg));
2265     if (flg) {
2266       PetscCheck(chunksize >= 64 && chunksize <= 1024 && chunksize % 64 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "chunksize must be a number in {64,128,256,512,1024}: value %" PetscInt_FMT, chunksize);
2267       b->chunksize = chunksize;
2268     }
2269 #endif
2270   }
2271   PetscOptionsEnd();
2272   PetscFunctionReturn(PETSC_SUCCESS);
2273 }
2274 
2275 /*
2276  Given a matrix generated with MatGetFactor() duplicates all the information in A into B
2277  */
2278 PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace)
2279 {
2280   Mat_SeqSELL *c = (Mat_SeqSELL *)C->data, *a = (Mat_SeqSELL *)A->data;
2281   PetscInt     i, m                           = A->rmap->n;
2282   PetscInt     totalslices = a->totalslices;
2283 
2284   PetscFunctionBegin;
2285   C->factortype = A->factortype;
2286   c->row        = NULL;
2287   c->col        = NULL;
2288   c->icol       = NULL;
2289   c->reallocs   = 0;
2290   C->assembled  = PETSC_TRUE;
2291 
2292   PetscCall(PetscLayoutReference(A->rmap, &C->rmap));
2293   PetscCall(PetscLayoutReference(A->cmap, &C->cmap));
2294 
2295   PetscCall(PetscMalloc1(a->sliceheight * totalslices, &c->rlen));
2296   PetscCall(PetscMalloc1(totalslices + 1, &c->sliidx));
2297 
2298   for (i = 0; i < m; i++) c->rlen[i] = a->rlen[i];
2299   for (i = 0; i < totalslices + 1; i++) c->sliidx[i] = a->sliidx[i];
2300 
2301   /* allocate the matrix space */
2302   if (mallocmatspace) {
2303     PetscCall(PetscMalloc2(a->maxallocmat, &c->val, a->maxallocmat, &c->colidx));
2304 
2305     c->singlemalloc = PETSC_TRUE;
2306 
2307     if (m > 0) {
2308       PetscCall(PetscArraycpy(c->colidx, a->colidx, a->maxallocmat));
2309       if (cpvalues == MAT_COPY_VALUES) {
2310         PetscCall(PetscArraycpy(c->val, a->val, a->maxallocmat));
2311       } else {
2312         PetscCall(PetscArrayzero(c->val, a->maxallocmat));
2313       }
2314     }
2315   }
2316 
2317   c->ignorezeroentries = a->ignorezeroentries;
2318   c->roworiented       = a->roworiented;
2319   c->nonew             = a->nonew;
2320   if (a->diag) {
2321     PetscCall(PetscMalloc1(m, &c->diag));
2322     for (i = 0; i < m; i++) c->diag[i] = a->diag[i];
2323   } else c->diag = NULL;
2324 
2325   c->solve_work         = NULL;
2326   c->saved_values       = NULL;
2327   c->idiag              = NULL;
2328   c->ssor_work          = NULL;
2329   c->keepnonzeropattern = a->keepnonzeropattern;
2330   c->free_val           = PETSC_TRUE;
2331   c->free_colidx        = PETSC_TRUE;
2332 
2333   c->maxallocmat  = a->maxallocmat;
2334   c->maxallocrow  = a->maxallocrow;
2335   c->rlenmax      = a->rlenmax;
2336   c->nz           = a->nz;
2337   C->preallocated = PETSC_TRUE;
2338 
2339   c->nonzerorowcnt = a->nonzerorowcnt;
2340   C->nonzerostate  = A->nonzerostate;
2341 
2342   PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist));
2343   PetscFunctionReturn(PETSC_SUCCESS);
2344 }
2345 
2346 PetscErrorCode MatDuplicate_SeqSELL(Mat A, MatDuplicateOption cpvalues, Mat *B)
2347 {
2348   PetscFunctionBegin;
2349   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
2350   PetscCall(MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n));
2351   if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2352   PetscCall(MatSetType(*B, ((PetscObject)A)->type_name));
2353   PetscCall(MatDuplicateNoCreate_SeqSELL(*B, A, cpvalues, PETSC_TRUE));
2354   PetscFunctionReturn(PETSC_SUCCESS);
2355 }
2356 
2357 /*MC
2358    MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices,
2359    based on the sliced Ellpack format
2360 
2361    Options Database Key:
2362 . -mat_type seqsell - sets the matrix type to "`MATSEQELL` during a call to `MatSetFromOptions()`
2363 
2364    Level: beginner
2365 
2366 .seealso: `Mat`, `MatCreateSeqSell()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ`
2367 M*/
2368 
2369 /*MC
2370    MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices.
2371 
2372    This matrix type is identical to `MATSEQSELL` when constructed with a single process communicator,
2373    and `MATMPISELL` otherwise.  As a result, for single process communicators,
2374   `MatSeqSELLSetPreallocation()` is supported, and similarly `MatMPISELLSetPreallocation()` is supported
2375   for communicators controlling multiple processes.  It is recommended that you call both of
2376   the above preallocation routines for simplicity.
2377 
2378    Options Database Key:
2379 . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions()
2380 
2381   Level: beginner
2382 
2383   Notes:
2384    This format is only supported for real scalars, double precision, and 32-bit indices (the defaults).
2385 
2386    It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of
2387    non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement.
2388 
2389   Developer Notes:
2390    On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance.
2391 
2392    The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8
2393 .vb
2394                             (2 0  3 4)
2395    Consider the matrix A =  (5 0  6 0)
2396                             (0 0  7 8)
2397                             (0 0  9 9)
2398 
2399    symbolically the Ellpack format can be written as
2400 
2401         (2 3 4 |)           (0 2 3 |)
2402    v =  (5 6 0 |)  colidx = (0 2 2 |)
2403         --------            ---------
2404         (7 8 |)             (2 3 |)
2405         (9 9 |)             (2 3 |)
2406 
2407     The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case).
2408     Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and
2409     zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice.
2410 
2411     The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9)  and for colidx is (0 0 2 2 3 2 2 2 3 3)
2412 
2413 .ve
2414 
2415       See MatMult_SeqSELL() for how this format is used with the SIMD operations to achieve high performance.
2416 
2417  References:
2418 . * - Hong Zhang, Richard T. Mills, Karl Rupp, and Barry F. Smith, Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512},
2419    Proceedings of the 47th International Conference on Parallel Processing, 2018.
2420 
2421 .seealso: `Mat`, `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSell()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ`
2422 M*/
2423 
2424 /*@C
2425   MatCreateSeqSELL - Creates a sparse matrix in `MATSEQSELL` format.
2426 
2427   Collective
2428 
2429   Input Parameters:
2430 + comm    - MPI communicator, set to `PETSC_COMM_SELF`
2431 . m       - number of rows
2432 . n       - number of columns
2433 . rlenmax - maximum number of nonzeros in a row, ignored if `rlen` is provided
2434 - rlen    - array containing the number of nonzeros in the various rows (possibly different for each row) or NULL
2435 
2436   Output Parameter:
2437 . A - the matrix
2438 
2439   Level: intermediate
2440 
2441   Notes:
2442   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
2443   MatXXXXSetPreallocation() paradigm instead of this routine directly.
2444   [MatXXXXSetPreallocation() is, for example, `MatSeqSELLSetPreallocation()`]
2445 
2446   Specify the preallocated storage with either `rlenmax` or `rlen` (not both).
2447   Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory
2448   allocation.
2449 
2450 .seealso: `Mat`, `MATSEQSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATMPISELL`
2451  @*/
2452 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt rlenmax, const PetscInt rlen[], Mat *A)
2453 {
2454   PetscFunctionBegin;
2455   PetscCall(MatCreate(comm, A));
2456   PetscCall(MatSetSizes(*A, m, n, m, n));
2457   PetscCall(MatSetType(*A, MATSEQSELL));
2458   PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A, rlenmax, rlen));
2459   PetscFunctionReturn(PETSC_SUCCESS);
2460 }
2461 
2462 PetscErrorCode MatEqual_SeqSELL(Mat A, Mat B, PetscBool *flg)
2463 {
2464   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data, *b = (Mat_SeqSELL *)B->data;
2465   PetscInt     totalslices = a->totalslices;
2466 
2467   PetscFunctionBegin;
2468   /* If the  matrix dimensions are not equal,or no of nonzeros */
2469   if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) || (a->nz != b->nz) || (a->rlenmax != b->rlenmax)) {
2470     *flg = PETSC_FALSE;
2471     PetscFunctionReturn(PETSC_SUCCESS);
2472   }
2473   /* if the a->colidx are the same */
2474   PetscCall(PetscArraycmp(a->colidx, b->colidx, a->sliidx[totalslices], flg));
2475   if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
2476   /* if a->val are the same */
2477   PetscCall(PetscArraycmp(a->val, b->val, a->sliidx[totalslices], flg));
2478   PetscFunctionReturn(PETSC_SUCCESS);
2479 }
2480 
2481 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A)
2482 {
2483   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2484 
2485   PetscFunctionBegin;
2486   a->idiagvalid = PETSC_FALSE;
2487   PetscFunctionReturn(PETSC_SUCCESS);
2488 }
2489 
2490 PetscErrorCode MatConjugate_SeqSELL(Mat A)
2491 {
2492 #if defined(PETSC_USE_COMPLEX)
2493   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2494   PetscInt     i;
2495   PetscScalar *val = a->val;
2496 
2497   PetscFunctionBegin;
2498   for (i = 0; i < a->sliidx[a->totalslices]; i++) { val[i] = PetscConj(val[i]); }
2499   #if defined(PETSC_HAVE_CUDA)
2500   if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU;
2501   #endif
2502 #else
2503   PetscFunctionBegin;
2504 #endif
2505   PetscFunctionReturn(PETSC_SUCCESS);
2506 }
2507