xref: /petsc/src/mat/impls/sell/seq/sell.c (revision bcee047adeeb73090d7e36cc71e39fc287cdbb97)
1 
2 /*
3   Defines the basic matrix operations for the SELL matrix storage format.
4 */
5 #include <../src/mat/impls/sell/seq/sell.h> /*I   "petscmat.h"  I*/
6 #include <petscblaslapack.h>
7 #include <petsc/private/kernels/blocktranspose.h>
8 
9 static PetscBool  cited      = PETSC_FALSE;
10 static const char citation[] = "@inproceedings{ZhangELLPACK2018,\n"
11                                " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n"
12                                " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n"
13                                " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n"
14                                " year = 2018\n"
15                                "}\n";
16 
17 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
18 
19   #include <immintrin.h>
20 
21   #if !defined(_MM_SCALE_8)
22     #define _MM_SCALE_8 8
23   #endif
24 
25   #if defined(__AVX512F__)
26     /* these do not work
27    vec_idx  = _mm512_loadunpackhi_epi32(vec_idx,acolidx);
28    vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval);
29   */
30     #define AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
31       /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \
32       vec_idx  = _mm256_loadu_si256((__m256i const *)acolidx); \
33       vec_vals = _mm512_loadu_pd(aval); \
34       vec_x    = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); \
35       vec_y    = _mm512_fmadd_pd(vec_x, vec_vals, vec_y)
36   #elif defined(__AVX2__) && defined(__FMA__)
37     #define AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
38       vec_vals = _mm256_loadu_pd(aval); \
39       vec_idx  = _mm_loadu_si128((__m128i const *)acolidx); /* SSE2 */ \
40       vec_x    = _mm256_i32gather_pd(x, vec_idx, _MM_SCALE_8); \
41       vec_y    = _mm256_fmadd_pd(vec_x, vec_vals, vec_y)
42   #endif
43 #endif /* PETSC_HAVE_IMMINTRIN_H */
44 
45 /*@C
46  MatSeqSELLSetPreallocation - For good matrix assembly performance
47  the user should preallocate the matrix storage by setting the parameter `nz`
48  (or the array `nnz`).
49 
50  Collective
51 
52  Input Parameters:
53 +  B - The `MATSEQSELL` matrix
54 .  rlenmax - number of nonzeros per row (same for all rows), ignored if `rlen` is provided
55 -  rlen - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
56 
57  Level: intermediate
58 
59  Notes:
60  Specify the preallocated storage with either `rlenmax` or `rlen` (not both).
61  Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory
62  allocation.
63 
64  You can call `MatGetInfo()` to get information on how effective the preallocation was;
65  for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
66  You can also run with the option `-info` and look for messages with the string
67  malloc in them to see if additional memory allocation was needed.
68 
69  Developer Note:
70  Use `rlenmax` of `MAT_SKIP_ALLOCATION` to not allocate any space for the matrix
71  entries or columns indices.
72 
73  The maximum number of nonzeos in any row should be as accurate as possible.
74  If it is underestimated, you will get bad performance due to reallocation
75  (`MatSeqXSELLReallocateSELL()`).
76 
77 .seealso: `Mat`, `MATSEQSELL`, `MATSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()`
78  @*/
79 PetscErrorCode MatSeqSELLSetPreallocation(Mat B, PetscInt rlenmax, const PetscInt rlen[])
80 {
81   PetscFunctionBegin;
82   PetscValidHeaderSpecific(B, MAT_CLASSID, 1);
83   PetscValidType(B, 1);
84   PetscTryMethod(B, "MatSeqSELLSetPreallocation_C", (Mat, PetscInt, const PetscInt[]), (B, rlenmax, rlen));
85   PetscFunctionReturn(PETSC_SUCCESS);
86 }
87 
88 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B, PetscInt maxallocrow, const PetscInt rlen[])
89 {
90   Mat_SeqSELL *b;
91   PetscInt     i, j, totalslices;
92 #if defined(PETSC_HAVE_CUDA)
93   PetscInt rlenmax = 0;
94 #endif
95   PetscBool skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE;
96 
97   PetscFunctionBegin;
98   if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE;
99   if (maxallocrow == MAT_SKIP_ALLOCATION) {
100     skipallocation = PETSC_TRUE;
101     maxallocrow    = 0;
102   }
103 
104   PetscCall(PetscLayoutSetUp(B->rmap));
105   PetscCall(PetscLayoutSetUp(B->cmap));
106 
107   /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */
108   if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5;
109   PetscCheck(maxallocrow >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "maxallocrow cannot be less than 0: value %" PetscInt_FMT, maxallocrow);
110   if (rlen) {
111     for (i = 0; i < B->rmap->n; i++) {
112       PetscCheck(rlen[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, rlen[i]);
113       PetscCheck(rlen[i] <= B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT, i, rlen[i], B->cmap->n);
114     }
115   }
116 
117   B->preallocated = PETSC_TRUE;
118 
119   b = (Mat_SeqSELL *)B->data;
120 
121   if (!b->sliceheight) { /* not set yet */
122 #if defined(PETSC_HAVE_CUDA)
123     b->sliceheight = 16;
124 #else
125     b->sliceheight = 8;
126 #endif
127   }
128   totalslices    = PetscCeilInt(B->rmap->n, b->sliceheight);
129   b->totalslices = totalslices;
130   if (!skipallocation) {
131     if (B->rmap->n % b->sliceheight) PetscCall(PetscInfo(B, "Padding rows to the SEQSELL matrix because the number of rows is not the multiple of the slice height (value %" PetscInt_FMT ")\n", B->rmap->n));
132 
133     if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */
134       PetscCall(PetscMalloc1(totalslices + 1, &b->sliidx));
135     }
136     if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */
137       if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10;
138       else if (maxallocrow < 0) maxallocrow = 1;
139 #if defined(PETSC_HAVE_CUDA)
140       rlenmax = maxallocrow;
141       /* Pad the slice to DEVICE_MEM_ALIGN */
142       while (b->sliceheight * maxallocrow % DEVICE_MEM_ALIGN) maxallocrow++;
143 #endif
144       for (i = 0; i <= totalslices; i++) b->sliidx[i] = b->sliceheight * i * maxallocrow;
145     } else {
146 #if defined(PETSC_HAVE_CUDA)
147       PetscInt mul = DEVICE_MEM_ALIGN / b->sliceheight;
148 #endif
149       maxallocrow  = 0;
150       b->sliidx[0] = 0;
151       for (i = 1; i < totalslices; i++) {
152         b->sliidx[i] = 0;
153         for (j = 0; j < b->sliceheight; j++) { b->sliidx[i] = PetscMax(b->sliidx[i], rlen[b->sliceheight * (i - 1) + j]); }
154 #if defined(PETSC_HAVE_CUDA)
155         rlenmax = PetscMax(b->sliidx[i], rlenmax);
156         /* Pad the slice to DEVICE_MEM_ALIGN */
157         b->sliidx[i] = ((b->sliidx[i] - 1) / mul + 1) * mul;
158 #endif
159         maxallocrow = PetscMax(b->sliidx[i], maxallocrow);
160         PetscCall(PetscIntSumError(b->sliidx[i - 1], b->sliceheight * b->sliidx[i], &b->sliidx[i]));
161       }
162       /* last slice */
163       b->sliidx[totalslices] = 0;
164       for (j = b->sliceheight * (totalslices - 1); j < B->rmap->n; j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices], rlen[j]);
165 #if defined(PETSC_HAVE_CUDA)
166       rlenmax                = PetscMax(b->sliidx[i], rlenmax);
167       b->sliidx[totalslices] = ((b->sliidx[totalslices] - 1) / mul + 1) * mul;
168 #endif
169       maxallocrow            = PetscMax(b->sliidx[totalslices], maxallocrow);
170       b->sliidx[totalslices] = b->sliidx[totalslices - 1] + b->sliceheight * b->sliidx[totalslices];
171     }
172 
173     /* allocate space for val, colidx, rlen */
174     /* FIXME: should B's old memory be unlogged? */
175     PetscCall(MatSeqXSELLFreeSELL(B, &b->val, &b->colidx));
176     /* FIXME: assuming an element of the bit array takes 8 bits */
177     PetscCall(PetscMalloc2(b->sliidx[totalslices], &b->val, b->sliidx[totalslices], &b->colidx));
178     /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */
179     PetscCall(PetscCalloc1(b->sliceheight * totalslices, &b->rlen));
180 
181     b->singlemalloc = PETSC_TRUE;
182     b->free_val     = PETSC_TRUE;
183     b->free_colidx  = PETSC_TRUE;
184   } else {
185     b->free_val    = PETSC_FALSE;
186     b->free_colidx = PETSC_FALSE;
187   }
188 
189   b->nz          = 0;
190   b->maxallocrow = maxallocrow;
191 #if defined(PETSC_HAVE_CUDA)
192   b->rlenmax = rlenmax;
193 #else
194   b->rlenmax = maxallocrow;
195 #endif
196   b->maxallocmat      = b->sliidx[totalslices];
197   B->info.nz_unneeded = (double)b->maxallocmat;
198   if (realalloc) PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));
199   PetscFunctionReturn(PETSC_SUCCESS);
200 }
201 
202 PetscErrorCode MatGetRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
203 {
204   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
205   PetscInt     shift;
206 
207   PetscFunctionBegin;
208   PetscCheck(row >= 0 && row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range", row);
209   if (nz) *nz = a->rlen[row];
210   shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight);
211   if (!a->getrowcols) { PetscCall(PetscMalloc2(a->rlenmax, &a->getrowcols, a->rlenmax, &a->getrowvals)); }
212   if (idx) {
213     PetscInt j;
214     for (j = 0; j < a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift + a->sliceheight * j];
215     *idx = a->getrowcols;
216   }
217   if (v) {
218     PetscInt j;
219     for (j = 0; j < a->rlen[row]; j++) a->getrowvals[j] = a->val[shift + a->sliceheight * j];
220     *v = a->getrowvals;
221   }
222   PetscFunctionReturn(PETSC_SUCCESS);
223 }
224 
225 PetscErrorCode MatRestoreRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
226 {
227   PetscFunctionBegin;
228   PetscFunctionReturn(PETSC_SUCCESS);
229 }
230 
231 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
232 {
233   Mat          B;
234   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
235   PetscInt     i;
236 
237   PetscFunctionBegin;
238   if (reuse == MAT_REUSE_MATRIX) {
239     B = *newmat;
240     PetscCall(MatZeroEntries(B));
241   } else {
242     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
243     PetscCall(MatSetSizes(B, A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N));
244     PetscCall(MatSetType(B, MATSEQAIJ));
245     PetscCall(MatSeqAIJSetPreallocation(B, 0, a->rlen));
246   }
247 
248   for (i = 0; i < A->rmap->n; i++) {
249     PetscInt     nz = 0, *cols = NULL;
250     PetscScalar *vals = NULL;
251 
252     PetscCall(MatGetRow_SeqSELL(A, i, &nz, &cols, &vals));
253     PetscCall(MatSetValues(B, 1, &i, nz, cols, vals, INSERT_VALUES));
254     PetscCall(MatRestoreRow_SeqSELL(A, i, &nz, &cols, &vals));
255   }
256 
257   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
258   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
259   B->rmap->bs = A->rmap->bs;
260 
261   if (reuse == MAT_INPLACE_MATRIX) {
262     PetscCall(MatHeaderReplace(A, &B));
263   } else {
264     *newmat = B;
265   }
266   PetscFunctionReturn(PETSC_SUCCESS);
267 }
268 
269 #include <../src/mat/impls/aij/seq/aij.h>
270 
271 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
272 {
273   Mat                B;
274   Mat_SeqAIJ        *a  = (Mat_SeqAIJ *)A->data;
275   PetscInt          *ai = a->i, m = A->rmap->N, n = A->cmap->N, i, *rowlengths, row, ncols;
276   const PetscInt    *cols;
277   const PetscScalar *vals;
278 
279   PetscFunctionBegin;
280 
281   if (reuse == MAT_REUSE_MATRIX) {
282     B = *newmat;
283   } else {
284     if (PetscDefined(USE_DEBUG) || !a->ilen) {
285       PetscCall(PetscMalloc1(m, &rowlengths));
286       for (i = 0; i < m; i++) rowlengths[i] = ai[i + 1] - ai[i];
287     }
288     if (PetscDefined(USE_DEBUG) && a->ilen) {
289       PetscBool eq;
290       PetscCall(PetscMemcmp(rowlengths, a->ilen, m * sizeof(PetscInt), &eq));
291       PetscCheck(eq, PETSC_COMM_SELF, PETSC_ERR_PLIB, "SeqAIJ ilen array incorrect");
292       PetscCall(PetscFree(rowlengths));
293       rowlengths = a->ilen;
294     } else if (a->ilen) rowlengths = a->ilen;
295     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
296     PetscCall(MatSetSizes(B, m, n, m, n));
297     PetscCall(MatSetType(B, MATSEQSELL));
298     PetscCall(MatSeqSELLSetPreallocation(B, 0, rowlengths));
299     if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths));
300   }
301 
302   for (row = 0; row < m; row++) {
303     PetscCall(MatGetRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
304     PetscCall(MatSetValues_SeqSELL(B, 1, &row, ncols, cols, vals, INSERT_VALUES));
305     PetscCall(MatRestoreRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
306   }
307   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
308   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
309   B->rmap->bs = A->rmap->bs;
310 
311   if (reuse == MAT_INPLACE_MATRIX) {
312     PetscCall(MatHeaderReplace(A, &B));
313   } else {
314     *newmat = B;
315   }
316   PetscFunctionReturn(PETSC_SUCCESS);
317 }
318 
319 PetscErrorCode MatMult_SeqSELL(Mat A, Vec xx, Vec yy)
320 {
321   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
322   PetscScalar       *y;
323   const PetscScalar *x;
324   const MatScalar   *aval        = a->val;
325   PetscInt           totalslices = a->totalslices;
326   const PetscInt    *acolidx     = a->colidx;
327   PetscInt           i, j;
328 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
329   __m512d  vec_x, vec_y, vec_vals;
330   __m256i  vec_idx;
331   __mmask8 mask;
332   __m512d  vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
333   __m256i  vec_idx2, vec_idx3, vec_idx4;
334 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
335   __m128i   vec_idx;
336   __m256d   vec_x, vec_y, vec_y2, vec_vals;
337   MatScalar yval;
338   PetscInt  r, rows_left, row, nnz_in_row;
339 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
340   __m128d   vec_x_tmp;
341   __m256d   vec_x, vec_y, vec_y2, vec_vals;
342   MatScalar yval;
343   PetscInt  r, rows_left, row, nnz_in_row;
344 #else
345   PetscInt     k, sliceheight = a->sliceheight;
346   PetscScalar *sum;
347 #endif
348 
349 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
350   #pragma disjoint(*x, *y, *aval)
351 #endif
352 
353   PetscFunctionBegin;
354   PetscCall(VecGetArrayRead(xx, &x));
355   PetscCall(VecGetArray(yy, &y));
356 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
357   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
358   for (i = 0; i < totalslices; i++) { /* loop over slices */
359     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
360     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
361 
362     vec_y  = _mm512_setzero_pd();
363     vec_y2 = _mm512_setzero_pd();
364     vec_y3 = _mm512_setzero_pd();
365     vec_y4 = _mm512_setzero_pd();
366 
367     j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */
368     switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
369     case 3:
370       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
371       acolidx += 8;
372       aval += 8;
373       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
374       acolidx += 8;
375       aval += 8;
376       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
377       acolidx += 8;
378       aval += 8;
379       j += 3;
380       break;
381     case 2:
382       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
383       acolidx += 8;
384       aval += 8;
385       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
386       acolidx += 8;
387       aval += 8;
388       j += 2;
389       break;
390     case 1:
391       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
392       acolidx += 8;
393       aval += 8;
394       j += 1;
395       break;
396     }
397   #pragma novector
398     for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
399       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
400       acolidx += 8;
401       aval += 8;
402       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
403       acolidx += 8;
404       aval += 8;
405       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
406       acolidx += 8;
407       aval += 8;
408       AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
409       acolidx += 8;
410       aval += 8;
411     }
412 
413     vec_y = _mm512_add_pd(vec_y, vec_y2);
414     vec_y = _mm512_add_pd(vec_y, vec_y3);
415     vec_y = _mm512_add_pd(vec_y, vec_y4);
416     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
417       mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
418       _mm512_mask_storeu_pd(&y[8 * i], mask, vec_y);
419     } else {
420       _mm512_storeu_pd(&y[8 * i], vec_y);
421     }
422   }
423 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
424   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
425   for (i = 0; i < totalslices; i++) { /* loop over full slices */
426     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
427     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
428 
429     /* last slice may have padding rows. Don't use vectorization. */
430     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
431       rows_left = A->rmap->n - 8 * i;
432       for (r = 0; r < rows_left; ++r) {
433         yval       = (MatScalar)0;
434         row        = 8 * i + r;
435         nnz_in_row = a->rlen[row];
436         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
437         y[row] = yval;
438       }
439       break;
440     }
441 
442     vec_y  = _mm256_setzero_pd();
443     vec_y2 = _mm256_setzero_pd();
444 
445   /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
446   #pragma novector
447   #pragma unroll(2)
448     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
449       AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
450       aval += 4;
451       acolidx += 4;
452       AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y2);
453       aval += 4;
454       acolidx += 4;
455     }
456 
457     _mm256_storeu_pd(y + i * 8, vec_y);
458     _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
459   }
460 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
461   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
462   for (i = 0; i < totalslices; i++) { /* loop over full slices */
463     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
464     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
465 
466     vec_y  = _mm256_setzero_pd();
467     vec_y2 = _mm256_setzero_pd();
468 
469     /* last slice may have padding rows. Don't use vectorization. */
470     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
471       rows_left = A->rmap->n - 8 * i;
472       for (r = 0; r < rows_left; ++r) {
473         yval       = (MatScalar)0;
474         row        = 8 * i + r;
475         nnz_in_row = a->rlen[row];
476         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
477         y[row] = yval;
478       }
479       break;
480     }
481 
482   /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
483   #pragma novector
484   #pragma unroll(2)
485     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
486       vec_vals  = _mm256_loadu_pd(aval);
487       vec_x_tmp = _mm_setzero_pd();
488       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
489       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
490       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
491       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
492       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
493       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
494       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
495       aval += 4;
496 
497       vec_vals  = _mm256_loadu_pd(aval);
498       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
499       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
500       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
501       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
502       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
503       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
504       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
505       aval += 4;
506     }
507 
508     _mm256_storeu_pd(y + i * 8, vec_y);
509     _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
510   }
511 #else
512   PetscCall(PetscMalloc1(sliceheight, &sum));
513   for (i = 0; i < totalslices; i++) { /* loop over slices */
514     for (j = 0; j < sliceheight; j++) {
515       sum[j] = 0.0;
516       for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]];
517     }
518     if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { /* if last slice has padding rows */
519       for (j = 0; j < (A->rmap->n % sliceheight); j++) y[sliceheight * i + j] = sum[j];
520     } else {
521       for (j = 0; j < sliceheight; j++) y[sliceheight * i + j] = sum[j];
522     }
523   }
524   PetscCall(PetscFree(sum));
525 #endif
526 
527   PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); /* theoretical minimal FLOPs */
528   PetscCall(VecRestoreArrayRead(xx, &x));
529   PetscCall(VecRestoreArray(yy, &y));
530   PetscFunctionReturn(PETSC_SUCCESS);
531 }
532 
533 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h>
534 PetscErrorCode MatMultAdd_SeqSELL(Mat A, Vec xx, Vec yy, Vec zz)
535 {
536   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
537   PetscScalar       *y, *z;
538   const PetscScalar *x;
539   const MatScalar   *aval        = a->val;
540   PetscInt           totalslices = a->totalslices;
541   const PetscInt    *acolidx     = a->colidx;
542   PetscInt           i, j;
543 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
544   __m512d  vec_x, vec_y, vec_vals;
545   __m256i  vec_idx;
546   __mmask8 mask = 0;
547   __m512d  vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
548   __m256i  vec_idx2, vec_idx3, vec_idx4;
549 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
550   __m128d   vec_x_tmp;
551   __m256d   vec_x, vec_y, vec_y2, vec_vals;
552   MatScalar yval;
553   PetscInt  r, row, nnz_in_row;
554 #else
555   PetscInt     k, sliceheight = a->sliceheight;
556   PetscScalar *sum;
557 #endif
558 
559 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
560   #pragma disjoint(*x, *y, *aval)
561 #endif
562 
563   PetscFunctionBegin;
564   if (!a->nz) {
565     PetscCall(VecCopy(yy, zz));
566     PetscFunctionReturn(PETSC_SUCCESS);
567   }
568   PetscCall(VecGetArrayRead(xx, &x));
569   PetscCall(VecGetArrayPair(yy, zz, &y, &z));
570 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
571   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
572   for (i = 0; i < totalslices; i++) { /* loop over slices */
573     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
574     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
575 
576     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
577       mask  = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
578       vec_y = _mm512_mask_loadu_pd(vec_y, mask, &y[8 * i]);
579     } else {
580       vec_y = _mm512_loadu_pd(&y[8 * i]);
581     }
582     vec_y2 = _mm512_setzero_pd();
583     vec_y3 = _mm512_setzero_pd();
584     vec_y4 = _mm512_setzero_pd();
585 
586     j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */
587     switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
588     case 3:
589       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
590       acolidx += 8;
591       aval += 8;
592       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
593       acolidx += 8;
594       aval += 8;
595       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
596       acolidx += 8;
597       aval += 8;
598       j += 3;
599       break;
600     case 2:
601       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
602       acolidx += 8;
603       aval += 8;
604       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
605       acolidx += 8;
606       aval += 8;
607       j += 2;
608       break;
609     case 1:
610       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
611       acolidx += 8;
612       aval += 8;
613       j += 1;
614       break;
615     }
616   #pragma novector
617     for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
618       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
619       acolidx += 8;
620       aval += 8;
621       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
622       acolidx += 8;
623       aval += 8;
624       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
625       acolidx += 8;
626       aval += 8;
627       AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
628       acolidx += 8;
629       aval += 8;
630     }
631 
632     vec_y = _mm512_add_pd(vec_y, vec_y2);
633     vec_y = _mm512_add_pd(vec_y, vec_y3);
634     vec_y = _mm512_add_pd(vec_y, vec_y4);
635     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
636       _mm512_mask_storeu_pd(&z[8 * i], mask, vec_y);
637     } else {
638       _mm512_storeu_pd(&z[8 * i], vec_y);
639     }
640   }
641 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
642   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
643   for (i = 0; i < totalslices; i++) { /* loop over full slices */
644     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
645     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
646 
647     /* last slice may have padding rows. Don't use vectorization. */
648     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
649       for (r = 0; r < (A->rmap->n & 0x07); ++r) {
650         row        = 8 * i + r;
651         yval       = (MatScalar)0.0;
652         nnz_in_row = a->rlen[row];
653         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
654         z[row] = y[row] + yval;
655       }
656       break;
657     }
658 
659     vec_y  = _mm256_loadu_pd(y + 8 * i);
660     vec_y2 = _mm256_loadu_pd(y + 8 * i + 4);
661 
662     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
663     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
664       vec_vals  = _mm256_loadu_pd(aval);
665       vec_x_tmp = _mm_setzero_pd();
666       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
667       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
668       vec_x     = _mm256_setzero_pd();
669       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
670       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
671       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
672       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
673       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
674       aval += 4;
675 
676       vec_vals  = _mm256_loadu_pd(aval);
677       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
678       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
679       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
680       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
681       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
682       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
683       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
684       aval += 4;
685     }
686 
687     _mm256_storeu_pd(z + i * 8, vec_y);
688     _mm256_storeu_pd(z + i * 8 + 4, vec_y2);
689   }
690 #else
691   PetscCall(PetscMalloc1(sliceheight, &sum));
692   for (i = 0; i < totalslices; i++) { /* loop over slices */
693     for (j = 0; j < sliceheight; j++) {
694       sum[j] = 0.0;
695       for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]];
696     }
697     if (i == totalslices - 1 && (A->rmap->n % sliceheight)) {
698       for (j = 0; j < (A->rmap->n % sliceheight); j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j];
699     } else {
700       for (j = 0; j < sliceheight; j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j];
701     }
702   }
703   PetscCall(PetscFree(sum));
704 #endif
705 
706   PetscCall(PetscLogFlops(2.0 * a->nz));
707   PetscCall(VecRestoreArrayRead(xx, &x));
708   PetscCall(VecRestoreArrayPair(yy, zz, &y, &z));
709   PetscFunctionReturn(PETSC_SUCCESS);
710 }
711 
712 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A, Vec xx, Vec zz, Vec yy)
713 {
714   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
715   PetscScalar       *y;
716   const PetscScalar *x;
717   const MatScalar   *aval    = a->val;
718   const PetscInt    *acolidx = a->colidx;
719   PetscInt           i, j, r, row, nnz_in_row, totalslices = a->totalslices, sliceheight = a->sliceheight;
720 
721 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
722   #pragma disjoint(*x, *y, *aval)
723 #endif
724 
725   PetscFunctionBegin;
726   if (A->symmetric == PETSC_BOOL3_TRUE) {
727     PetscCall(MatMultAdd_SeqSELL(A, xx, zz, yy));
728     PetscFunctionReturn(PETSC_SUCCESS);
729   }
730   if (zz != yy) PetscCall(VecCopy(zz, yy));
731 
732   if (a->nz) {
733     PetscCall(VecGetArrayRead(xx, &x));
734     PetscCall(VecGetArray(yy, &y));
735     for (i = 0; i < a->totalslices; i++) { /* loop over slices */
736       if (i == totalslices - 1 && (A->rmap->n % sliceheight)) {
737         for (r = 0; r < (A->rmap->n % sliceheight); ++r) {
738           row        = sliceheight * i + r;
739           nnz_in_row = a->rlen[row];
740           for (j = 0; j < nnz_in_row; ++j) y[acolidx[sliceheight * j + r]] += aval[sliceheight * j + r] * x[row];
741         }
742         break;
743       }
744       for (r = 0; r < sliceheight; ++r)
745         for (j = a->sliidx[i] + r; j < a->sliidx[i + 1]; j += sliceheight) y[acolidx[j]] += aval[j] * x[sliceheight * i + r];
746     }
747     PetscCall(PetscLogFlops(2.0 * a->nz));
748     PetscCall(VecRestoreArrayRead(xx, &x));
749     PetscCall(VecRestoreArray(yy, &y));
750   }
751   PetscFunctionReturn(PETSC_SUCCESS);
752 }
753 
754 PetscErrorCode MatMultTranspose_SeqSELL(Mat A, Vec xx, Vec yy)
755 {
756   PetscFunctionBegin;
757   if (A->symmetric == PETSC_BOOL3_TRUE) {
758     PetscCall(MatMult_SeqSELL(A, xx, yy));
759   } else {
760     PetscCall(VecSet(yy, 0.0));
761     PetscCall(MatMultTransposeAdd_SeqSELL(A, xx, yy, yy));
762   }
763   PetscFunctionReturn(PETSC_SUCCESS);
764 }
765 
766 /*
767      Checks for missing diagonals
768 */
769 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A, PetscBool *missing, PetscInt *d)
770 {
771   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
772   PetscInt    *diag, i;
773 
774   PetscFunctionBegin;
775   *missing = PETSC_FALSE;
776   if (A->rmap->n > 0 && !(a->colidx)) {
777     *missing = PETSC_TRUE;
778     if (d) *d = 0;
779     PetscCall(PetscInfo(A, "Matrix has no entries therefore is missing diagonal\n"));
780   } else {
781     diag = a->diag;
782     for (i = 0; i < A->rmap->n; i++) {
783       if (diag[i] == -1) {
784         *missing = PETSC_TRUE;
785         if (d) *d = i;
786         PetscCall(PetscInfo(A, "Matrix is missing diagonal number %" PetscInt_FMT "\n", i));
787         break;
788       }
789     }
790   }
791   PetscFunctionReturn(PETSC_SUCCESS);
792 }
793 
794 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A)
795 {
796   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
797   PetscInt     i, j, m = A->rmap->n, shift;
798 
799   PetscFunctionBegin;
800   if (!a->diag) {
801     PetscCall(PetscMalloc1(m, &a->diag));
802     a->free_diag = PETSC_TRUE;
803   }
804   for (i = 0; i < m; i++) {                                          /* loop over rows */
805     shift      = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
806     a->diag[i] = -1;
807     for (j = 0; j < a->rlen[i]; j++) {
808       if (a->colidx[shift + a->sliceheight * j] == i) {
809         a->diag[i] = shift + a->sliceheight * j;
810         break;
811       }
812     }
813   }
814   PetscFunctionReturn(PETSC_SUCCESS);
815 }
816 
817 /*
818   Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways
819 */
820 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A, PetscScalar omega, PetscScalar fshift)
821 {
822   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
823   PetscInt     i, *diag, m = A->rmap->n;
824   MatScalar   *val = a->val;
825   PetscScalar *idiag, *mdiag;
826 
827   PetscFunctionBegin;
828   if (a->idiagvalid) PetscFunctionReturn(PETSC_SUCCESS);
829   PetscCall(MatMarkDiagonal_SeqSELL(A));
830   diag = a->diag;
831   if (!a->idiag) {
832     PetscCall(PetscMalloc3(m, &a->idiag, m, &a->mdiag, m, &a->ssor_work));
833     val = a->val;
834   }
835   mdiag = a->mdiag;
836   idiag = a->idiag;
837 
838   if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) {
839     for (i = 0; i < m; i++) {
840       mdiag[i] = val[diag[i]];
841       if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */
842         PetscCheck(PetscRealPart(fshift), PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Zero diagonal on row %" PetscInt_FMT, i);
843         PetscCall(PetscInfo(A, "Zero diagonal on row %" PetscInt_FMT "\n", i));
844         A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
845         A->factorerror_zeropivot_value = 0.0;
846         A->factorerror_zeropivot_row   = i;
847       }
848       idiag[i] = 1.0 / val[diag[i]];
849     }
850     PetscCall(PetscLogFlops(m));
851   } else {
852     for (i = 0; i < m; i++) {
853       mdiag[i] = val[diag[i]];
854       idiag[i] = omega / (fshift + val[diag[i]]);
855     }
856     PetscCall(PetscLogFlops(2.0 * m));
857   }
858   a->idiagvalid = PETSC_TRUE;
859   PetscFunctionReturn(PETSC_SUCCESS);
860 }
861 
862 PetscErrorCode MatZeroEntries_SeqSELL(Mat A)
863 {
864   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
865 
866   PetscFunctionBegin;
867   PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices]));
868   PetscCall(MatSeqSELLInvalidateDiagonal(A));
869   PetscFunctionReturn(PETSC_SUCCESS);
870 }
871 
872 PetscErrorCode MatDestroy_SeqSELL(Mat A)
873 {
874   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
875 
876   PetscFunctionBegin;
877 #if defined(PETSC_USE_LOG)
878   PetscCall(PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->n, A->cmap->n, a->nz));
879 #endif
880   PetscCall(MatSeqXSELLFreeSELL(A, &a->val, &a->colidx));
881   PetscCall(ISDestroy(&a->row));
882   PetscCall(ISDestroy(&a->col));
883   PetscCall(PetscFree(a->diag));
884   PetscCall(PetscFree(a->rlen));
885   PetscCall(PetscFree(a->sliidx));
886   PetscCall(PetscFree3(a->idiag, a->mdiag, a->ssor_work));
887   PetscCall(PetscFree(a->solve_work));
888   PetscCall(ISDestroy(&a->icol));
889   PetscCall(PetscFree(a->saved_values));
890   PetscCall(PetscFree2(a->getrowcols, a->getrowvals));
891   PetscCall(PetscFree(A->data));
892 #if defined(PETSC_HAVE_CUDA)
893   PetscCall(PetscFree(a->chunk_slice_map));
894 #endif
895 
896   PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL));
897   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL));
898   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL));
899   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetPreallocation_C", NULL));
900   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetArray_C", NULL));
901   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLRestoreArray_C", NULL));
902   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqaij_C", NULL));
903 #if defined(PETSC_HAVE_CUDA)
904   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqsellcuda_C", NULL));
905 #endif
906   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetFillRatio_C", NULL));
907   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetMaxSliceWidth_C", NULL));
908   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetAvgSliceWidth_C", NULL));
909   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetVarSliceSize_C", NULL));
910   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetSliceHeight_C", NULL));
911   PetscFunctionReturn(PETSC_SUCCESS);
912 }
913 
914 PetscErrorCode MatSetOption_SeqSELL(Mat A, MatOption op, PetscBool flg)
915 {
916   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
917 
918   PetscFunctionBegin;
919   switch (op) {
920   case MAT_ROW_ORIENTED:
921     a->roworiented = flg;
922     break;
923   case MAT_KEEP_NONZERO_PATTERN:
924     a->keepnonzeropattern = flg;
925     break;
926   case MAT_NEW_NONZERO_LOCATIONS:
927     a->nonew = (flg ? 0 : 1);
928     break;
929   case MAT_NEW_NONZERO_LOCATION_ERR:
930     a->nonew = (flg ? -1 : 0);
931     break;
932   case MAT_NEW_NONZERO_ALLOCATION_ERR:
933     a->nonew = (flg ? -2 : 0);
934     break;
935   case MAT_UNUSED_NONZERO_LOCATION_ERR:
936     a->nounused = (flg ? -1 : 0);
937     break;
938   case MAT_FORCE_DIAGONAL_ENTRIES:
939   case MAT_IGNORE_OFF_PROC_ENTRIES:
940   case MAT_USE_HASH_TABLE:
941   case MAT_SORTED_FULL:
942     PetscCall(PetscInfo(A, "Option %s ignored\n", MatOptions[op]));
943     break;
944   case MAT_SPD:
945   case MAT_SYMMETRIC:
946   case MAT_STRUCTURALLY_SYMMETRIC:
947   case MAT_HERMITIAN:
948   case MAT_SYMMETRY_ETERNAL:
949   case MAT_STRUCTURAL_SYMMETRY_ETERNAL:
950   case MAT_SPD_ETERNAL:
951     /* These options are handled directly by MatSetOption() */
952     break;
953   default:
954     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unknown option %d", op);
955   }
956   PetscFunctionReturn(PETSC_SUCCESS);
957 }
958 
959 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A, Vec v)
960 {
961   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
962   PetscInt     i, j, n, shift;
963   PetscScalar *x, zero = 0.0;
964 
965   PetscFunctionBegin;
966   PetscCall(VecGetLocalSize(v, &n));
967   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
968 
969   if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) {
970     PetscInt *diag = a->diag;
971     PetscCall(VecGetArray(v, &x));
972     for (i = 0; i < n; i++) x[i] = 1.0 / a->val[diag[i]];
973     PetscCall(VecRestoreArray(v, &x));
974     PetscFunctionReturn(PETSC_SUCCESS);
975   }
976 
977   PetscCall(VecSet(v, zero));
978   PetscCall(VecGetArray(v, &x));
979   for (i = 0; i < n; i++) {                                     /* loop over rows */
980     shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
981     x[i]  = 0;
982     for (j = 0; j < a->rlen[i]; j++) {
983       if (a->colidx[shift + a->sliceheight * j] == i) {
984         x[i] = a->val[shift + a->sliceheight * j];
985         break;
986       }
987     }
988   }
989   PetscCall(VecRestoreArray(v, &x));
990   PetscFunctionReturn(PETSC_SUCCESS);
991 }
992 
993 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A, Vec ll, Vec rr)
994 {
995   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
996   const PetscScalar *l, *r;
997   PetscInt           i, j, m, n, row;
998 
999   PetscFunctionBegin;
1000   if (ll) {
1001     /* The local size is used so that VecMPI can be passed to this routine
1002        by MatDiagonalScale_MPISELL */
1003     PetscCall(VecGetLocalSize(ll, &m));
1004     PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1005     PetscCall(VecGetArrayRead(ll, &l));
1006     for (i = 0; i < a->totalslices; i++) {                            /* loop over slices */
1007       if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */
1008         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) {
1009           if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= l[a->sliceheight * i + row];
1010         }
1011       } else {
1012         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) { a->val[j] *= l[a->sliceheight * i + row]; }
1013       }
1014     }
1015     PetscCall(VecRestoreArrayRead(ll, &l));
1016     PetscCall(PetscLogFlops(a->nz));
1017   }
1018   if (rr) {
1019     PetscCall(VecGetLocalSize(rr, &n));
1020     PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
1021     PetscCall(VecGetArrayRead(rr, &r));
1022     for (i = 0; i < a->totalslices; i++) {                            /* loop over slices */
1023       if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */
1024         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) % a->sliceheight)) {
1025           if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= r[a->colidx[j]];
1026         }
1027       } else {
1028         for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j++) a->val[j] *= r[a->colidx[j]];
1029       }
1030     }
1031     PetscCall(VecRestoreArrayRead(rr, &r));
1032     PetscCall(PetscLogFlops(a->nz));
1033   }
1034   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1035 #if defined(PETSC_HAVE_CUDA)
1036   if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU;
1037 #endif
1038   PetscFunctionReturn(PETSC_SUCCESS);
1039 }
1040 
1041 PetscErrorCode MatGetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[])
1042 {
1043   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1044   PetscInt    *cp, i, k, low, high, t, row, col, l;
1045   PetscInt     shift;
1046   MatScalar   *vp;
1047 
1048   PetscFunctionBegin;
1049   for (k = 0; k < m; k++) { /* loop over requested rows */
1050     row = im[k];
1051     if (row < 0) continue;
1052     PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
1053     shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight); /* starting index of the row */
1054     cp    = a->colidx + shift;                                        /* pointer to the row */
1055     vp    = a->val + shift;                                           /* pointer to the row */
1056     for (l = 0; l < n; l++) {                                         /* loop over requested columns */
1057       col = in[l];
1058       if (col < 0) continue;
1059       PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
1060       high = a->rlen[row];
1061       low  = 0; /* assume unsorted */
1062       while (high - low > 5) {
1063         t = (low + high) / 2;
1064         if (*(cp + a->sliceheight * t) > col) high = t;
1065         else low = t;
1066       }
1067       for (i = low; i < high; i++) {
1068         if (*(cp + a->sliceheight * i) > col) break;
1069         if (*(cp + a->sliceheight * i) == col) {
1070           *v++ = *(vp + a->sliceheight * i);
1071           goto finished;
1072         }
1073       }
1074       *v++ = 0.0;
1075     finished:;
1076     }
1077   }
1078   PetscFunctionReturn(PETSC_SUCCESS);
1079 }
1080 
1081 PetscErrorCode MatView_SeqSELL_ASCII(Mat A, PetscViewer viewer)
1082 {
1083   Mat_SeqSELL      *a = (Mat_SeqSELL *)A->data;
1084   PetscInt          i, j, m = A->rmap->n, shift;
1085   const char       *name;
1086   PetscViewerFormat format;
1087 
1088   PetscFunctionBegin;
1089   PetscCall(PetscViewerGetFormat(viewer, &format));
1090   if (format == PETSC_VIEWER_ASCII_MATLAB) {
1091     PetscInt nofinalvalue = 0;
1092     /*
1093     if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) {
1094       nofinalvalue = 1;
1095     }
1096     */
1097     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1098     PetscCall(PetscViewerASCIIPrintf(viewer, "%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n", m, A->cmap->n));
1099     PetscCall(PetscViewerASCIIPrintf(viewer, "%% Nonzeros = %" PetscInt_FMT " \n", a->nz));
1100 #if defined(PETSC_USE_COMPLEX)
1101     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",4);\n", a->nz + nofinalvalue));
1102 #else
1103     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",3);\n", a->nz + nofinalvalue));
1104 #endif
1105     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = [\n"));
1106 
1107     for (i = 0; i < m; i++) {
1108       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1109       for (j = 0; j < a->rlen[i]; j++) {
1110 #if defined(PETSC_USE_COMPLEX)
1111         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1112 #else
1113         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)a->val[shift + a->sliceheight * j]));
1114 #endif
1115       }
1116     }
1117     /*
1118     if (nofinalvalue) {
1119 #if defined(PETSC_USE_COMPLEX)
1120       PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n",m,A->cmap->n,0.,0.));
1121 #else
1122       PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n",m,A->cmap->n,0.0));
1123 #endif
1124     }
1125     */
1126     PetscCall(PetscObjectGetName((PetscObject)A, &name));
1127     PetscCall(PetscViewerASCIIPrintf(viewer, "];\n %s = spconvert(zzz);\n", name));
1128     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1129   } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) {
1130     PetscFunctionReturn(PETSC_SUCCESS);
1131   } else if (format == PETSC_VIEWER_ASCII_COMMON) {
1132     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1133     for (i = 0; i < m; i++) {
1134       PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1135       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1136       for (j = 0; j < a->rlen[i]; j++) {
1137 #if defined(PETSC_USE_COMPLEX)
1138         if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) {
1139           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1140         } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) {
1141           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1142         } else if (PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) {
1143           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j])));
1144         }
1145 #else
1146         if (a->val[shift + a->sliceheight * j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j]));
1147 #endif
1148       }
1149       PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1150     }
1151     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1152   } else if (format == PETSC_VIEWER_ASCII_DENSE) {
1153     PetscInt    cnt = 0, jcnt;
1154     PetscScalar value;
1155 #if defined(PETSC_USE_COMPLEX)
1156     PetscBool realonly = PETSC_TRUE;
1157     for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1158       if (PetscImaginaryPart(a->val[i]) != 0.0) {
1159         realonly = PETSC_FALSE;
1160         break;
1161       }
1162     }
1163 #endif
1164 
1165     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1166     for (i = 0; i < m; i++) {
1167       jcnt  = 0;
1168       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1169       for (j = 0; j < A->cmap->n; j++) {
1170         if (jcnt < a->rlen[i] && j == a->colidx[shift + a->sliceheight * j]) {
1171           value = a->val[cnt++];
1172           jcnt++;
1173         } else {
1174           value = 0.0;
1175         }
1176 #if defined(PETSC_USE_COMPLEX)
1177         if (realonly) {
1178           PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)PetscRealPart(value)));
1179         } else {
1180           PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e+%7.5e i ", (double)PetscRealPart(value), (double)PetscImaginaryPart(value)));
1181         }
1182 #else
1183         PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)value));
1184 #endif
1185       }
1186       PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1187     }
1188     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1189   } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) {
1190     PetscInt fshift = 1;
1191     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1192 #if defined(PETSC_USE_COMPLEX)
1193     PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate complex general\n"));
1194 #else
1195     PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate real general\n"));
1196 #endif
1197     PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz));
1198     for (i = 0; i < m; i++) {
1199       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1200       for (j = 0; j < a->rlen[i]; j++) {
1201 #if defined(PETSC_USE_COMPLEX)
1202         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1203 #else
1204         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)a->val[shift + a->sliceheight * j]));
1205 #endif
1206       }
1207     }
1208     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1209   } else if (format == PETSC_VIEWER_NATIVE) {
1210     for (i = 0; i < a->totalslices; i++) { /* loop over slices */
1211       PetscInt row;
1212       PetscCall(PetscViewerASCIIPrintf(viewer, "slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n", i, a->sliidx[i], a->sliidx[i + 1]));
1213       for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) {
1214 #if defined(PETSC_USE_COMPLEX)
1215         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1216           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1217         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1218           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), -(double)PetscImaginaryPart(a->val[j])));
1219         } else {
1220           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j])));
1221         }
1222 #else
1223         PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)a->val[j]));
1224 #endif
1225       }
1226     }
1227   } else {
1228     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1229     if (A->factortype) {
1230       for (i = 0; i < m; i++) {
1231         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1232         PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1233         /* L part */
1234         for (j = shift; j < a->diag[i]; j += a->sliceheight) {
1235 #if defined(PETSC_USE_COMPLEX)
1236           if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0) {
1237             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1238           } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0) {
1239             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j]))));
1240           } else {
1241             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j])));
1242           }
1243 #else
1244           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]));
1245 #endif
1246         }
1247         /* diagonal */
1248         j = a->diag[i];
1249 #if defined(PETSC_USE_COMPLEX)
1250         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1251           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)PetscImaginaryPart(1.0 / a->val[j])));
1252         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1253           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)(-PetscImaginaryPart(1.0 / a->val[j]))));
1254         } else {
1255           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j])));
1256         }
1257 #else
1258         PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)(1.0 / a->val[j])));
1259 #endif
1260 
1261         /* U part */
1262         for (j = a->diag[i] + 1; j < shift + a->sliceheight * a->rlen[i]; j += a->sliceheight) {
1263 #if defined(PETSC_USE_COMPLEX)
1264           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1265             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1266           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1267             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j]))));
1268           } else {
1269             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j])));
1270           }
1271 #else
1272           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]));
1273 #endif
1274         }
1275         PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1276       }
1277     } else {
1278       for (i = 0; i < m; i++) {
1279         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1280         PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1281         for (j = 0; j < a->rlen[i]; j++) {
1282 #if defined(PETSC_USE_COMPLEX)
1283           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1284             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1285           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1286             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1287           } else {
1288             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j])));
1289           }
1290 #else
1291           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j]));
1292 #endif
1293         }
1294         PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1295       }
1296     }
1297     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1298   }
1299   PetscCall(PetscViewerFlush(viewer));
1300   PetscFunctionReturn(PETSC_SUCCESS);
1301 }
1302 
1303 #include <petscdraw.h>
1304 PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw, void *Aa)
1305 {
1306   Mat               A = (Mat)Aa;
1307   Mat_SeqSELL      *a = (Mat_SeqSELL *)A->data;
1308   PetscInt          i, j, m = A->rmap->n, shift;
1309   int               color;
1310   PetscReal         xl, yl, xr, yr, x_l, x_r, y_l, y_r;
1311   PetscViewer       viewer;
1312   PetscViewerFormat format;
1313 
1314   PetscFunctionBegin;
1315   PetscCall(PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer));
1316   PetscCall(PetscViewerGetFormat(viewer, &format));
1317   PetscCall(PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr));
1318 
1319   /* loop over matrix elements drawing boxes */
1320 
1321   if (format != PETSC_VIEWER_DRAW_CONTOUR) {
1322     PetscDrawCollectiveBegin(draw);
1323     /* Blue for negative, Cyan for zero and  Red for positive */
1324     color = PETSC_DRAW_BLUE;
1325     for (i = 0; i < m; i++) {
1326       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1327       y_l   = m - i - 1.0;
1328       y_r   = y_l + 1.0;
1329       for (j = 0; j < a->rlen[i]; j++) {
1330         x_l = a->colidx[shift + a->sliceheight * j];
1331         x_r = x_l + 1.0;
1332         if (PetscRealPart(a->val[shift + a->sliceheight * j]) >= 0.) continue;
1333         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1334       }
1335     }
1336     color = PETSC_DRAW_CYAN;
1337     for (i = 0; i < m; i++) {
1338       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1339       y_l   = m - i - 1.0;
1340       y_r   = y_l + 1.0;
1341       for (j = 0; j < a->rlen[i]; j++) {
1342         x_l = a->colidx[shift + a->sliceheight * j];
1343         x_r = x_l + 1.0;
1344         if (a->val[shift + a->sliceheight * j] != 0.) continue;
1345         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1346       }
1347     }
1348     color = PETSC_DRAW_RED;
1349     for (i = 0; i < m; i++) {
1350       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1351       y_l   = m - i - 1.0;
1352       y_r   = y_l + 1.0;
1353       for (j = 0; j < a->rlen[i]; j++) {
1354         x_l = a->colidx[shift + a->sliceheight * j];
1355         x_r = x_l + 1.0;
1356         if (PetscRealPart(a->val[shift + a->sliceheight * j]) <= 0.) continue;
1357         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1358       }
1359     }
1360     PetscDrawCollectiveEnd(draw);
1361   } else {
1362     /* use contour shading to indicate magnitude of values */
1363     /* first determine max of all nonzero values */
1364     PetscReal minv = 0.0, maxv = 0.0;
1365     PetscInt  count = 0;
1366     PetscDraw popup;
1367     for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1368       if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]);
1369     }
1370     if (minv >= maxv) maxv = minv + PETSC_SMALL;
1371     PetscCall(PetscDrawGetPopup(draw, &popup));
1372     PetscCall(PetscDrawScalePopup(popup, minv, maxv));
1373 
1374     PetscDrawCollectiveBegin(draw);
1375     for (i = 0; i < m; i++) {
1376       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1377       y_l   = m - i - 1.0;
1378       y_r   = y_l + 1.0;
1379       for (j = 0; j < a->rlen[i]; j++) {
1380         x_l   = a->colidx[shift + a->sliceheight * j];
1381         x_r   = x_l + 1.0;
1382         color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]), minv, maxv);
1383         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1384         count++;
1385       }
1386     }
1387     PetscDrawCollectiveEnd(draw);
1388   }
1389   PetscFunctionReturn(PETSC_SUCCESS);
1390 }
1391 
1392 #include <petscdraw.h>
1393 PetscErrorCode MatView_SeqSELL_Draw(Mat A, PetscViewer viewer)
1394 {
1395   PetscDraw draw;
1396   PetscReal xr, yr, xl, yl, h, w;
1397   PetscBool isnull;
1398 
1399   PetscFunctionBegin;
1400   PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw));
1401   PetscCall(PetscDrawIsNull(draw, &isnull));
1402   if (isnull) PetscFunctionReturn(PETSC_SUCCESS);
1403 
1404   xr = A->cmap->n;
1405   yr = A->rmap->n;
1406   h  = yr / 10.0;
1407   w  = xr / 10.0;
1408   xr += w;
1409   yr += h;
1410   xl = -w;
1411   yl = -h;
1412   PetscCall(PetscDrawSetCoordinates(draw, xl, yl, xr, yr));
1413   PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer));
1414   PetscCall(PetscDrawZoom(draw, MatView_SeqSELL_Draw_Zoom, A));
1415   PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL));
1416   PetscCall(PetscDrawSave(draw));
1417   PetscFunctionReturn(PETSC_SUCCESS);
1418 }
1419 
1420 PetscErrorCode MatView_SeqSELL(Mat A, PetscViewer viewer)
1421 {
1422   PetscBool iascii, isbinary, isdraw;
1423 
1424   PetscFunctionBegin;
1425   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
1426   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
1427   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
1428   if (iascii) {
1429     PetscCall(MatView_SeqSELL_ASCII(A, viewer));
1430   } else if (isbinary) {
1431     /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */
1432   } else if (isdraw) PetscCall(MatView_SeqSELL_Draw(A, viewer));
1433   PetscFunctionReturn(PETSC_SUCCESS);
1434 }
1435 
1436 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A, MatAssemblyType mode)
1437 {
1438   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1439   PetscInt     i, shift, row_in_slice, row, nrow, *cp, lastcol, j, k;
1440   MatScalar   *vp;
1441 #if defined(PETSC_HAVE_CUDA)
1442   PetscInt totalchunks = 0;
1443 #endif
1444 
1445   PetscFunctionBegin;
1446   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
1447   /* To do: compress out the unused elements */
1448   PetscCall(MatMarkDiagonal_SeqSELL(A));
1449   PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n", A->rmap->n, A->cmap->n, a->maxallocmat, a->sliidx[a->totalslices], a->nz, a->sliidx[a->totalslices] - a->nz));
1450   PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n", a->reallocs));
1451   PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rlenmax));
1452   a->nonzerorowcnt = 0;
1453   /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */
1454   for (i = 0; i < a->totalslices; ++i) {
1455     shift = a->sliidx[i];                                                   /* starting index of the slice */
1456     cp    = a->colidx + shift;                                              /* pointer to the column indices of the slice */
1457     vp    = a->val + shift;                                                 /* pointer to the nonzero values of the slice */
1458     for (row_in_slice = 0; row_in_slice < a->sliceheight; ++row_in_slice) { /* loop over rows in the slice */
1459       row  = a->sliceheight * i + row_in_slice;
1460       nrow = a->rlen[row]; /* number of nonzeros in row */
1461       /*
1462         Search for the nearest nonzero. Normally setting the index to zero may cause extra communication.
1463         But if the entire slice are empty, it is fine to use 0 since the index will not be loaded.
1464       */
1465       lastcol = 0;
1466       if (nrow > 0) { /* nonempty row */
1467         a->nonzerorowcnt++;
1468         lastcol = cp[a->sliceheight * (nrow - 1) + row_in_slice]; /* use the index from the last nonzero at current row */
1469       } else if (!row_in_slice) {                                 /* first row of the correct slice is empty */
1470         for (j = 1; j < a->sliceheight; j++) {
1471           if (a->rlen[a->sliceheight * i + j]) {
1472             lastcol = cp[j];
1473             break;
1474           }
1475         }
1476       } else {
1477         if (a->sliidx[i + 1] != shift) lastcol = cp[row_in_slice - 1]; /* use the index from the previous row */
1478       }
1479 
1480       for (k = nrow; k < (a->sliidx[i + 1] - shift) / a->sliceheight; ++k) {
1481         cp[a->sliceheight * k + row_in_slice] = lastcol;
1482         vp[a->sliceheight * k + row_in_slice] = (MatScalar)0;
1483       }
1484     }
1485   }
1486 
1487   A->info.mallocs += a->reallocs;
1488   a->reallocs = 0;
1489 
1490   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1491 #if defined(PETSC_HAVE_CUDA)
1492   if (!a->chunksize && a->totalslices) {
1493     a->chunksize = 64;
1494     while (a->chunksize < 1024 && 2 * a->chunksize <= a->sliidx[a->totalslices] / a->totalslices) a->chunksize *= 2;
1495     totalchunks = 1 + (a->sliidx[a->totalslices] - 1) / a->chunksize;
1496   }
1497   if (totalchunks != a->totalchunks) {
1498     PetscCall(PetscFree(a->chunk_slice_map));
1499     PetscCall(PetscMalloc1(totalchunks, &a->chunk_slice_map));
1500     a->totalchunks = totalchunks;
1501   }
1502   j = 0;
1503   for (i = 0; i < totalchunks; i++) {
1504     while (a->sliidx[j + 1] <= i * a->chunksize && j < a->totalslices) j++;
1505     a->chunk_slice_map[i] = j;
1506   }
1507 #endif
1508   PetscFunctionReturn(PETSC_SUCCESS);
1509 }
1510 
1511 PetscErrorCode MatGetInfo_SeqSELL(Mat A, MatInfoType flag, MatInfo *info)
1512 {
1513   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1514 
1515   PetscFunctionBegin;
1516   info->block_size   = 1.0;
1517   info->nz_allocated = a->maxallocmat;
1518   info->nz_used      = a->sliidx[a->totalslices]; /* include padding zeros */
1519   info->nz_unneeded  = (a->maxallocmat - a->sliidx[a->totalslices]);
1520   info->assemblies   = A->num_ass;
1521   info->mallocs      = A->info.mallocs;
1522   info->memory       = 0; /* REVIEW ME */
1523   if (A->factortype) {
1524     info->fill_ratio_given  = A->info.fill_ratio_given;
1525     info->fill_ratio_needed = A->info.fill_ratio_needed;
1526     info->factor_mallocs    = A->info.factor_mallocs;
1527   } else {
1528     info->fill_ratio_given  = 0;
1529     info->fill_ratio_needed = 0;
1530     info->factor_mallocs    = 0;
1531   }
1532   PetscFunctionReturn(PETSC_SUCCESS);
1533 }
1534 
1535 PetscErrorCode MatSetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is)
1536 {
1537   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1538   PetscInt     shift, i, k, l, low, high, t, ii, row, col, nrow;
1539   PetscInt    *cp, nonew = a->nonew, lastcol = -1;
1540   MatScalar   *vp, value;
1541 #if defined(PETSC_HAVE_CUDA)
1542   PetscBool inserted = PETSC_FALSE;
1543   PetscInt  mul      = DEVICE_MEM_ALIGN / a->sliceheight;
1544 #endif
1545 
1546   PetscFunctionBegin;
1547   for (k = 0; k < m; k++) { /* loop over added rows */
1548     row = im[k];
1549     if (row < 0) continue;
1550     PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
1551     shift = a->sliidx[row / a->sliceheight] + row % a->sliceheight; /* starting index of the row */
1552     cp    = a->colidx + shift;                                      /* pointer to the row */
1553     vp    = a->val + shift;                                         /* pointer to the row */
1554     nrow  = a->rlen[row];
1555     low   = 0;
1556     high  = nrow;
1557 
1558     for (l = 0; l < n; l++) { /* loop over added columns */
1559       col = in[l];
1560       if (col < 0) continue;
1561       PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
1562       if (a->roworiented) {
1563         value = v[l + k * n];
1564       } else {
1565         value = v[k + l * m];
1566       }
1567       if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue;
1568 
1569       /* search in this row for the specified column, i indicates the column to be set */
1570       if (col <= lastcol) low = 0;
1571       else high = nrow;
1572       lastcol = col;
1573       while (high - low > 5) {
1574         t = (low + high) / 2;
1575         if (*(cp + a->sliceheight * t) > col) high = t;
1576         else low = t;
1577       }
1578       for (i = low; i < high; i++) {
1579         if (*(cp + a->sliceheight * i) > col) break;
1580         if (*(cp + a->sliceheight * i) == col) {
1581           if (is == ADD_VALUES) *(vp + a->sliceheight * i) += value;
1582           else *(vp + a->sliceheight * i) = value;
1583 #if defined(PETSC_HAVE_CUDA)
1584           inserted = PETSC_TRUE;
1585 #endif
1586           low = i + 1;
1587           goto noinsert;
1588         }
1589       }
1590       if (value == 0.0 && a->ignorezeroentries) goto noinsert;
1591       if (nonew == 1) goto noinsert;
1592       PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col);
1593 #if defined(PETSC_HAVE_CUDA)
1594       MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar, mul);
1595 #else
1596       /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */
1597       MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar, 1);
1598 #endif
1599       /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */
1600       for (ii = nrow - 1; ii >= i; ii--) {
1601         *(cp + a->sliceheight * (ii + 1)) = *(cp + a->sliceheight * ii);
1602         *(vp + a->sliceheight * (ii + 1)) = *(vp + a->sliceheight * ii);
1603       }
1604       a->rlen[row]++;
1605       *(cp + a->sliceheight * i) = col;
1606       *(vp + a->sliceheight * i) = value;
1607       a->nz++;
1608       A->nonzerostate++;
1609 #if defined(PETSC_HAVE_CUDA)
1610       inserted = PETSC_TRUE;
1611 #endif
1612       low = i + 1;
1613       high++;
1614       nrow++;
1615     noinsert:;
1616     }
1617     a->rlen[row] = nrow;
1618   }
1619 #if defined(PETSC_HAVE_CUDA)
1620   if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && inserted) A->offloadmask = PETSC_OFFLOAD_CPU;
1621 #endif
1622   PetscFunctionReturn(PETSC_SUCCESS);
1623 }
1624 
1625 PetscErrorCode MatCopy_SeqSELL(Mat A, Mat B, MatStructure str)
1626 {
1627   PetscFunctionBegin;
1628   /* If the two matrices have the same copy implementation, use fast copy. */
1629   if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
1630     Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1631     Mat_SeqSELL *b = (Mat_SeqSELL *)B->data;
1632 
1633     PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices], PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Number of nonzeros in two matrices are different");
1634     PetscCall(PetscArraycpy(b->val, a->val, a->sliidx[a->totalslices]));
1635   } else {
1636     PetscCall(MatCopy_Basic(A, B, str));
1637   }
1638   PetscFunctionReturn(PETSC_SUCCESS);
1639 }
1640 
1641 PetscErrorCode MatSetUp_SeqSELL(Mat A)
1642 {
1643   PetscFunctionBegin;
1644   PetscCall(MatSeqSELLSetPreallocation(A, PETSC_DEFAULT, NULL));
1645   PetscFunctionReturn(PETSC_SUCCESS);
1646 }
1647 
1648 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A, PetscScalar *array[])
1649 {
1650   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1651 
1652   PetscFunctionBegin;
1653   *array = a->val;
1654   PetscFunctionReturn(PETSC_SUCCESS);
1655 }
1656 
1657 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A, PetscScalar *array[])
1658 {
1659   PetscFunctionBegin;
1660   PetscFunctionReturn(PETSC_SUCCESS);
1661 }
1662 
1663 PetscErrorCode MatRealPart_SeqSELL(Mat A)
1664 {
1665   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1666   PetscInt     i;
1667   MatScalar   *aval = a->val;
1668 
1669   PetscFunctionBegin;
1670   for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscRealPart(aval[i]);
1671 #if defined(PETSC_HAVE_CUDA)
1672   if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU;
1673 #endif
1674   PetscFunctionReturn(PETSC_SUCCESS);
1675 }
1676 
1677 PetscErrorCode MatImaginaryPart_SeqSELL(Mat A)
1678 {
1679   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1680   PetscInt     i;
1681   MatScalar   *aval = a->val;
1682 
1683   PetscFunctionBegin;
1684   for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]);
1685   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1686 #if defined(PETSC_HAVE_CUDA)
1687   if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU;
1688 #endif
1689   PetscFunctionReturn(PETSC_SUCCESS);
1690 }
1691 
1692 PetscErrorCode MatScale_SeqSELL(Mat inA, PetscScalar alpha)
1693 {
1694   Mat_SeqSELL *a      = (Mat_SeqSELL *)inA->data;
1695   MatScalar   *aval   = a->val;
1696   PetscScalar  oalpha = alpha;
1697   PetscBLASInt one    = 1, size;
1698 
1699   PetscFunctionBegin;
1700   PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices], &size));
1701   PetscCallBLAS("BLASscal", BLASscal_(&size, &oalpha, aval, &one));
1702   PetscCall(PetscLogFlops(a->nz));
1703   PetscCall(MatSeqSELLInvalidateDiagonal(inA));
1704 #if defined(PETSC_HAVE_CUDA)
1705   if (inA->offloadmask != PETSC_OFFLOAD_UNALLOCATED) inA->offloadmask = PETSC_OFFLOAD_CPU;
1706 #endif
1707   PetscFunctionReturn(PETSC_SUCCESS);
1708 }
1709 
1710 PetscErrorCode MatShift_SeqSELL(Mat Y, PetscScalar a)
1711 {
1712   Mat_SeqSELL *y = (Mat_SeqSELL *)Y->data;
1713 
1714   PetscFunctionBegin;
1715   if (!Y->preallocated || !y->nz) PetscCall(MatSeqSELLSetPreallocation(Y, 1, NULL));
1716   PetscCall(MatShift_Basic(Y, a));
1717   PetscFunctionReturn(PETSC_SUCCESS);
1718 }
1719 
1720 PetscErrorCode MatSOR_SeqSELL(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
1721 {
1722   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
1723   PetscScalar       *x, sum, *t;
1724   const MatScalar   *idiag = NULL, *mdiag;
1725   const PetscScalar *b, *xb;
1726   PetscInt           n, m = A->rmap->n, i, j, shift;
1727   const PetscInt    *diag;
1728 
1729   PetscFunctionBegin;
1730   its = its * lits;
1731 
1732   if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */
1733   if (!a->idiagvalid) PetscCall(MatInvertDiagonal_SeqSELL(A, omega, fshift));
1734   a->fshift = fshift;
1735   a->omega  = omega;
1736 
1737   diag  = a->diag;
1738   t     = a->ssor_work;
1739   idiag = a->idiag;
1740   mdiag = a->mdiag;
1741 
1742   PetscCall(VecGetArray(xx, &x));
1743   PetscCall(VecGetArrayRead(bb, &b));
1744   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1745   PetscCheck(flag != SOR_APPLY_UPPER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_UPPER is not implemented");
1746   PetscCheck(flag != SOR_APPLY_LOWER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_LOWER is not implemented");
1747   PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat");
1748 
1749   if (flag & SOR_ZERO_INITIAL_GUESS) {
1750     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1751       for (i = 0; i < m; i++) {
1752         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1753         sum   = b[i];
1754         n     = (diag[i] - shift) / a->sliceheight;
1755         for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]];
1756         t[i] = sum;
1757         x[i] = sum * idiag[i];
1758       }
1759       xb = t;
1760       PetscCall(PetscLogFlops(a->nz));
1761     } else xb = b;
1762     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1763       for (i = m - 1; i >= 0; i--) {
1764         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1765         sum   = xb[i];
1766         n     = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1;
1767         for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]];
1768         if (xb == b) {
1769           x[i] = sum * idiag[i];
1770         } else {
1771           x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1772         }
1773       }
1774       PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1775     }
1776     its--;
1777   }
1778   while (its--) {
1779     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1780       for (i = 0; i < m; i++) {
1781         /* lower */
1782         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1783         sum   = b[i];
1784         n     = (diag[i] - shift) / a->sliceheight;
1785         for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]];
1786         t[i] = sum; /* save application of the lower-triangular part */
1787         /* upper */
1788         n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1;
1789         for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]];
1790         x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1791       }
1792       xb = t;
1793       PetscCall(PetscLogFlops(2.0 * a->nz));
1794     } else xb = b;
1795     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1796       for (i = m - 1; i >= 0; i--) {
1797         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1798         sum   = xb[i];
1799         if (xb == b) {
1800           /* whole matrix (no checkpointing available) */
1801           n = a->rlen[i];
1802           for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]];
1803           x[i] = (1. - omega) * x[i] + (sum + mdiag[i] * x[i]) * idiag[i];
1804         } else { /* lower-triangular part has been saved, so only apply upper-triangular */
1805           n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1;
1806           for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]];
1807           x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1808         }
1809       }
1810       if (xb == b) {
1811         PetscCall(PetscLogFlops(2.0 * a->nz));
1812       } else {
1813         PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1814       }
1815     }
1816   }
1817   PetscCall(VecRestoreArray(xx, &x));
1818   PetscCall(VecRestoreArrayRead(bb, &b));
1819   PetscFunctionReturn(PETSC_SUCCESS);
1820 }
1821 
1822 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL,
1823                                        MatGetRow_SeqSELL,
1824                                        MatRestoreRow_SeqSELL,
1825                                        MatMult_SeqSELL,
1826                                        /* 4*/ MatMultAdd_SeqSELL,
1827                                        MatMultTranspose_SeqSELL,
1828                                        MatMultTransposeAdd_SeqSELL,
1829                                        NULL,
1830                                        NULL,
1831                                        NULL,
1832                                        /* 10*/ NULL,
1833                                        NULL,
1834                                        NULL,
1835                                        MatSOR_SeqSELL,
1836                                        NULL,
1837                                        /* 15*/ MatGetInfo_SeqSELL,
1838                                        MatEqual_SeqSELL,
1839                                        MatGetDiagonal_SeqSELL,
1840                                        MatDiagonalScale_SeqSELL,
1841                                        NULL,
1842                                        /* 20*/ NULL,
1843                                        MatAssemblyEnd_SeqSELL,
1844                                        MatSetOption_SeqSELL,
1845                                        MatZeroEntries_SeqSELL,
1846                                        /* 24*/ NULL,
1847                                        NULL,
1848                                        NULL,
1849                                        NULL,
1850                                        NULL,
1851                                        /* 29*/ MatSetUp_SeqSELL,
1852                                        NULL,
1853                                        NULL,
1854                                        NULL,
1855                                        NULL,
1856                                        /* 34*/ MatDuplicate_SeqSELL,
1857                                        NULL,
1858                                        NULL,
1859                                        NULL,
1860                                        NULL,
1861                                        /* 39*/ NULL,
1862                                        NULL,
1863                                        NULL,
1864                                        MatGetValues_SeqSELL,
1865                                        MatCopy_SeqSELL,
1866                                        /* 44*/ NULL,
1867                                        MatScale_SeqSELL,
1868                                        MatShift_SeqSELL,
1869                                        NULL,
1870                                        NULL,
1871                                        /* 49*/ NULL,
1872                                        NULL,
1873                                        NULL,
1874                                        NULL,
1875                                        NULL,
1876                                        /* 54*/ MatFDColoringCreate_SeqXAIJ,
1877                                        NULL,
1878                                        NULL,
1879                                        NULL,
1880                                        NULL,
1881                                        /* 59*/ NULL,
1882                                        MatDestroy_SeqSELL,
1883                                        MatView_SeqSELL,
1884                                        NULL,
1885                                        NULL,
1886                                        /* 64*/ NULL,
1887                                        NULL,
1888                                        NULL,
1889                                        NULL,
1890                                        NULL,
1891                                        /* 69*/ NULL,
1892                                        NULL,
1893                                        NULL,
1894                                        NULL,
1895                                        NULL,
1896                                        /* 74*/ NULL,
1897                                        MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */
1898                                        NULL,
1899                                        NULL,
1900                                        NULL,
1901                                        /* 79*/ NULL,
1902                                        NULL,
1903                                        NULL,
1904                                        NULL,
1905                                        NULL,
1906                                        /* 84*/ NULL,
1907                                        NULL,
1908                                        NULL,
1909                                        NULL,
1910                                        NULL,
1911                                        /* 89*/ NULL,
1912                                        NULL,
1913                                        NULL,
1914                                        NULL,
1915                                        NULL,
1916                                        /* 94*/ NULL,
1917                                        NULL,
1918                                        NULL,
1919                                        NULL,
1920                                        NULL,
1921                                        /* 99*/ NULL,
1922                                        NULL,
1923                                        NULL,
1924                                        MatConjugate_SeqSELL,
1925                                        NULL,
1926                                        /*104*/ NULL,
1927                                        NULL,
1928                                        NULL,
1929                                        NULL,
1930                                        NULL,
1931                                        /*109*/ NULL,
1932                                        NULL,
1933                                        NULL,
1934                                        NULL,
1935                                        MatMissingDiagonal_SeqSELL,
1936                                        /*114*/ NULL,
1937                                        NULL,
1938                                        NULL,
1939                                        NULL,
1940                                        NULL,
1941                                        /*119*/ NULL,
1942                                        NULL,
1943                                        NULL,
1944                                        NULL,
1945                                        NULL,
1946                                        /*124*/ NULL,
1947                                        NULL,
1948                                        NULL,
1949                                        NULL,
1950                                        NULL,
1951                                        /*129*/ NULL,
1952                                        NULL,
1953                                        NULL,
1954                                        NULL,
1955                                        NULL,
1956                                        /*134*/ NULL,
1957                                        NULL,
1958                                        NULL,
1959                                        NULL,
1960                                        NULL,
1961                                        /*139*/ NULL,
1962                                        NULL,
1963                                        NULL,
1964                                        MatFDColoringSetUp_SeqXAIJ,
1965                                        NULL,
1966                                        /*144*/ NULL,
1967                                        NULL,
1968                                        NULL,
1969                                        NULL,
1970                                        NULL,
1971                                        NULL,
1972                                        /*150*/ NULL,
1973                                        NULL};
1974 
1975 PetscErrorCode MatStoreValues_SeqSELL(Mat mat)
1976 {
1977   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1978 
1979   PetscFunctionBegin;
1980   PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1981 
1982   /* allocate space for values if not already there */
1983   if (!a->saved_values) PetscCall(PetscMalloc1(a->sliidx[a->totalslices] + 1, &a->saved_values));
1984 
1985   /* copy values over */
1986   PetscCall(PetscArraycpy(a->saved_values, a->val, a->sliidx[a->totalslices]));
1987   PetscFunctionReturn(PETSC_SUCCESS);
1988 }
1989 
1990 PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat)
1991 {
1992   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1993 
1994   PetscFunctionBegin;
1995   PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1996   PetscCheck(a->saved_values, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatStoreValues(A);first");
1997   PetscCall(PetscArraycpy(a->val, a->saved_values, a->sliidx[a->totalslices]));
1998   PetscFunctionReturn(PETSC_SUCCESS);
1999 }
2000 
2001 PetscErrorCode MatSeqSELLGetFillRatio_SeqSELL(Mat mat, PetscReal *ratio)
2002 {
2003   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
2004 
2005   PetscFunctionBegin;
2006   if (a->totalslices && a->sliidx[a->totalslices]) {
2007     *ratio = (PetscReal)(a->sliidx[a->totalslices] - a->nz) / a->sliidx[a->totalslices];
2008   } else {
2009     *ratio = 0.0;
2010   }
2011   PetscFunctionReturn(PETSC_SUCCESS);
2012 }
2013 
2014 PetscErrorCode MatSeqSELLGetMaxSliceWidth_SeqSELL(Mat mat, PetscInt *slicewidth)
2015 {
2016   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
2017   PetscInt     i, current_slicewidth;
2018 
2019   PetscFunctionBegin;
2020   *slicewidth = 0;
2021   for (i = 0; i < a->totalslices; i++) {
2022     current_slicewidth = (a->sliidx[i + 1] - a->sliidx[i]) / a->sliceheight;
2023     if (current_slicewidth > *slicewidth) *slicewidth = current_slicewidth;
2024   }
2025   PetscFunctionReturn(PETSC_SUCCESS);
2026 }
2027 
2028 PetscErrorCode MatSeqSELLGetAvgSliceWidth_SeqSELL(Mat mat, PetscReal *slicewidth)
2029 {
2030   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
2031 
2032   PetscFunctionBegin;
2033   *slicewidth = 0;
2034   if (a->totalslices) { *slicewidth = (PetscReal)a->sliidx[a->totalslices] / a->sliceheight / a->totalslices; }
2035   PetscFunctionReturn(PETSC_SUCCESS);
2036 }
2037 
2038 PetscErrorCode MatSeqSELLGetVarSliceSize_SeqSELL(Mat mat, PetscReal *variance)
2039 {
2040   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
2041   PetscReal    mean;
2042   PetscInt     i, totalslices = a->totalslices, *sliidx = a->sliidx;
2043 
2044   PetscFunctionBegin;
2045   *variance = 0;
2046   if (totalslices) {
2047     mean = (PetscReal)sliidx[totalslices] / totalslices;
2048     for (i = 1; i <= totalslices; i++) { *variance += ((PetscReal)(sliidx[i] - sliidx[i - 1]) - mean) * ((PetscReal)(sliidx[i] - sliidx[i - 1]) - mean) / totalslices; }
2049   }
2050   PetscFunctionReturn(PETSC_SUCCESS);
2051 }
2052 
2053 PetscErrorCode MatSeqSELLSetSliceHeight_SeqSELL(Mat A, PetscInt sliceheight)
2054 {
2055   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2056 
2057   PetscFunctionBegin;
2058   if (A->preallocated) PetscFunctionReturn(PETSC_SUCCESS);
2059   PetscCheck(a->sliceheight <= 0 || a->sliceheight == sliceheight, PETSC_COMM_SELF, PETSC_ERR_SUP, "Cannot change slice height %" PetscInt_FMT " to %" PetscInt_FMT, a->sliceheight, sliceheight);
2060   a->sliceheight = sliceheight;
2061 #if defined(PETSC_HAVE_CUDA)
2062   PetscCheck(DEVICE_MEM_ALIGN % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "DEVICE_MEM_ALIGN is not divisible by the slice height %" PetscInt_FMT, sliceheight);
2063 #endif
2064   PetscFunctionReturn(PETSC_SUCCESS);
2065 }
2066 
2067 /*@C
2068   MatSeqSELLRestoreArray - returns access to the array where the data for a `MATSEQSELL` matrix is stored obtained by `MatSeqSELLGetArray()`
2069 
2070   Not Collective
2071 
2072   Input Parameters:
2073 +  A - a `MATSEQSELL` matrix
2074 -  array - pointer to the data
2075 
2076   Level: intermediate
2077 
2078 .seealso: `Mat`, `MATSEQSELL`, `MatSeqSELLGetArray()`, `MatSeqSELLRestoreArrayF90()`
2079 @*/
2080 PetscErrorCode MatSeqSELLRestoreArray(Mat A, PetscScalar **array)
2081 {
2082   PetscFunctionBegin;
2083   PetscUseMethod(A, "MatSeqSELLRestoreArray_C", (Mat, PetscScalar **), (A, array));
2084   PetscFunctionReturn(PETSC_SUCCESS);
2085 }
2086 
2087 /*@C
2088   MatSeqSELLGetFillRatio - returns a ratio that indicates the irregularity of the matrix.
2089 
2090   Not Collective
2091 
2092   Input Parameter:
2093 .  A - a MATSEQSELL matrix
2094 
2095   Output Parameter:
2096 .  ratio - ratio of number of padded zeros to number of allocated elements
2097 
2098   Level: intermediate
2099 @*/
2100 PetscErrorCode MatSeqSELLGetFillRatio(Mat A, PetscReal *ratio)
2101 {
2102   PetscFunctionBegin;
2103   PetscUseMethod(A, "MatSeqSELLGetFillRatio_C", (Mat, PetscReal *), (A, ratio));
2104   PetscFunctionReturn(PETSC_SUCCESS);
2105 }
2106 
2107 /*@C
2108   MatSeqSELLGetMaxSliceWidth - returns the maximum slice width.
2109 
2110   Not Collective
2111 
2112   Input Parameter:
2113 .  A - a MATSEQSELL matrix
2114 
2115   Output Parameter:
2116 .  slicewidth - maximum slice width
2117 
2118  Level: intermediate
2119 @*/
2120 PetscErrorCode MatSeqSELLGetMaxSliceWidth(Mat A, PetscInt *slicewidth)
2121 {
2122   PetscFunctionBegin;
2123   PetscUseMethod(A, "MatSeqSELLGetMaxSliceWidth_C", (Mat, PetscInt *), (A, slicewidth));
2124   PetscFunctionReturn(PETSC_SUCCESS);
2125 }
2126 
2127 /*@C
2128   MatSeqSELLGetAvgSliceWidth - returns the average slice width.
2129 
2130   Not Collective
2131 
2132   Input Parameter:
2133 .  A - a MATSEQSELL matrix
2134 
2135   Output Parameter:
2136 .  slicewidth - average slice width
2137 
2138   Level: intermediate
2139 @*/
2140 PetscErrorCode MatSeqSELLGetAvgSliceWidth(Mat A, PetscReal *slicewidth)
2141 {
2142   PetscFunctionBegin;
2143   PetscUseMethod(A, "MatSeqSELLGetAvgSliceWidth_C", (Mat, PetscReal *), (A, slicewidth));
2144   PetscFunctionReturn(PETSC_SUCCESS);
2145 }
2146 
2147 /*@C
2148   MatSeqSELLSetSliceHeight - sets the slice height.
2149 
2150   Not Collective
2151 
2152   Input Parameters:
2153 +  A - a MATSEQSELL matrix
2154 -  sliceheight - slice height
2155 
2156   Notes:
2157   You cannot change the slice height once it have been set.
2158 
2159   The slice height must be set before MatSetUp() or MatXXXSetPreallocation() is called.
2160 
2161   Level: intermediate
2162 @*/
2163 PetscErrorCode MatSeqSELLSetSliceHeight(Mat A, PetscInt sliceheight)
2164 {
2165   PetscFunctionBegin;
2166   PetscUseMethod(A, "MatSeqSELLSetSliceHeight_C", (Mat, PetscInt), (A, sliceheight));
2167   PetscFunctionReturn(PETSC_SUCCESS);
2168 }
2169 
2170 /*@C
2171   MatSeqSELLGetVarSliceSize - returns the variance of the slice size.
2172 
2173   Not Collective
2174 
2175   Input Parameter:
2176 .  A - a MATSEQSELL matrix
2177 
2178   Output Parameter:
2179 .  variance - variance of the slice size
2180 
2181   Level: intermediate
2182 @*/
2183 PetscErrorCode MatSeqSELLGetVarSliceSize(Mat A, PetscReal *variance)
2184 {
2185   PetscFunctionBegin;
2186   PetscUseMethod(A, "MatSeqSELLGetVarSliceSize_C", (Mat, PetscReal *), (A, variance));
2187   PetscFunctionReturn(PETSC_SUCCESS);
2188 }
2189 
2190 #if defined(PETSC_HAVE_CUDA)
2191 PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat);
2192 #endif
2193 
2194 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B)
2195 {
2196   Mat_SeqSELL *b;
2197   PetscMPIInt  size;
2198 
2199   PetscFunctionBegin;
2200   PetscCall(PetscCitationsRegister(citation, &cited));
2201   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
2202   PetscCheck(size <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Comm must be of size 1");
2203 
2204   PetscCall(PetscNew(&b));
2205 
2206   B->data   = (void *)b;
2207   B->ops[0] = MatOps_Values;
2208 
2209   b->row                = NULL;
2210   b->col                = NULL;
2211   b->icol               = NULL;
2212   b->reallocs           = 0;
2213   b->ignorezeroentries  = PETSC_FALSE;
2214   b->roworiented        = PETSC_TRUE;
2215   b->nonew              = 0;
2216   b->diag               = NULL;
2217   b->solve_work         = NULL;
2218   B->spptr              = NULL;
2219   b->saved_values       = NULL;
2220   b->idiag              = NULL;
2221   b->mdiag              = NULL;
2222   b->ssor_work          = NULL;
2223   b->omega              = 1.0;
2224   b->fshift             = 0.0;
2225   b->idiagvalid         = PETSC_FALSE;
2226   b->keepnonzeropattern = PETSC_FALSE;
2227   b->sliceheight        = 0;
2228 
2229   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELL));
2230   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetArray_C", MatSeqSELLGetArray_SeqSELL));
2231   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLRestoreArray_C", MatSeqSELLRestoreArray_SeqSELL));
2232   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqSELL));
2233   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqSELL));
2234   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetPreallocation_C", MatSeqSELLSetPreallocation_SeqSELL));
2235   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqaij_C", MatConvert_SeqSELL_SeqAIJ));
2236 #if defined(PETSC_HAVE_CUDA)
2237   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqsellcuda_C", MatConvert_SeqSELL_SeqSELLCUDA));
2238 #endif
2239   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetFillRatio_C", MatSeqSELLGetFillRatio_SeqSELL));
2240   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetMaxSliceWidth_C", MatSeqSELLGetMaxSliceWidth_SeqSELL));
2241   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetAvgSliceWidth_C", MatSeqSELLGetAvgSliceWidth_SeqSELL));
2242   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetVarSliceSize_C", MatSeqSELLGetVarSliceSize_SeqSELL));
2243   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetSliceHeight_C", MatSeqSELLSetSliceHeight_SeqSELL));
2244 
2245   PetscObjectOptionsBegin((PetscObject)B);
2246   {
2247     PetscInt  newsh = -1;
2248     PetscBool flg;
2249 #if defined(PETSC_HAVE_CUDA)
2250     PetscInt chunksize = 0;
2251 #endif
2252 
2253     PetscCall(PetscOptionsInt("-mat_sell_slice_height", "Set the slice height used to store SELL matrix", "MatSELLSetSliceHeight", newsh, &newsh, &flg));
2254     if (flg) { PetscCall(MatSeqSELLSetSliceHeight(B, newsh)); }
2255 #if defined(PETSC_HAVE_CUDA)
2256     PetscCall(PetscOptionsInt("-mat_sell_chunk_size", "Set the chunksize for load-balanced CUDA kernels. Choices include 64,128,256,512,1024", NULL, chunksize, &chunksize, &flg));
2257     if (flg) {
2258       PetscCheck(chunksize >= 64 && chunksize <= 1024 && chunksize % 64 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "chunksize must be a number in {64,128,256,512,1024}: value %" PetscInt_FMT, chunksize);
2259       b->chunksize = chunksize;
2260     }
2261 #endif
2262   }
2263   PetscOptionsEnd();
2264   PetscFunctionReturn(PETSC_SUCCESS);
2265 }
2266 
2267 /*
2268  Given a matrix generated with MatGetFactor() duplicates all the information in A into B
2269  */
2270 PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace)
2271 {
2272   Mat_SeqSELL *c = (Mat_SeqSELL *)C->data, *a = (Mat_SeqSELL *)A->data;
2273   PetscInt     i, m                           = A->rmap->n;
2274   PetscInt     totalslices = a->totalslices;
2275 
2276   PetscFunctionBegin;
2277   C->factortype = A->factortype;
2278   c->row        = NULL;
2279   c->col        = NULL;
2280   c->icol       = NULL;
2281   c->reallocs   = 0;
2282   C->assembled  = PETSC_TRUE;
2283 
2284   PetscCall(PetscLayoutReference(A->rmap, &C->rmap));
2285   PetscCall(PetscLayoutReference(A->cmap, &C->cmap));
2286 
2287   PetscCall(PetscMalloc1(a->sliceheight * totalslices, &c->rlen));
2288   PetscCall(PetscMalloc1(totalslices + 1, &c->sliidx));
2289 
2290   for (i = 0; i < m; i++) c->rlen[i] = a->rlen[i];
2291   for (i = 0; i < totalslices + 1; i++) c->sliidx[i] = a->sliidx[i];
2292 
2293   /* allocate the matrix space */
2294   if (mallocmatspace) {
2295     PetscCall(PetscMalloc2(a->maxallocmat, &c->val, a->maxallocmat, &c->colidx));
2296 
2297     c->singlemalloc = PETSC_TRUE;
2298 
2299     if (m > 0) {
2300       PetscCall(PetscArraycpy(c->colidx, a->colidx, a->maxallocmat));
2301       if (cpvalues == MAT_COPY_VALUES) {
2302         PetscCall(PetscArraycpy(c->val, a->val, a->maxallocmat));
2303       } else {
2304         PetscCall(PetscArrayzero(c->val, a->maxallocmat));
2305       }
2306     }
2307   }
2308 
2309   c->ignorezeroentries = a->ignorezeroentries;
2310   c->roworiented       = a->roworiented;
2311   c->nonew             = a->nonew;
2312   if (a->diag) {
2313     PetscCall(PetscMalloc1(m, &c->diag));
2314     for (i = 0; i < m; i++) c->diag[i] = a->diag[i];
2315   } else c->diag = NULL;
2316 
2317   c->solve_work         = NULL;
2318   c->saved_values       = NULL;
2319   c->idiag              = NULL;
2320   c->ssor_work          = NULL;
2321   c->keepnonzeropattern = a->keepnonzeropattern;
2322   c->free_val           = PETSC_TRUE;
2323   c->free_colidx        = PETSC_TRUE;
2324 
2325   c->maxallocmat  = a->maxallocmat;
2326   c->maxallocrow  = a->maxallocrow;
2327   c->rlenmax      = a->rlenmax;
2328   c->nz           = a->nz;
2329   C->preallocated = PETSC_TRUE;
2330 
2331   c->nonzerorowcnt = a->nonzerorowcnt;
2332   C->nonzerostate  = A->nonzerostate;
2333 
2334   PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist));
2335   PetscFunctionReturn(PETSC_SUCCESS);
2336 }
2337 
2338 PetscErrorCode MatDuplicate_SeqSELL(Mat A, MatDuplicateOption cpvalues, Mat *B)
2339 {
2340   PetscFunctionBegin;
2341   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
2342   PetscCall(MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n));
2343   if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2344   PetscCall(MatSetType(*B, ((PetscObject)A)->type_name));
2345   PetscCall(MatDuplicateNoCreate_SeqSELL(*B, A, cpvalues, PETSC_TRUE));
2346   PetscFunctionReturn(PETSC_SUCCESS);
2347 }
2348 
2349 /*MC
2350    MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices,
2351    based on the sliced Ellpack format
2352 
2353    Options Database Key:
2354 . -mat_type seqsell - sets the matrix type to "`MATSEQELL` during a call to `MatSetFromOptions()`
2355 
2356    Level: beginner
2357 
2358 .seealso: `Mat`, `MatCreateSeqSell()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ`
2359 M*/
2360 
2361 /*MC
2362    MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices.
2363 
2364    This matrix type is identical to `MATSEQSELL` when constructed with a single process communicator,
2365    and `MATMPISELL` otherwise.  As a result, for single process communicators,
2366   `MatSeqSELLSetPreallocation()` is supported, and similarly `MatMPISELLSetPreallocation()` is supported
2367   for communicators controlling multiple processes.  It is recommended that you call both of
2368   the above preallocation routines for simplicity.
2369 
2370    Options Database Key:
2371 . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions()
2372 
2373   Level: beginner
2374 
2375   Notes:
2376    This format is only supported for real scalars, double precision, and 32-bit indices (the defaults).
2377 
2378    It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of
2379    non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement.
2380 
2381   Developer Notes:
2382    On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance.
2383 
2384    The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8
2385 .vb
2386                             (2 0  3 4)
2387    Consider the matrix A =  (5 0  6 0)
2388                             (0 0  7 8)
2389                             (0 0  9 9)
2390 
2391    symbolically the Ellpack format can be written as
2392 
2393         (2 3 4 |)           (0 2 3 |)
2394    v =  (5 6 0 |)  colidx = (0 2 2 |)
2395         --------            ---------
2396         (7 8 |)             (2 3 |)
2397         (9 9 |)             (2 3 |)
2398 
2399     The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case).
2400     Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and
2401     zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice.
2402 
2403     The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9)  and for colidx is (0 0 2 2 3 2 2 2 3 3)
2404 
2405 .ve
2406 
2407       See MatMult_SeqSELL() for how this format is used with the SIMD operations to achieve high performance.
2408 
2409  References:
2410 . * - Hong Zhang, Richard T. Mills, Karl Rupp, and Barry F. Smith, Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512},
2411    Proceedings of the 47th International Conference on Parallel Processing, 2018.
2412 
2413 .seealso: `Mat`, `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSell()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ`
2414 M*/
2415 
2416 /*@C
2417        MatCreateSeqSELL - Creates a sparse matrix in `MATSEQSELL` format.
2418 
2419  Collective
2420 
2421  Input Parameters:
2422 +  comm - MPI communicator, set to `PETSC_COMM_SELF`
2423 .  m - number of rows
2424 .  n - number of columns
2425 .  rlenmax - maximum number of nonzeros in a row, ignored if `rlen` is provided
2426 -  rlen - array containing the number of nonzeros in the various rows (possibly different for each row) or NULL
2427 
2428  Output Parameter:
2429 .  A - the matrix
2430 
2431  Level: intermediate
2432 
2433  Notes:
2434  It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
2435  MatXXXXSetPreallocation() paradigm instead of this routine directly.
2436  [MatXXXXSetPreallocation() is, for example, `MatSeqSELLSetPreallocation()`]
2437 
2438  Specify the preallocated storage with either `rlenmax` or `rlen` (not both).
2439  Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory
2440  allocation.
2441 
2442 .seealso: `Mat`, `MATSEQSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATSEQSELL`, `MATMPISELL`
2443  @*/
2444 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt rlenmax, const PetscInt rlen[], Mat *A)
2445 {
2446   PetscFunctionBegin;
2447   PetscCall(MatCreate(comm, A));
2448   PetscCall(MatSetSizes(*A, m, n, m, n));
2449   PetscCall(MatSetType(*A, MATSEQSELL));
2450   PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A, rlenmax, rlen));
2451   PetscFunctionReturn(PETSC_SUCCESS);
2452 }
2453 
2454 PetscErrorCode MatEqual_SeqSELL(Mat A, Mat B, PetscBool *flg)
2455 {
2456   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data, *b = (Mat_SeqSELL *)B->data;
2457   PetscInt     totalslices = a->totalslices;
2458 
2459   PetscFunctionBegin;
2460   /* If the  matrix dimensions are not equal,or no of nonzeros */
2461   if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) || (a->nz != b->nz) || (a->rlenmax != b->rlenmax)) {
2462     *flg = PETSC_FALSE;
2463     PetscFunctionReturn(PETSC_SUCCESS);
2464   }
2465   /* if the a->colidx are the same */
2466   PetscCall(PetscArraycmp(a->colidx, b->colidx, a->sliidx[totalslices], flg));
2467   if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
2468   /* if a->val are the same */
2469   PetscCall(PetscArraycmp(a->val, b->val, a->sliidx[totalslices], flg));
2470   PetscFunctionReturn(PETSC_SUCCESS);
2471 }
2472 
2473 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A)
2474 {
2475   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2476 
2477   PetscFunctionBegin;
2478   a->idiagvalid = PETSC_FALSE;
2479   PetscFunctionReturn(PETSC_SUCCESS);
2480 }
2481 
2482 PetscErrorCode MatConjugate_SeqSELL(Mat A)
2483 {
2484 #if defined(PETSC_USE_COMPLEX)
2485   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2486   PetscInt     i;
2487   PetscScalar *val = a->val;
2488 
2489   PetscFunctionBegin;
2490   for (i = 0; i < a->sliidx[a->totalslices]; i++) { val[i] = PetscConj(val[i]); }
2491   #if defined(PETSC_HAVE_CUDA)
2492   if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU;
2493   #endif
2494 #else
2495   PetscFunctionBegin;
2496 #endif
2497   PetscFunctionReturn(PETSC_SUCCESS);
2498 }
2499