xref: /petsc/src/mat/impls/sell/seq/sell.c (revision 27f169480a6668db3ba90f8ef6ef68d542d113fa)
1 /*
2   Defines the basic matrix operations for the SELL matrix storage format.
3 */
4 #include <../src/mat/impls/sell/seq/sell.h> /*I   "petscmat.h"  I*/
5 #include <petscblaslapack.h>
6 #include <petsc/private/kernels/blocktranspose.h>
7 
8 static PetscBool  cited      = PETSC_FALSE;
9 static const char citation[] = "@inproceedings{ZhangELLPACK2018,\n"
10                                " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n"
11                                " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n"
12                                " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n"
13                                " year = 2018\n"
14                                "}\n";
15 
16 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
17 
18   #include <immintrin.h>
19 
20   #if !defined(_MM_SCALE_8)
21     #define _MM_SCALE_8 8
22   #endif
23 
24   #if defined(__AVX512F__)
25     /* these do not work
26    vec_idx  = _mm512_loadunpackhi_epi32(vec_idx,acolidx);
27    vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval);
28   */
29     #define AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
30       /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \
31       vec_idx  = _mm256_loadu_si256((__m256i const *)acolidx); \
32       vec_vals = _mm512_loadu_pd(aval); \
33       vec_x    = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); \
34       vec_y    = _mm512_fmadd_pd(vec_x, vec_vals, vec_y)
35   #elif defined(__AVX2__) && defined(__FMA__)
36     #define AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
37       vec_vals = _mm256_loadu_pd(aval); \
38       vec_idx  = _mm_loadu_si128((__m128i const *)acolidx); /* SSE2 */ \
39       vec_x    = _mm256_i32gather_pd(x, vec_idx, _MM_SCALE_8); \
40       vec_y    = _mm256_fmadd_pd(vec_x, vec_vals, vec_y)
41   #endif
42 #endif /* PETSC_HAVE_IMMINTRIN_H */
43 
44 /*@
45   MatSeqSELLSetPreallocation - For good matrix assembly performance
46   the user should preallocate the matrix storage by setting the parameter `nz`
47   (or the array `nnz`).
48 
49   Collective
50 
51   Input Parameters:
52 + B       - The `MATSEQSELL` matrix
53 . rlenmax - number of nonzeros per row (same for all rows), ignored if `rlen` is provided
54 - rlen    - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
55 
56   Level: intermediate
57 
58   Notes:
59   Specify the preallocated storage with either `rlenmax` or `rlen` (not both).
60   Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory
61   allocation.
62 
63   You can call `MatGetInfo()` to get information on how effective the preallocation was;
64   for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
65   You can also run with the option `-info` and look for messages with the string
66   malloc in them to see if additional memory allocation was needed.
67 
68   Developer Notes:
69   Use `rlenmax` of `MAT_SKIP_ALLOCATION` to not allocate any space for the matrix
70   entries or columns indices.
71 
72   The maximum number of nonzeos in any row should be as accurate as possible.
73   If it is underestimated, you will get bad performance due to reallocation
74   (`MatSeqXSELLReallocateSELL()`).
75 
76 .seealso: `Mat`, `MATSEQSELL`, `MATSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()`
77  @*/
MatSeqSELLSetPreallocation(Mat B,PetscInt rlenmax,const PetscInt rlen[])78 PetscErrorCode MatSeqSELLSetPreallocation(Mat B, PetscInt rlenmax, const PetscInt rlen[])
79 {
80   PetscFunctionBegin;
81   PetscValidHeaderSpecific(B, MAT_CLASSID, 1);
82   PetscValidType(B, 1);
83   PetscTryMethod(B, "MatSeqSELLSetPreallocation_C", (Mat, PetscInt, const PetscInt[]), (B, rlenmax, rlen));
84   PetscFunctionReturn(PETSC_SUCCESS);
85 }
86 
MatSeqSELLSetPreallocation_SeqSELL(Mat B,PetscInt maxallocrow,const PetscInt rlen[])87 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B, PetscInt maxallocrow, const PetscInt rlen[])
88 {
89   Mat_SeqSELL *b;
90   PetscInt     i, j, totalslices;
91 #if defined(PETSC_HAVE_CUPM)
92   PetscInt rlenmax = 0;
93 #endif
94   PetscBool skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE;
95 
96   PetscFunctionBegin;
97   if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE;
98   if (maxallocrow == MAT_SKIP_ALLOCATION) {
99     skipallocation = PETSC_TRUE;
100     maxallocrow    = 0;
101   }
102 
103   PetscCall(PetscLayoutSetUp(B->rmap));
104   PetscCall(PetscLayoutSetUp(B->cmap));
105 
106   /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */
107   if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5;
108   PetscCheck(maxallocrow >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "maxallocrow cannot be less than 0: value %" PetscInt_FMT, maxallocrow);
109   if (rlen) {
110     for (i = 0; i < B->rmap->n; i++) {
111       PetscCheck(rlen[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, rlen[i]);
112       PetscCheck(rlen[i] <= B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT, i, rlen[i], B->cmap->n);
113     }
114   }
115 
116   B->preallocated = PETSC_TRUE;
117 
118   b = (Mat_SeqSELL *)B->data;
119 
120   if (!b->sliceheight) { /* not set yet */
121 #if defined(PETSC_HAVE_CUPM)
122     b->sliceheight = 16;
123 #else
124     b->sliceheight = 8;
125 #endif
126   }
127   totalslices    = PetscCeilInt(B->rmap->n, b->sliceheight);
128   b->totalslices = totalslices;
129   if (!skipallocation) {
130     if (B->rmap->n % b->sliceheight) PetscCall(PetscInfo(B, "Padding rows to the SEQSELL matrix because the number of rows is not the multiple of the slice height (value %" PetscInt_FMT ")\n", B->rmap->n));
131 
132     if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */
133       PetscCall(PetscMalloc1(totalslices + 1, &b->sliidx));
134     }
135     if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */
136       if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10;
137       else if (maxallocrow < 0) maxallocrow = 1;
138 #if defined(PETSC_HAVE_CUPM)
139       rlenmax = maxallocrow;
140       /* Pad the slice to DEVICE_MEM_ALIGN */
141       while (b->sliceheight * maxallocrow % DEVICE_MEM_ALIGN) maxallocrow++;
142 #endif
143       for (i = 0; i <= totalslices; i++) b->sliidx[i] = b->sliceheight * i * maxallocrow;
144     } else {
145 #if defined(PETSC_HAVE_CUPM)
146       PetscInt mul = DEVICE_MEM_ALIGN / b->sliceheight;
147 #endif
148       maxallocrow  = 0;
149       b->sliidx[0] = 0;
150       for (i = 1; i < totalslices; i++) {
151         b->sliidx[i] = 0;
152         for (j = 0; j < b->sliceheight; j++) b->sliidx[i] = PetscMax(b->sliidx[i], rlen[b->sliceheight * (i - 1) + j]);
153 #if defined(PETSC_HAVE_CUPM)
154         if (mul != 0) { /* Pad the slice to DEVICE_MEM_ALIGN if sliceheight < DEVICE_MEM_ALIGN */
155           rlenmax      = PetscMax(b->sliidx[i], rlenmax);
156           b->sliidx[i] = ((b->sliidx[i] - 1) / mul + 1) * mul;
157         }
158 #endif
159         maxallocrow = PetscMax(b->sliidx[i], maxallocrow);
160         PetscCall(PetscIntSumError(b->sliidx[i - 1], b->sliceheight * b->sliidx[i], &b->sliidx[i]));
161       }
162       /* last slice */
163       b->sliidx[totalslices] = 0;
164       for (j = b->sliceheight * (totalslices - 1); j < B->rmap->n; j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices], rlen[j]);
165 #if defined(PETSC_HAVE_CUPM)
166       if (mul != 0) {
167         rlenmax                = PetscMax(b->sliidx[i], rlenmax);
168         b->sliidx[totalslices] = ((b->sliidx[totalslices] - 1) / mul + 1) * mul;
169       }
170 #endif
171       maxallocrow            = PetscMax(b->sliidx[totalslices], maxallocrow);
172       b->sliidx[totalslices] = b->sliidx[totalslices - 1] + b->sliceheight * b->sliidx[totalslices];
173     }
174 
175     /* allocate space for val, colidx, rlen */
176     /* FIXME: should B's old memory be unlogged? */
177     PetscCall(MatSeqXSELLFreeSELL(B, &b->val, &b->colidx));
178     /* FIXME: assuming an element of the bit array takes 8 bits */
179     PetscCall(PetscMalloc2(b->sliidx[totalslices], &b->val, b->sliidx[totalslices], &b->colidx));
180     /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */
181     PetscCall(PetscCalloc1(b->sliceheight * totalslices, &b->rlen));
182 
183     b->singlemalloc = PETSC_TRUE;
184     b->free_val     = PETSC_TRUE;
185     b->free_colidx  = PETSC_TRUE;
186   } else {
187     b->free_val    = PETSC_FALSE;
188     b->free_colidx = PETSC_FALSE;
189   }
190 
191   b->nz          = 0;
192   b->maxallocrow = maxallocrow;
193 #if defined(PETSC_HAVE_CUPM)
194   b->rlenmax = rlenmax;
195 #else
196   b->rlenmax = maxallocrow;
197 #endif
198   b->maxallocmat      = b->sliidx[totalslices];
199   B->info.nz_unneeded = (double)b->maxallocmat;
200   if (realalloc) PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));
201   PetscFunctionReturn(PETSC_SUCCESS);
202 }
203 
MatGetRow_SeqSELL(Mat A,PetscInt row,PetscInt * nz,PetscInt ** idx,PetscScalar ** v)204 static PetscErrorCode MatGetRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
205 {
206   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
207   PetscInt     shift;
208 
209   PetscFunctionBegin;
210   PetscCheck(row >= 0 && row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range", row);
211   if (nz) *nz = a->rlen[row];
212   shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight);
213   if (!a->getrowcols) PetscCall(PetscMalloc2(a->rlenmax, &a->getrowcols, a->rlenmax, &a->getrowvals));
214   if (idx) {
215     PetscInt j;
216     for (j = 0; j < a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift + a->sliceheight * j];
217     *idx = a->getrowcols;
218   }
219   if (v) {
220     PetscInt j;
221     for (j = 0; j < a->rlen[row]; j++) a->getrowvals[j] = a->val[shift + a->sliceheight * j];
222     *v = a->getrowvals;
223   }
224   PetscFunctionReturn(PETSC_SUCCESS);
225 }
226 
MatRestoreRow_SeqSELL(Mat A,PetscInt row,PetscInt * nz,PetscInt ** idx,PetscScalar ** v)227 static PetscErrorCode MatRestoreRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
228 {
229   PetscFunctionBegin;
230   PetscFunctionReturn(PETSC_SUCCESS);
231 }
232 
MatConvert_SeqSELL_SeqAIJ(Mat A,MatType newtype,MatReuse reuse,Mat * newmat)233 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
234 {
235   Mat          B;
236   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
237   PetscInt     i;
238 
239   PetscFunctionBegin;
240   if (reuse == MAT_REUSE_MATRIX) {
241     B = *newmat;
242     PetscCall(MatZeroEntries(B));
243   } else {
244     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
245     PetscCall(MatSetSizes(B, A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N));
246     PetscCall(MatSetType(B, MATSEQAIJ));
247     PetscCall(MatSeqAIJSetPreallocation(B, 0, a->rlen));
248   }
249 
250   for (i = 0; i < A->rmap->n; i++) {
251     PetscInt     nz = 0, *cols = NULL;
252     PetscScalar *vals = NULL;
253 
254     PetscCall(MatGetRow_SeqSELL(A, i, &nz, &cols, &vals));
255     PetscCall(MatSetValues(B, 1, &i, nz, cols, vals, INSERT_VALUES));
256     PetscCall(MatRestoreRow_SeqSELL(A, i, &nz, &cols, &vals));
257   }
258 
259   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
260   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
261   B->rmap->bs = A->rmap->bs;
262 
263   if (reuse == MAT_INPLACE_MATRIX) {
264     PetscCall(MatHeaderReplace(A, &B));
265   } else {
266     *newmat = B;
267   }
268   PetscFunctionReturn(PETSC_SUCCESS);
269 }
270 
271 #include <../src/mat/impls/aij/seq/aij.h>
272 
MatConvert_SeqAIJ_SeqSELL(Mat A,MatType newtype,MatReuse reuse,Mat * newmat)273 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
274 {
275   Mat                B;
276   Mat_SeqAIJ        *a  = (Mat_SeqAIJ *)A->data;
277   PetscInt          *ai = a->i, m = A->rmap->N, n = A->cmap->N, i, *rowlengths, row, ncols;
278   const PetscInt    *cols;
279   const PetscScalar *vals;
280 
281   PetscFunctionBegin;
282   if (reuse == MAT_REUSE_MATRIX) {
283     B = *newmat;
284   } else {
285     if (PetscDefined(USE_DEBUG) || !a->ilen) {
286       PetscCall(PetscMalloc1(m, &rowlengths));
287       for (i = 0; i < m; i++) rowlengths[i] = ai[i + 1] - ai[i];
288     }
289     if (PetscDefined(USE_DEBUG) && a->ilen) {
290       PetscBool eq;
291       PetscCall(PetscArraycmp(rowlengths, a->ilen, m, &eq));
292       PetscCheck(eq, PETSC_COMM_SELF, PETSC_ERR_PLIB, "SeqAIJ ilen array incorrect");
293       PetscCall(PetscFree(rowlengths));
294       rowlengths = a->ilen;
295     } else if (a->ilen) rowlengths = a->ilen;
296     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
297     PetscCall(MatSetSizes(B, m, n, m, n));
298     PetscCall(MatSetType(B, MATSEQSELL));
299     PetscCall(MatSeqSELLSetPreallocation(B, 0, rowlengths));
300     if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths));
301   }
302 
303   for (row = 0; row < m; row++) {
304     PetscCall(MatGetRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
305     PetscCall(MatSetValues_SeqSELL(B, 1, &row, ncols, cols, vals, INSERT_VALUES));
306     PetscCall(MatRestoreRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
307   }
308   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
309   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
310   B->rmap->bs = A->rmap->bs;
311 
312   if (reuse == MAT_INPLACE_MATRIX) {
313     PetscCall(MatHeaderReplace(A, &B));
314   } else {
315     *newmat = B;
316   }
317   PetscFunctionReturn(PETSC_SUCCESS);
318 }
319 
MatMult_SeqSELL(Mat A,Vec xx,Vec yy)320 PetscErrorCode MatMult_SeqSELL(Mat A, Vec xx, Vec yy)
321 {
322   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
323   PetscScalar       *y;
324   const PetscScalar *x;
325   const MatScalar   *aval        = a->val;
326   PetscInt           totalslices = a->totalslices;
327   const PetscInt    *acolidx     = a->colidx;
328   PetscInt           i, j;
329 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
330   __m512d  vec_x, vec_y, vec_vals;
331   __m256i  vec_idx;
332   __mmask8 mask;
333   __m512d  vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
334   __m256i  vec_idx2, vec_idx3, vec_idx4;
335 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
336   __m128i   vec_idx;
337   __m256d   vec_x, vec_y, vec_y2, vec_vals;
338   MatScalar yval;
339   PetscInt  r, rows_left, row, nnz_in_row;
340 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
341   __m128d   vec_x_tmp;
342   __m256d   vec_x, vec_y, vec_y2, vec_vals;
343   MatScalar yval;
344   PetscInt  r, rows_left, row, nnz_in_row;
345 #else
346   PetscInt     k, sliceheight = a->sliceheight;
347   PetscScalar *sum;
348 #endif
349 
350 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
351   #pragma disjoint(*x, *y, *aval)
352 #endif
353 
354   PetscFunctionBegin;
355   PetscCall(VecGetArrayRead(xx, &x));
356   PetscCall(VecGetArray(yy, &y));
357 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
358   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
359   for (i = 0; i < totalslices; i++) { /* loop over slices */
360     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
361     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
362 
363     vec_y  = _mm512_setzero_pd();
364     vec_y2 = _mm512_setzero_pd();
365     vec_y3 = _mm512_setzero_pd();
366     vec_y4 = _mm512_setzero_pd();
367 
368     j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */
369     switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
370     case 3:
371       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
372       acolidx += 8;
373       aval += 8;
374       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
375       acolidx += 8;
376       aval += 8;
377       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
378       acolidx += 8;
379       aval += 8;
380       j += 3;
381       break;
382     case 2:
383       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
384       acolidx += 8;
385       aval += 8;
386       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
387       acolidx += 8;
388       aval += 8;
389       j += 2;
390       break;
391     case 1:
392       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
393       acolidx += 8;
394       aval += 8;
395       j += 1;
396       break;
397     }
398   #pragma novector
399     for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
400       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
401       acolidx += 8;
402       aval += 8;
403       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
404       acolidx += 8;
405       aval += 8;
406       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
407       acolidx += 8;
408       aval += 8;
409       AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
410       acolidx += 8;
411       aval += 8;
412     }
413 
414     vec_y = _mm512_add_pd(vec_y, vec_y2);
415     vec_y = _mm512_add_pd(vec_y, vec_y3);
416     vec_y = _mm512_add_pd(vec_y, vec_y4);
417     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
418       mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
419       _mm512_mask_storeu_pd(&y[8 * i], mask, vec_y);
420     } else {
421       _mm512_storeu_pd(&y[8 * i], vec_y);
422     }
423   }
424 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
425   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
426   for (i = 0; i < totalslices; i++) { /* loop over full slices */
427     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
428     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
429 
430     /* last slice may have padding rows. Don't use vectorization. */
431     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
432       rows_left = A->rmap->n - 8 * i;
433       for (r = 0; r < rows_left; ++r) {
434         yval       = (MatScalar)0;
435         row        = 8 * i + r;
436         nnz_in_row = a->rlen[row];
437         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
438         y[row] = yval;
439       }
440       break;
441     }
442 
443     vec_y  = _mm256_setzero_pd();
444     vec_y2 = _mm256_setzero_pd();
445 
446   /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
447   #pragma novector
448   #pragma unroll(2)
449     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
450       AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
451       aval += 4;
452       acolidx += 4;
453       AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y2);
454       aval += 4;
455       acolidx += 4;
456     }
457 
458     _mm256_storeu_pd(y + i * 8, vec_y);
459     _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
460   }
461 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
462   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
463   for (i = 0; i < totalslices; i++) { /* loop over full slices */
464     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
465     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
466 
467     vec_y  = _mm256_setzero_pd();
468     vec_y2 = _mm256_setzero_pd();
469 
470     /* last slice may have padding rows. Don't use vectorization. */
471     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
472       rows_left = A->rmap->n - 8 * i;
473       for (r = 0; r < rows_left; ++r) {
474         yval       = (MatScalar)0;
475         row        = 8 * i + r;
476         nnz_in_row = a->rlen[row];
477         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
478         y[row] = yval;
479       }
480       break;
481     }
482 
483   /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
484   #pragma novector
485   #pragma unroll(2)
486     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
487       vec_vals  = _mm256_loadu_pd(aval);
488       vec_x_tmp = _mm_setzero_pd();
489       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
490       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
491       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
492       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
493       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
494       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
495       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
496       aval += 4;
497 
498       vec_vals  = _mm256_loadu_pd(aval);
499       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
500       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
501       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
502       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
503       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
504       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
505       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
506       aval += 4;
507     }
508 
509     _mm256_storeu_pd(y + i * 8, vec_y);
510     _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
511   }
512 #else
513   PetscCall(PetscMalloc1(sliceheight, &sum));
514   for (i = 0; i < totalslices; i++) { /* loop over slices */
515     for (j = 0; j < sliceheight; j++) {
516       sum[j] = 0.0;
517       for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]];
518     }
519     if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { /* if last slice has padding rows */
520       for (j = 0; j < (A->rmap->n % sliceheight); j++) y[sliceheight * i + j] = sum[j];
521     } else {
522       for (j = 0; j < sliceheight; j++) y[sliceheight * i + j] = sum[j];
523     }
524   }
525   PetscCall(PetscFree(sum));
526 #endif
527 
528   PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); /* theoretical minimal FLOPs */
529   PetscCall(VecRestoreArrayRead(xx, &x));
530   PetscCall(VecRestoreArray(yy, &y));
531   PetscFunctionReturn(PETSC_SUCCESS);
532 }
533 
534 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h>
MatMultAdd_SeqSELL(Mat A,Vec xx,Vec yy,Vec zz)535 PetscErrorCode MatMultAdd_SeqSELL(Mat A, Vec xx, Vec yy, Vec zz)
536 {
537   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
538   PetscScalar       *y, *z;
539   const PetscScalar *x;
540   const MatScalar   *aval        = a->val;
541   PetscInt           totalslices = a->totalslices;
542   const PetscInt    *acolidx     = a->colidx;
543   PetscInt           i, j;
544 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
545   __m512d  vec_x, vec_y, vec_vals;
546   __m256i  vec_idx;
547   __mmask8 mask = 0;
548   __m512d  vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
549   __m256i  vec_idx2, vec_idx3, vec_idx4;
550 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
551   __m128d   vec_x_tmp;
552   __m256d   vec_x, vec_y, vec_y2, vec_vals;
553   MatScalar yval;
554   PetscInt  r, row, nnz_in_row;
555 #else
556   PetscInt     k, sliceheight = a->sliceheight;
557   PetscScalar *sum;
558 #endif
559 
560 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
561   #pragma disjoint(*x, *y, *aval)
562 #endif
563 
564   PetscFunctionBegin;
565   if (!a->nz) {
566     PetscCall(VecCopy(yy, zz));
567     PetscFunctionReturn(PETSC_SUCCESS);
568   }
569   PetscCall(VecGetArrayRead(xx, &x));
570   PetscCall(VecGetArrayPair(yy, zz, &y, &z));
571 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
572   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
573   for (i = 0; i < totalslices; i++) { /* loop over slices */
574     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
575     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
576 
577     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
578       mask  = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
579       vec_y = _mm512_mask_loadu_pd(vec_y, mask, &y[8 * i]);
580     } else {
581       vec_y = _mm512_loadu_pd(&y[8 * i]);
582     }
583     vec_y2 = _mm512_setzero_pd();
584     vec_y3 = _mm512_setzero_pd();
585     vec_y4 = _mm512_setzero_pd();
586 
587     j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */
588     switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
589     case 3:
590       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
591       acolidx += 8;
592       aval += 8;
593       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
594       acolidx += 8;
595       aval += 8;
596       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
597       acolidx += 8;
598       aval += 8;
599       j += 3;
600       break;
601     case 2:
602       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
603       acolidx += 8;
604       aval += 8;
605       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
606       acolidx += 8;
607       aval += 8;
608       j += 2;
609       break;
610     case 1:
611       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
612       acolidx += 8;
613       aval += 8;
614       j += 1;
615       break;
616     }
617   #pragma novector
618     for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
619       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
620       acolidx += 8;
621       aval += 8;
622       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
623       acolidx += 8;
624       aval += 8;
625       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
626       acolidx += 8;
627       aval += 8;
628       AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
629       acolidx += 8;
630       aval += 8;
631     }
632 
633     vec_y = _mm512_add_pd(vec_y, vec_y2);
634     vec_y = _mm512_add_pd(vec_y, vec_y3);
635     vec_y = _mm512_add_pd(vec_y, vec_y4);
636     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
637       _mm512_mask_storeu_pd(&z[8 * i], mask, vec_y);
638     } else {
639       _mm512_storeu_pd(&z[8 * i], vec_y);
640     }
641   }
642 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
643   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
644   for (i = 0; i < totalslices; i++) { /* loop over full slices */
645     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
646     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
647 
648     /* last slice may have padding rows. Don't use vectorization. */
649     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
650       for (r = 0; r < (A->rmap->n & 0x07); ++r) {
651         row        = 8 * i + r;
652         yval       = (MatScalar)0.0;
653         nnz_in_row = a->rlen[row];
654         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
655         z[row] = y[row] + yval;
656       }
657       break;
658     }
659 
660     vec_y  = _mm256_loadu_pd(y + 8 * i);
661     vec_y2 = _mm256_loadu_pd(y + 8 * i + 4);
662 
663     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
664     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
665       vec_vals  = _mm256_loadu_pd(aval);
666       vec_x_tmp = _mm_setzero_pd();
667       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
668       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
669       vec_x     = _mm256_setzero_pd();
670       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
671       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
672       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
673       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
674       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
675       aval += 4;
676 
677       vec_vals  = _mm256_loadu_pd(aval);
678       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
679       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
680       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
681       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
682       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
683       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
684       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
685       aval += 4;
686     }
687 
688     _mm256_storeu_pd(z + i * 8, vec_y);
689     _mm256_storeu_pd(z + i * 8 + 4, vec_y2);
690   }
691 #else
692   PetscCall(PetscMalloc1(sliceheight, &sum));
693   for (i = 0; i < totalslices; i++) { /* loop over slices */
694     for (j = 0; j < sliceheight; j++) {
695       sum[j] = 0.0;
696       for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]];
697     }
698     if (i == totalslices - 1 && (A->rmap->n % sliceheight)) {
699       for (j = 0; j < (A->rmap->n % sliceheight); j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j];
700     } else {
701       for (j = 0; j < sliceheight; j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j];
702     }
703   }
704   PetscCall(PetscFree(sum));
705 #endif
706 
707   PetscCall(PetscLogFlops(2.0 * a->nz));
708   PetscCall(VecRestoreArrayRead(xx, &x));
709   PetscCall(VecRestoreArrayPair(yy, zz, &y, &z));
710   PetscFunctionReturn(PETSC_SUCCESS);
711 }
712 
MatMultTransposeAdd_SeqSELL(Mat A,Vec xx,Vec zz,Vec yy)713 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A, Vec xx, Vec zz, Vec yy)
714 {
715   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
716   PetscScalar       *y;
717   const PetscScalar *x;
718   const MatScalar   *aval    = a->val;
719   const PetscInt    *acolidx = a->colidx;
720   PetscInt           i, j, r, row, nnz_in_row, totalslices = a->totalslices, sliceheight = a->sliceheight;
721 
722 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
723   #pragma disjoint(*x, *y, *aval)
724 #endif
725 
726   PetscFunctionBegin;
727   if (A->symmetric == PETSC_BOOL3_TRUE) {
728     PetscCall(MatMultAdd_SeqSELL(A, xx, zz, yy));
729     PetscFunctionReturn(PETSC_SUCCESS);
730   }
731   if (zz != yy) PetscCall(VecCopy(zz, yy));
732 
733   if (a->nz) {
734     PetscCall(VecGetArrayRead(xx, &x));
735     PetscCall(VecGetArray(yy, &y));
736     for (i = 0; i < a->totalslices; i++) { /* loop over slices */
737       if (i == totalslices - 1 && (A->rmap->n % sliceheight)) {
738         for (r = 0; r < (A->rmap->n % sliceheight); ++r) {
739           row        = sliceheight * i + r;
740           nnz_in_row = a->rlen[row];
741           for (j = 0; j < nnz_in_row; ++j) y[acolidx[sliceheight * j + r]] += aval[sliceheight * j + r] * x[row];
742         }
743         break;
744       }
745       for (r = 0; r < sliceheight; ++r)
746         for (j = a->sliidx[i] + r; j < a->sliidx[i + 1]; j += sliceheight) y[acolidx[j]] += aval[j] * x[sliceheight * i + r];
747     }
748     PetscCall(PetscLogFlops(2.0 * a->nz));
749     PetscCall(VecRestoreArrayRead(xx, &x));
750     PetscCall(VecRestoreArray(yy, &y));
751   }
752   PetscFunctionReturn(PETSC_SUCCESS);
753 }
754 
MatMultTranspose_SeqSELL(Mat A,Vec xx,Vec yy)755 PetscErrorCode MatMultTranspose_SeqSELL(Mat A, Vec xx, Vec yy)
756 {
757   PetscFunctionBegin;
758   if (A->symmetric == PETSC_BOOL3_TRUE) {
759     PetscCall(MatMult_SeqSELL(A, xx, yy));
760   } else {
761     PetscCall(VecSet(yy, 0.0));
762     PetscCall(MatMultTransposeAdd_SeqSELL(A, xx, yy, yy));
763   }
764   PetscFunctionReturn(PETSC_SUCCESS);
765 }
766 
MatGetDiagonalMarkers_SeqSELL(Mat A,const PetscInt ** diag,PetscBool * diagDense)767 static PetscErrorCode MatGetDiagonalMarkers_SeqSELL(Mat A, const PetscInt **diag, PetscBool *diagDense)
768 {
769   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
770 
771   PetscFunctionBegin;
772   if (A->factortype != MAT_FACTOR_NONE) {
773     PetscAssertPointer(diag, 2);
774     PetscCheck(!diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot check for dense diagonal with factored matrices");
775     *diag = a->diag;
776     PetscFunctionReturn(PETSC_SUCCESS);
777   }
778   PetscCheck(diag || diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "At least one of diag or diagDense must be requested");
779   if (a->diagNonzeroState != A->nonzerostate || (diag && !a->diag)) {
780     const PetscInt m = A->rmap->n;
781     PetscInt       shift;
782 
783     if (!diag && !a->diag) {
784       a->diagDense = PETSC_TRUE;
785       for (PetscInt i = 0; i < m; i++) {
786         PetscBool found = PETSC_FALSE;
787 
788         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
789         for (PetscInt j = 0; j < a->rlen[i]; j++) {
790           if (a->colidx[shift + a->sliceheight * j] == i) {
791             a->diag[i] = shift + a->sliceheight * j;
792             found      = PETSC_TRUE;
793             break;
794           }
795         }
796         if (!found) {
797           a->diagDense        = PETSC_FALSE;
798           *diagDense          = a->diagDense;
799           a->diagNonzeroState = A->nonzerostate;
800           PetscFunctionReturn(PETSC_SUCCESS);
801         }
802       }
803     } else {
804       if (!a->diag) PetscCall(PetscMalloc1(m, &a->diag));
805       a->diagDense = PETSC_TRUE;
806       for (PetscInt i = 0; i < m; i++) {
807         PetscBool found = PETSC_FALSE;
808 
809         shift      = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
810         a->diag[i] = -1;
811         for (PetscInt j = 0; j < a->rlen[i]; j++) {
812           if (a->colidx[shift + a->sliceheight * j] == i) {
813             a->diag[i] = shift + a->sliceheight * j;
814             found      = PETSC_TRUE;
815             break;
816           }
817         }
818         if (!found) a->diagDense = PETSC_FALSE;
819       }
820     }
821     a->diagNonzeroState = A->nonzerostate;
822   }
823   if (diag) *diag = a->diag;
824   if (diagDense) *diagDense = a->diagDense;
825   PetscFunctionReturn(PETSC_SUCCESS);
826 }
827 
828 /*
829   Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways
830 */
MatInvertDiagonalForSOR_SeqSELL(Mat A,PetscScalar omega,PetscScalar fshift)831 static PetscErrorCode MatInvertDiagonalForSOR_SeqSELL(Mat A, PetscScalar omega, PetscScalar fshift)
832 {
833   Mat_SeqSELL    *a = (Mat_SeqSELL *)A->data;
834   PetscInt        i, m = A->rmap->n;
835   MatScalar      *val = a->val;
836   PetscScalar    *idiag, *mdiag;
837   const PetscInt *diag;
838   PetscBool       diagDense;
839 
840   PetscFunctionBegin;
841   if (a->idiagState == ((PetscObject)A)->state && a->omega == omega && a->fshift == fshift) PetscFunctionReturn(PETSC_SUCCESS);
842   PetscCall(MatGetDiagonalMarkers_SeqSELL(A, &diag, &diagDense));
843   PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix must have all diagonal locations to invert them");
844 
845   if (!a->idiag) {
846     PetscCall(PetscMalloc3(m, &a->idiag, m, &a->mdiag, m, &a->ssor_work));
847     val = a->val;
848   }
849   mdiag = a->mdiag;
850   idiag = a->idiag;
851 
852   if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) {
853     for (i = 0; i < m; i++) {
854       mdiag[i] = val[diag[i]];
855       if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */
856         PetscCheck(PetscRealPart(fshift), PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Zero diagonal on row %" PetscInt_FMT, i);
857         PetscCall(PetscInfo(A, "Zero diagonal on row %" PetscInt_FMT "\n", i));
858         A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
859         A->factorerror_zeropivot_value = 0.0;
860         A->factorerror_zeropivot_row   = i;
861       }
862       idiag[i] = 1.0 / val[diag[i]];
863     }
864     PetscCall(PetscLogFlops(m));
865   } else {
866     for (i = 0; i < m; i++) {
867       mdiag[i] = val[diag[i]];
868       idiag[i] = omega / (fshift + val[diag[i]]);
869     }
870     PetscCall(PetscLogFlops(2.0 * m));
871   }
872   a->idiagState = ((PetscObject)A)->state;
873   a->omega      = omega;
874   a->fshift     = fshift;
875   PetscFunctionReturn(PETSC_SUCCESS);
876 }
877 
MatZeroEntries_SeqSELL(Mat A)878 PetscErrorCode MatZeroEntries_SeqSELL(Mat A)
879 {
880   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
881 
882   PetscFunctionBegin;
883   PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices]));
884   PetscFunctionReturn(PETSC_SUCCESS);
885 }
886 
MatDestroy_SeqSELL(Mat A)887 PetscErrorCode MatDestroy_SeqSELL(Mat A)
888 {
889   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
890 
891   PetscFunctionBegin;
892   PetscCall(PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->n, A->cmap->n, a->nz));
893   PetscCall(MatSeqXSELLFreeSELL(A, &a->val, &a->colidx));
894   PetscCall(ISDestroy(&a->row));
895   PetscCall(ISDestroy(&a->col));
896   PetscCall(PetscFree(a->diag));
897   PetscCall(PetscFree(a->rlen));
898   PetscCall(PetscFree(a->sliidx));
899   PetscCall(PetscFree3(a->idiag, a->mdiag, a->ssor_work));
900   PetscCall(PetscFree(a->solve_work));
901   PetscCall(ISDestroy(&a->icol));
902   PetscCall(PetscFree(a->saved_values));
903   PetscCall(PetscFree2(a->getrowcols, a->getrowvals));
904   PetscCall(PetscFree(A->data));
905 #if defined(PETSC_HAVE_CUPM)
906   PetscCall(PetscFree(a->chunk_slice_map));
907 #endif
908 
909   PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL));
910   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL));
911   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL));
912   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetPreallocation_C", NULL));
913   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetArray_C", NULL));
914   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLRestoreArray_C", NULL));
915   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqaij_C", NULL));
916 #if defined(PETSC_HAVE_CUDA)
917   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqsellcuda_C", NULL));
918 #endif
919 #if defined(PETSC_HAVE_HIP)
920   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqsellhip_C", NULL));
921 #endif
922   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetFillRatio_C", NULL));
923   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetMaxSliceWidth_C", NULL));
924   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetAvgSliceWidth_C", NULL));
925   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetVarSliceSize_C", NULL));
926   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetSliceHeight_C", NULL));
927   PetscFunctionReturn(PETSC_SUCCESS);
928 }
929 
MatSetOption_SeqSELL(Mat A,MatOption op,PetscBool flg)930 PetscErrorCode MatSetOption_SeqSELL(Mat A, MatOption op, PetscBool flg)
931 {
932   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
933 
934   PetscFunctionBegin;
935   switch (op) {
936   case MAT_ROW_ORIENTED:
937     a->roworiented = flg;
938     break;
939   case MAT_KEEP_NONZERO_PATTERN:
940     a->keepnonzeropattern = flg;
941     break;
942   case MAT_NEW_NONZERO_LOCATIONS:
943     a->nonew = (flg ? 0 : 1);
944     break;
945   case MAT_NEW_NONZERO_LOCATION_ERR:
946     a->nonew = (flg ? -1 : 0);
947     break;
948   case MAT_NEW_NONZERO_ALLOCATION_ERR:
949     a->nonew = (flg ? -2 : 0);
950     break;
951   case MAT_UNUSED_NONZERO_LOCATION_ERR:
952     a->nounused = (flg ? -1 : 0);
953     break;
954   default:
955     break;
956   }
957   PetscFunctionReturn(PETSC_SUCCESS);
958 }
959 
MatGetDiagonal_SeqSELL(Mat A,Vec v)960 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A, Vec v)
961 {
962   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
963   PetscInt     i, j, n, shift;
964   PetscScalar *x, zero = 0.0;
965 
966   PetscFunctionBegin;
967   PetscCall(VecGetLocalSize(v, &n));
968   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
969 
970   if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) {
971     const PetscInt *diag;
972 
973     PetscCall(MatGetDiagonalMarkers_SeqSELL(A, &diag, NULL));
974     PetscCall(VecGetArrayWrite(v, &x));
975     for (i = 0; i < n; i++) x[i] = 1.0 / a->val[diag[i]];
976     PetscCall(VecRestoreArrayWrite(v, &x));
977     PetscFunctionReturn(PETSC_SUCCESS);
978   }
979 
980   PetscCall(VecSet(v, zero));
981   PetscCall(VecGetArray(v, &x));
982   for (i = 0; i < n; i++) {                                     /* loop over rows */
983     shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
984     x[i]  = 0;
985     for (j = 0; j < a->rlen[i]; j++) {
986       if (a->colidx[shift + a->sliceheight * j] == i) {
987         x[i] = a->val[shift + a->sliceheight * j];
988         break;
989       }
990     }
991   }
992   PetscCall(VecRestoreArray(v, &x));
993   PetscFunctionReturn(PETSC_SUCCESS);
994 }
995 
MatDiagonalScale_SeqSELL(Mat A,Vec ll,Vec rr)996 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A, Vec ll, Vec rr)
997 {
998   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
999   const PetscScalar *l, *r;
1000   PetscInt           i, j, m, n, row;
1001 
1002   PetscFunctionBegin;
1003   if (ll) {
1004     /* The local size is used so that VecMPI can be passed to this routine
1005        by MatDiagonalScale_MPISELL */
1006     PetscCall(VecGetLocalSize(ll, &m));
1007     PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1008     PetscCall(VecGetArrayRead(ll, &l));
1009     for (i = 0; i < a->totalslices; i++) {                            /* loop over slices */
1010       if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */
1011         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) {
1012           if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= l[a->sliceheight * i + row];
1013         }
1014       } else {
1015         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) a->val[j] *= l[a->sliceheight * i + row];
1016       }
1017     }
1018     PetscCall(VecRestoreArrayRead(ll, &l));
1019     PetscCall(PetscLogFlops(a->nz));
1020   }
1021   if (rr) {
1022     PetscCall(VecGetLocalSize(rr, &n));
1023     PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
1024     PetscCall(VecGetArrayRead(rr, &r));
1025     for (i = 0; i < a->totalslices; i++) {                            /* loop over slices */
1026       if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */
1027         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) % a->sliceheight)) {
1028           if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= r[a->colidx[j]];
1029         }
1030       } else {
1031         for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j++) a->val[j] *= r[a->colidx[j]];
1032       }
1033     }
1034     PetscCall(VecRestoreArrayRead(rr, &r));
1035     PetscCall(PetscLogFlops(a->nz));
1036   }
1037 #if defined(PETSC_HAVE_CUPM)
1038   if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU;
1039 #endif
1040   PetscFunctionReturn(PETSC_SUCCESS);
1041 }
1042 
MatGetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],PetscScalar v[])1043 PetscErrorCode MatGetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[])
1044 {
1045   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1046   PetscInt    *cp, i, k, low, high, t, row, col, l;
1047   PetscInt     shift;
1048   MatScalar   *vp;
1049 
1050   PetscFunctionBegin;
1051   for (k = 0; k < m; k++) { /* loop over requested rows */
1052     row = im[k];
1053     if (row < 0) continue;
1054     PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
1055     shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight); /* starting index of the row */
1056     cp    = a->colidx + shift;                                        /* pointer to the row */
1057     vp    = a->val + shift;                                           /* pointer to the row */
1058     for (l = 0; l < n; l++) {                                         /* loop over requested columns */
1059       col = in[l];
1060       if (col < 0) continue;
1061       PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
1062       high = a->rlen[row];
1063       low  = 0; /* assume unsorted */
1064       while (high - low > 5) {
1065         t = (low + high) / 2;
1066         if (*(cp + a->sliceheight * t) > col) high = t;
1067         else low = t;
1068       }
1069       for (i = low; i < high; i++) {
1070         if (*(cp + a->sliceheight * i) > col) break;
1071         if (*(cp + a->sliceheight * i) == col) {
1072           *v++ = *(vp + a->sliceheight * i);
1073           goto finished;
1074         }
1075       }
1076       *v++ = 0.0;
1077     finished:;
1078     }
1079   }
1080   PetscFunctionReturn(PETSC_SUCCESS);
1081 }
1082 
MatView_SeqSELL_ASCII(Mat A,PetscViewer viewer)1083 static PetscErrorCode MatView_SeqSELL_ASCII(Mat A, PetscViewer viewer)
1084 {
1085   Mat_SeqSELL      *a = (Mat_SeqSELL *)A->data;
1086   PetscInt          i, j, m = A->rmap->n, shift;
1087   const char       *name;
1088   PetscViewerFormat format;
1089 
1090   PetscFunctionBegin;
1091   PetscCall(PetscViewerGetFormat(viewer, &format));
1092   if (format == PETSC_VIEWER_ASCII_MATLAB) {
1093     PetscInt nofinalvalue = 0;
1094     /*
1095     if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) nofinalvalue = 1;
1096     */
1097     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1098     PetscCall(PetscViewerASCIIPrintf(viewer, "%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n", m, A->cmap->n));
1099     PetscCall(PetscViewerASCIIPrintf(viewer, "%% Nonzeros = %" PetscInt_FMT " \n", a->nz));
1100 #if defined(PETSC_USE_COMPLEX)
1101     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",4);\n", a->nz + nofinalvalue));
1102 #else
1103     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",3);\n", a->nz + nofinalvalue));
1104 #endif
1105     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = [\n"));
1106 
1107     for (i = 0; i < m; i++) {
1108       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1109       for (j = 0; j < a->rlen[i]; j++) {
1110 #if defined(PETSC_USE_COMPLEX)
1111         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1112 #else
1113         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)a->val[shift + a->sliceheight * j]));
1114 #endif
1115       }
1116     }
1117     /*
1118     if (nofinalvalue) {
1119 #if defined(PETSC_USE_COMPLEX)
1120       PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n",m,A->cmap->n,0.,0.));
1121 #else
1122       PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n",m,A->cmap->n,0.0));
1123 #endif
1124     }
1125     */
1126     PetscCall(PetscObjectGetName((PetscObject)A, &name));
1127     PetscCall(PetscViewerASCIIPrintf(viewer, "];\n %s = spconvert(zzz);\n", name));
1128     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1129   } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) {
1130     PetscFunctionReturn(PETSC_SUCCESS);
1131   } else if (format == PETSC_VIEWER_ASCII_COMMON) {
1132     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1133     for (i = 0; i < m; i++) {
1134       PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1135       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1136       for (j = 0; j < a->rlen[i]; j++) {
1137 #if defined(PETSC_USE_COMPLEX)
1138         if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) {
1139           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1140         } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) {
1141           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1142         } else if (PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) {
1143           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j])));
1144         }
1145 #else
1146         if (a->val[shift + a->sliceheight * j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j]));
1147 #endif
1148       }
1149       PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1150     }
1151     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1152   } else if (format == PETSC_VIEWER_ASCII_DENSE) {
1153     PetscInt    cnt = 0, jcnt;
1154     PetscScalar value;
1155 #if defined(PETSC_USE_COMPLEX)
1156     PetscBool realonly = PETSC_TRUE;
1157     for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1158       if (PetscImaginaryPart(a->val[i]) != 0.0) {
1159         realonly = PETSC_FALSE;
1160         break;
1161       }
1162     }
1163 #endif
1164 
1165     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1166     for (i = 0; i < m; i++) {
1167       jcnt  = 0;
1168       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1169       for (j = 0; j < A->cmap->n; j++) {
1170         if (jcnt < a->rlen[i] && j == a->colidx[shift + a->sliceheight * j]) {
1171           value = a->val[cnt++];
1172           jcnt++;
1173         } else {
1174           value = 0.0;
1175         }
1176 #if defined(PETSC_USE_COMPLEX)
1177         if (realonly) {
1178           PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)PetscRealPart(value)));
1179         } else {
1180           PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e+%7.5e i ", (double)PetscRealPart(value), (double)PetscImaginaryPart(value)));
1181         }
1182 #else
1183         PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)value));
1184 #endif
1185       }
1186       PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1187     }
1188     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1189   } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) {
1190     PetscInt fshift = 1;
1191     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1192 #if defined(PETSC_USE_COMPLEX)
1193     PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate complex general\n"));
1194 #else
1195     PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate real general\n"));
1196 #endif
1197     PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz));
1198     for (i = 0; i < m; i++) {
1199       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1200       for (j = 0; j < a->rlen[i]; j++) {
1201 #if defined(PETSC_USE_COMPLEX)
1202         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1203 #else
1204         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)a->val[shift + a->sliceheight * j]));
1205 #endif
1206       }
1207     }
1208     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1209   } else if (format == PETSC_VIEWER_NATIVE) {
1210     for (i = 0; i < a->totalslices; i++) { /* loop over slices */
1211       PetscInt row;
1212       PetscCall(PetscViewerASCIIPrintf(viewer, "slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n", i, a->sliidx[i], a->sliidx[i + 1]));
1213       for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) {
1214 #if defined(PETSC_USE_COMPLEX)
1215         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1216           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1217         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1218           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), -(double)PetscImaginaryPart(a->val[j])));
1219         } else {
1220           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j])));
1221         }
1222 #else
1223         PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)a->val[j]));
1224 #endif
1225       }
1226     }
1227   } else {
1228     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1229     if (A->factortype) {
1230       for (i = 0; i < m; i++) {
1231         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1232         PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1233         /* L part */
1234         for (j = shift; j < a->diag[i]; j += a->sliceheight) {
1235 #if defined(PETSC_USE_COMPLEX)
1236           if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0) {
1237             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1238           } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0) {
1239             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j]))));
1240           } else {
1241             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j])));
1242           }
1243 #else
1244           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]));
1245 #endif
1246         }
1247         /* diagonal */
1248         j = a->diag[i];
1249 #if defined(PETSC_USE_COMPLEX)
1250         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1251           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)PetscImaginaryPart(1.0 / a->val[j])));
1252         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1253           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)(-PetscImaginaryPart(1.0 / a->val[j]))));
1254         } else {
1255           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j])));
1256         }
1257 #else
1258         PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)(1 / a->val[j])));
1259 #endif
1260 
1261         /* U part */
1262         for (j = a->diag[i] + 1; j < shift + a->sliceheight * a->rlen[i]; j += a->sliceheight) {
1263 #if defined(PETSC_USE_COMPLEX)
1264           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1265             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1266           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1267             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j]))));
1268           } else {
1269             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j])));
1270           }
1271 #else
1272           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]));
1273 #endif
1274         }
1275         PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1276       }
1277     } else {
1278       for (i = 0; i < m; i++) {
1279         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1280         PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1281         for (j = 0; j < a->rlen[i]; j++) {
1282 #if defined(PETSC_USE_COMPLEX)
1283           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1284             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1285           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1286             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1287           } else {
1288             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j])));
1289           }
1290 #else
1291           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j]));
1292 #endif
1293         }
1294         PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1295       }
1296     }
1297     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1298   }
1299   PetscCall(PetscViewerFlush(viewer));
1300   PetscFunctionReturn(PETSC_SUCCESS);
1301 }
1302 
1303 #include <petscdraw.h>
MatView_SeqSELL_Draw_Zoom(PetscDraw draw,void * Aa)1304 static PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw, void *Aa)
1305 {
1306   Mat               A = (Mat)Aa;
1307   Mat_SeqSELL      *a = (Mat_SeqSELL *)A->data;
1308   PetscInt          i, j, m = A->rmap->n, shift;
1309   int               color;
1310   PetscReal         xl, yl, xr, yr, x_l, x_r, y_l, y_r;
1311   PetscViewer       viewer;
1312   PetscViewerFormat format;
1313 
1314   PetscFunctionBegin;
1315   PetscCall(PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer));
1316   PetscCall(PetscViewerGetFormat(viewer, &format));
1317   PetscCall(PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr));
1318 
1319   /* loop over matrix elements drawing boxes */
1320 
1321   if (format != PETSC_VIEWER_DRAW_CONTOUR) {
1322     PetscDrawCollectiveBegin(draw);
1323     /* Blue for negative, Cyan for zero and  Red for positive */
1324     color = PETSC_DRAW_BLUE;
1325     for (i = 0; i < m; i++) {
1326       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1327       y_l   = m - i - 1.0;
1328       y_r   = y_l + 1.0;
1329       for (j = 0; j < a->rlen[i]; j++) {
1330         x_l = a->colidx[shift + a->sliceheight * j];
1331         x_r = x_l + 1.0;
1332         if (PetscRealPart(a->val[shift + a->sliceheight * j]) >= 0.) continue;
1333         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1334       }
1335     }
1336     color = PETSC_DRAW_CYAN;
1337     for (i = 0; i < m; i++) {
1338       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1339       y_l   = m - i - 1.0;
1340       y_r   = y_l + 1.0;
1341       for (j = 0; j < a->rlen[i]; j++) {
1342         x_l = a->colidx[shift + a->sliceheight * j];
1343         x_r = x_l + 1.0;
1344         if (a->val[shift + a->sliceheight * j] != 0.) continue;
1345         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1346       }
1347     }
1348     color = PETSC_DRAW_RED;
1349     for (i = 0; i < m; i++) {
1350       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1351       y_l   = m - i - 1.0;
1352       y_r   = y_l + 1.0;
1353       for (j = 0; j < a->rlen[i]; j++) {
1354         x_l = a->colidx[shift + a->sliceheight * j];
1355         x_r = x_l + 1.0;
1356         if (PetscRealPart(a->val[shift + a->sliceheight * j]) <= 0.) continue;
1357         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1358       }
1359     }
1360     PetscDrawCollectiveEnd(draw);
1361   } else {
1362     /* use contour shading to indicate magnitude of values */
1363     /* first determine max of all nonzero values */
1364     PetscReal minv = 0.0, maxv = 0.0;
1365     PetscInt  count = 0;
1366     PetscDraw popup;
1367     for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1368       if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]);
1369     }
1370     if (minv >= maxv) maxv = minv + PETSC_SMALL;
1371     PetscCall(PetscDrawGetPopup(draw, &popup));
1372     PetscCall(PetscDrawScalePopup(popup, minv, maxv));
1373 
1374     PetscDrawCollectiveBegin(draw);
1375     for (i = 0; i < m; i++) {
1376       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1377       y_l   = m - i - 1.0;
1378       y_r   = y_l + 1.0;
1379       for (j = 0; j < a->rlen[i]; j++) {
1380         x_l   = a->colidx[shift + a->sliceheight * j];
1381         x_r   = x_l + 1.0;
1382         color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]), minv, maxv);
1383         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1384         count++;
1385       }
1386     }
1387     PetscDrawCollectiveEnd(draw);
1388   }
1389   PetscFunctionReturn(PETSC_SUCCESS);
1390 }
1391 
1392 #include <petscdraw.h>
MatView_SeqSELL_Draw(Mat A,PetscViewer viewer)1393 static PetscErrorCode MatView_SeqSELL_Draw(Mat A, PetscViewer viewer)
1394 {
1395   PetscDraw draw;
1396   PetscReal xr, yr, xl, yl, h, w;
1397   PetscBool isnull;
1398 
1399   PetscFunctionBegin;
1400   PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw));
1401   PetscCall(PetscDrawIsNull(draw, &isnull));
1402   if (isnull) PetscFunctionReturn(PETSC_SUCCESS);
1403 
1404   xr = A->cmap->n;
1405   yr = A->rmap->n;
1406   h  = yr / 10.0;
1407   w  = xr / 10.0;
1408   xr += w;
1409   yr += h;
1410   xl = -w;
1411   yl = -h;
1412   PetscCall(PetscDrawSetCoordinates(draw, xl, yl, xr, yr));
1413   PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer));
1414   PetscCall(PetscDrawZoom(draw, MatView_SeqSELL_Draw_Zoom, A));
1415   PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL));
1416   PetscCall(PetscDrawSave(draw));
1417   PetscFunctionReturn(PETSC_SUCCESS);
1418 }
1419 
MatView_SeqSELL(Mat A,PetscViewer viewer)1420 PetscErrorCode MatView_SeqSELL(Mat A, PetscViewer viewer)
1421 {
1422   PetscBool isascii, isbinary, isdraw;
1423 
1424   PetscFunctionBegin;
1425   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &isascii));
1426   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
1427   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
1428   if (isascii) {
1429     PetscCall(MatView_SeqSELL_ASCII(A, viewer));
1430   } else if (isbinary) {
1431     /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */
1432   } else if (isdraw) PetscCall(MatView_SeqSELL_Draw(A, viewer));
1433   PetscFunctionReturn(PETSC_SUCCESS);
1434 }
1435 
MatAssemblyEnd_SeqSELL(Mat A,MatAssemblyType mode)1436 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A, MatAssemblyType mode)
1437 {
1438   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1439   PetscInt     i, shift, row_in_slice, row, nrow, *cp, lastcol, j, k;
1440   MatScalar   *vp;
1441 #if defined(PETSC_HAVE_CUPM)
1442   PetscInt totalchunks = 0;
1443 #endif
1444 
1445   PetscFunctionBegin;
1446   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
1447   /* To do: compress out the unused elements */
1448   PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n", A->rmap->n, A->cmap->n, a->maxallocmat, a->sliidx[a->totalslices], a->nz, a->sliidx[a->totalslices] - a->nz));
1449   PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n", a->reallocs));
1450   PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rlenmax));
1451   a->nonzerorowcnt = 0;
1452   /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */
1453   for (i = 0; i < a->totalslices; ++i) {
1454     shift = a->sliidx[i];                                                   /* starting index of the slice */
1455     cp    = PetscSafePointerPlusOffset(a->colidx, shift);                   /* pointer to the column indices of the slice */
1456     vp    = PetscSafePointerPlusOffset(a->val, shift);                      /* pointer to the nonzero values of the slice */
1457     for (row_in_slice = 0; row_in_slice < a->sliceheight; ++row_in_slice) { /* loop over rows in the slice */
1458       row  = a->sliceheight * i + row_in_slice;
1459       nrow = a->rlen[row]; /* number of nonzeros in row */
1460       /*
1461         Search for the nearest nonzero. Normally setting the index to zero may cause extra communication.
1462         But if the entire slice are empty, it is fine to use 0 since the index will not be loaded.
1463       */
1464       lastcol = 0;
1465       if (nrow > 0) { /* nonempty row */
1466         a->nonzerorowcnt++;
1467         lastcol = cp[a->sliceheight * (nrow - 1) + row_in_slice]; /* use the index from the last nonzero at current row */
1468       } else if (!row_in_slice) {                                 /* first row of the correct slice is empty */
1469         for (j = 1; j < a->sliceheight; j++) {
1470           if (a->rlen[a->sliceheight * i + j]) {
1471             lastcol = cp[j];
1472             break;
1473           }
1474         }
1475       } else {
1476         if (a->sliidx[i + 1] != shift) lastcol = cp[row_in_slice - 1]; /* use the index from the previous row */
1477       }
1478 
1479       for (k = nrow; k < (a->sliidx[i + 1] - shift) / a->sliceheight; ++k) {
1480         cp[a->sliceheight * k + row_in_slice] = lastcol;
1481         vp[a->sliceheight * k + row_in_slice] = (MatScalar)0;
1482       }
1483     }
1484   }
1485 
1486   A->info.mallocs += a->reallocs;
1487   a->reallocs = 0;
1488 
1489 #if defined(PETSC_HAVE_CUPM)
1490   if (!a->chunksize && a->totalslices) {
1491     a->chunksize = 64;
1492     while (a->chunksize < 1024 && 2 * a->chunksize <= a->sliidx[a->totalslices] / a->totalslices) a->chunksize *= 2;
1493     totalchunks = 1 + (a->sliidx[a->totalslices] - 1) / a->chunksize;
1494   }
1495   if (totalchunks != a->totalchunks) {
1496     PetscCall(PetscFree(a->chunk_slice_map));
1497     PetscCall(PetscMalloc1(totalchunks, &a->chunk_slice_map));
1498     a->totalchunks = totalchunks;
1499   }
1500   j = 0;
1501   for (i = 0; i < totalchunks; i++) {
1502     while (a->sliidx[j + 1] <= i * a->chunksize && j < a->totalslices) j++;
1503     a->chunk_slice_map[i] = j;
1504   }
1505 #endif
1506   PetscFunctionReturn(PETSC_SUCCESS);
1507 }
1508 
MatGetInfo_SeqSELL(Mat A,MatInfoType flag,MatInfo * info)1509 PetscErrorCode MatGetInfo_SeqSELL(Mat A, MatInfoType flag, MatInfo *info)
1510 {
1511   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1512 
1513   PetscFunctionBegin;
1514   info->block_size   = 1.0;
1515   info->nz_allocated = a->maxallocmat;
1516   info->nz_used      = a->sliidx[a->totalslices]; /* include padding zeros */
1517   info->nz_unneeded  = (a->maxallocmat - a->sliidx[a->totalslices]);
1518   info->assemblies   = A->num_ass;
1519   info->mallocs      = A->info.mallocs;
1520   info->memory       = 0; /* REVIEW ME */
1521   if (A->factortype) {
1522     info->fill_ratio_given  = A->info.fill_ratio_given;
1523     info->fill_ratio_needed = A->info.fill_ratio_needed;
1524     info->factor_mallocs    = A->info.factor_mallocs;
1525   } else {
1526     info->fill_ratio_given  = 0;
1527     info->fill_ratio_needed = 0;
1528     info->factor_mallocs    = 0;
1529   }
1530   PetscFunctionReturn(PETSC_SUCCESS);
1531 }
1532 
MatSetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is)1533 PetscErrorCode MatSetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is)
1534 {
1535   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1536   PetscInt     shift, i, k, l, low, high, t, ii, row, col, nrow;
1537   PetscInt    *cp, nonew = a->nonew, lastcol = -1;
1538   MatScalar   *vp, value;
1539 #if defined(PETSC_HAVE_CUPM)
1540   PetscBool inserted = PETSC_FALSE;
1541   PetscInt  mul      = DEVICE_MEM_ALIGN / a->sliceheight;
1542 #endif
1543 
1544   PetscFunctionBegin;
1545   for (k = 0; k < m; k++) { /* loop over added rows */
1546     row = im[k];
1547     if (row < 0) continue;
1548     PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
1549     shift = a->sliidx[row / a->sliceheight] + row % a->sliceheight; /* starting index of the row */
1550     cp    = a->colidx + shift;                                      /* pointer to the row */
1551     vp    = a->val + shift;                                         /* pointer to the row */
1552     nrow  = a->rlen[row];
1553     low   = 0;
1554     high  = nrow;
1555 
1556     for (l = 0; l < n; l++) { /* loop over added columns */
1557       col = in[l];
1558       if (col < 0) continue;
1559       PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
1560       if (a->roworiented) {
1561         value = v[l + k * n];
1562       } else {
1563         value = v[k + l * m];
1564       }
1565       if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue;
1566 
1567       /* search in this row for the specified column, i indicates the column to be set */
1568       if (col <= lastcol) low = 0;
1569       else high = nrow;
1570       lastcol = col;
1571       while (high - low > 5) {
1572         t = (low + high) / 2;
1573         if (*(cp + a->sliceheight * t) > col) high = t;
1574         else low = t;
1575       }
1576       for (i = low; i < high; i++) {
1577         if (*(cp + a->sliceheight * i) > col) break;
1578         if (*(cp + a->sliceheight * i) == col) {
1579           if (is == ADD_VALUES) *(vp + a->sliceheight * i) += value;
1580           else *(vp + a->sliceheight * i) = value;
1581 #if defined(PETSC_HAVE_CUPM)
1582           inserted = PETSC_TRUE;
1583 #endif
1584           low = i + 1;
1585           goto noinsert;
1586         }
1587       }
1588       if (value == 0.0 && a->ignorezeroentries) goto noinsert;
1589       if (nonew == 1) goto noinsert;
1590       PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col);
1591 #if defined(PETSC_HAVE_CUPM)
1592       MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar, mul);
1593 #else
1594       /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */
1595       MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar, 1);
1596 #endif
1597       /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */
1598       for (ii = nrow - 1; ii >= i; ii--) {
1599         *(cp + a->sliceheight * (ii + 1)) = *(cp + a->sliceheight * ii);
1600         *(vp + a->sliceheight * (ii + 1)) = *(vp + a->sliceheight * ii);
1601       }
1602       a->rlen[row]++;
1603       *(cp + a->sliceheight * i) = col;
1604       *(vp + a->sliceheight * i) = value;
1605       a->nz++;
1606 #if defined(PETSC_HAVE_CUPM)
1607       inserted = PETSC_TRUE;
1608 #endif
1609       low = i + 1;
1610       high++;
1611       nrow++;
1612     noinsert:;
1613     }
1614     a->rlen[row] = nrow;
1615   }
1616 #if defined(PETSC_HAVE_CUPM)
1617   if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && inserted) A->offloadmask = PETSC_OFFLOAD_CPU;
1618 #endif
1619   PetscFunctionReturn(PETSC_SUCCESS);
1620 }
1621 
MatCopy_SeqSELL(Mat A,Mat B,MatStructure str)1622 PetscErrorCode MatCopy_SeqSELL(Mat A, Mat B, MatStructure str)
1623 {
1624   PetscFunctionBegin;
1625   /* If the two matrices have the same copy implementation, use fast copy. */
1626   if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
1627     Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1628     Mat_SeqSELL *b = (Mat_SeqSELL *)B->data;
1629 
1630     PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices], PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Number of nonzeros in two matrices are different");
1631     PetscCall(PetscArraycpy(b->val, a->val, a->sliidx[a->totalslices]));
1632   } else {
1633     PetscCall(MatCopy_Basic(A, B, str));
1634   }
1635   PetscFunctionReturn(PETSC_SUCCESS);
1636 }
1637 
MatSetUp_SeqSELL(Mat A)1638 PetscErrorCode MatSetUp_SeqSELL(Mat A)
1639 {
1640   PetscFunctionBegin;
1641   PetscCall(MatSeqSELLSetPreallocation(A, PETSC_DEFAULT, NULL));
1642   PetscFunctionReturn(PETSC_SUCCESS);
1643 }
1644 
MatSeqSELLGetArray_SeqSELL(Mat A,PetscScalar * array[])1645 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A, PetscScalar *array[])
1646 {
1647   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1648 
1649   PetscFunctionBegin;
1650   *array = a->val;
1651   PetscFunctionReturn(PETSC_SUCCESS);
1652 }
1653 
MatSeqSELLRestoreArray_SeqSELL(Mat A,PetscScalar * array[])1654 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A, PetscScalar *array[])
1655 {
1656   PetscFunctionBegin;
1657   PetscFunctionReturn(PETSC_SUCCESS);
1658 }
1659 
MatScale_SeqSELL(Mat inA,PetscScalar alpha)1660 PetscErrorCode MatScale_SeqSELL(Mat inA, PetscScalar alpha)
1661 {
1662   Mat_SeqSELL *a      = (Mat_SeqSELL *)inA->data;
1663   MatScalar   *aval   = a->val;
1664   PetscScalar  oalpha = alpha;
1665   PetscBLASInt one    = 1, size;
1666 
1667   PetscFunctionBegin;
1668   PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices], &size));
1669   PetscCallBLAS("BLASscal", BLASscal_(&size, &oalpha, aval, &one));
1670   PetscCall(PetscLogFlops(a->nz));
1671 #if defined(PETSC_HAVE_CUPM)
1672   if (inA->offloadmask != PETSC_OFFLOAD_UNALLOCATED) inA->offloadmask = PETSC_OFFLOAD_CPU;
1673 #endif
1674   PetscFunctionReturn(PETSC_SUCCESS);
1675 }
1676 
MatShift_SeqSELL(Mat Y,PetscScalar a)1677 PetscErrorCode MatShift_SeqSELL(Mat Y, PetscScalar a)
1678 {
1679   Mat_SeqSELL *y = (Mat_SeqSELL *)Y->data;
1680 
1681   PetscFunctionBegin;
1682   if (!Y->preallocated || !y->nz) PetscCall(MatSeqSELLSetPreallocation(Y, 1, NULL));
1683   PetscCall(MatShift_Basic(Y, a));
1684   PetscFunctionReturn(PETSC_SUCCESS);
1685 }
1686 
MatSOR_SeqSELL(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)1687 PetscErrorCode MatSOR_SeqSELL(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
1688 {
1689   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
1690   PetscScalar       *x, sum, *t;
1691   const MatScalar   *idiag = NULL, *mdiag;
1692   const PetscScalar *b, *xb;
1693   PetscInt           n, m = A->rmap->n, i, j, shift;
1694   const PetscInt    *diag;
1695 
1696   PetscFunctionBegin;
1697   its = its * lits;
1698 
1699   PetscCall(MatInvertDiagonalForSOR_SeqSELL(A, omega, fshift));
1700   diag  = a->diag;
1701   t     = a->ssor_work;
1702   idiag = a->idiag;
1703   mdiag = a->mdiag;
1704 
1705   PetscCall(VecGetArray(xx, &x));
1706   PetscCall(VecGetArrayRead(bb, &b));
1707   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1708   PetscCheck(flag != SOR_APPLY_UPPER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_UPPER is not implemented");
1709   PetscCheck(flag != SOR_APPLY_LOWER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_LOWER is not implemented");
1710   PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat");
1711 
1712   if (flag & SOR_ZERO_INITIAL_GUESS) {
1713     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1714       for (i = 0; i < m; i++) {
1715         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1716         sum   = b[i];
1717         n     = (diag[i] - shift) / a->sliceheight;
1718         for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]];
1719         t[i] = sum;
1720         x[i] = sum * idiag[i];
1721       }
1722       xb = t;
1723       PetscCall(PetscLogFlops(a->nz));
1724     } else xb = b;
1725     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1726       for (i = m - 1; i >= 0; i--) {
1727         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1728         sum   = xb[i];
1729         n     = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1;
1730         for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]];
1731         if (xb == b) {
1732           x[i] = sum * idiag[i];
1733         } else {
1734           x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1735         }
1736       }
1737       PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1738     }
1739     its--;
1740   }
1741   while (its--) {
1742     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1743       for (i = 0; i < m; i++) {
1744         /* lower */
1745         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1746         sum   = b[i];
1747         n     = (diag[i] - shift) / a->sliceheight;
1748         for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]];
1749         t[i] = sum; /* save application of the lower-triangular part */
1750         /* upper */
1751         n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1;
1752         for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]];
1753         x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1754       }
1755       xb = t;
1756       PetscCall(PetscLogFlops(2.0 * a->nz));
1757     } else xb = b;
1758     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1759       for (i = m - 1; i >= 0; i--) {
1760         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1761         sum   = xb[i];
1762         if (xb == b) {
1763           /* whole matrix (no checkpointing available) */
1764           n = a->rlen[i];
1765           for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]];
1766           x[i] = (1. - omega) * x[i] + (sum + mdiag[i] * x[i]) * idiag[i];
1767         } else { /* lower-triangular part has been saved, so only apply upper-triangular */
1768           n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1;
1769           for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]];
1770           x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1771         }
1772       }
1773       if (xb == b) {
1774         PetscCall(PetscLogFlops(2.0 * a->nz));
1775       } else {
1776         PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1777       }
1778     }
1779   }
1780   PetscCall(VecRestoreArray(xx, &x));
1781   PetscCall(VecRestoreArrayRead(bb, &b));
1782   PetscFunctionReturn(PETSC_SUCCESS);
1783 }
1784 
1785 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL,
1786                                        MatGetRow_SeqSELL,
1787                                        MatRestoreRow_SeqSELL,
1788                                        MatMult_SeqSELL,
1789                                        /* 4*/ MatMultAdd_SeqSELL,
1790                                        MatMultTranspose_SeqSELL,
1791                                        MatMultTransposeAdd_SeqSELL,
1792                                        NULL,
1793                                        NULL,
1794                                        NULL,
1795                                        /* 10*/ NULL,
1796                                        NULL,
1797                                        NULL,
1798                                        MatSOR_SeqSELL,
1799                                        NULL,
1800                                        /* 15*/ MatGetInfo_SeqSELL,
1801                                        MatEqual_SeqSELL,
1802                                        MatGetDiagonal_SeqSELL,
1803                                        MatDiagonalScale_SeqSELL,
1804                                        NULL,
1805                                        /* 20*/ NULL,
1806                                        MatAssemblyEnd_SeqSELL,
1807                                        MatSetOption_SeqSELL,
1808                                        MatZeroEntries_SeqSELL,
1809                                        /* 24*/ NULL,
1810                                        NULL,
1811                                        NULL,
1812                                        NULL,
1813                                        NULL,
1814                                        /* 29*/ MatSetUp_SeqSELL,
1815                                        NULL,
1816                                        NULL,
1817                                        NULL,
1818                                        NULL,
1819                                        /* 34*/ MatDuplicate_SeqSELL,
1820                                        NULL,
1821                                        NULL,
1822                                        NULL,
1823                                        NULL,
1824                                        /* 39*/ NULL,
1825                                        NULL,
1826                                        NULL,
1827                                        MatGetValues_SeqSELL,
1828                                        MatCopy_SeqSELL,
1829                                        /* 44*/ NULL,
1830                                        MatScale_SeqSELL,
1831                                        MatShift_SeqSELL,
1832                                        NULL,
1833                                        NULL,
1834                                        /* 49*/ NULL,
1835                                        NULL,
1836                                        NULL,
1837                                        NULL,
1838                                        NULL,
1839                                        /* 54*/ MatFDColoringCreate_SeqXAIJ,
1840                                        NULL,
1841                                        NULL,
1842                                        NULL,
1843                                        NULL,
1844                                        /* 59*/ NULL,
1845                                        MatDestroy_SeqSELL,
1846                                        MatView_SeqSELL,
1847                                        NULL,
1848                                        NULL,
1849                                        /* 64*/ NULL,
1850                                        NULL,
1851                                        NULL,
1852                                        NULL,
1853                                        NULL,
1854                                        /* 69*/ NULL,
1855                                        NULL,
1856                                        NULL,
1857                                        MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */
1858                                        NULL,
1859                                        /* 74*/ NULL,
1860                                        NULL,
1861                                        NULL,
1862                                        NULL,
1863                                        NULL,
1864                                        /* 79*/ NULL,
1865                                        NULL,
1866                                        NULL,
1867                                        NULL,
1868                                        NULL,
1869                                        /* 84*/ NULL,
1870                                        NULL,
1871                                        NULL,
1872                                        NULL,
1873                                        NULL,
1874                                        /* 89*/ NULL,
1875                                        NULL,
1876                                        NULL,
1877                                        NULL,
1878                                        MatConjugate_SeqSELL,
1879                                        /* 94*/ NULL,
1880                                        NULL,
1881                                        NULL,
1882                                        NULL,
1883                                        NULL,
1884                                        /* 99*/ NULL,
1885                                        NULL,
1886                                        NULL,
1887                                        NULL,
1888                                        NULL,
1889                                        /*104*/ NULL,
1890                                        NULL,
1891                                        NULL,
1892                                        NULL,
1893                                        NULL,
1894                                        /*109*/ NULL,
1895                                        NULL,
1896                                        NULL,
1897                                        NULL,
1898                                        NULL,
1899                                        /*114*/ NULL,
1900                                        NULL,
1901                                        NULL,
1902                                        NULL,
1903                                        NULL,
1904                                        /*119*/ NULL,
1905                                        NULL,
1906                                        NULL,
1907                                        NULL,
1908                                        NULL,
1909                                        /*124*/ NULL,
1910                                        NULL,
1911                                        NULL,
1912                                        NULL,
1913                                        MatFDColoringSetUp_SeqXAIJ,
1914                                        /*129*/ NULL,
1915                                        NULL,
1916                                        NULL,
1917                                        NULL,
1918                                        NULL,
1919                                        /*134*/ NULL,
1920                                        NULL,
1921                                        NULL,
1922                                        NULL,
1923                                        NULL,
1924                                        /*139*/ NULL,
1925                                        NULL,
1926                                        NULL,
1927                                        NULL,
1928                                        NULL};
1929 
MatStoreValues_SeqSELL(Mat mat)1930 static PetscErrorCode MatStoreValues_SeqSELL(Mat mat)
1931 {
1932   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1933 
1934   PetscFunctionBegin;
1935   PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1936 
1937   /* allocate space for values if not already there */
1938   if (!a->saved_values) PetscCall(PetscMalloc1(a->sliidx[a->totalslices] + 1, &a->saved_values));
1939 
1940   /* copy values over */
1941   PetscCall(PetscArraycpy(a->saved_values, a->val, a->sliidx[a->totalslices]));
1942   PetscFunctionReturn(PETSC_SUCCESS);
1943 }
1944 
MatRetrieveValues_SeqSELL(Mat mat)1945 static PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat)
1946 {
1947   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1948 
1949   PetscFunctionBegin;
1950   PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1951   PetscCheck(a->saved_values, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatStoreValues(A);first");
1952   PetscCall(PetscArraycpy(a->val, a->saved_values, a->sliidx[a->totalslices]));
1953   PetscFunctionReturn(PETSC_SUCCESS);
1954 }
1955 
MatSeqSELLGetFillRatio_SeqSELL(Mat mat,PetscReal * ratio)1956 static PetscErrorCode MatSeqSELLGetFillRatio_SeqSELL(Mat mat, PetscReal *ratio)
1957 {
1958   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1959 
1960   PetscFunctionBegin;
1961   if (a->totalslices && a->sliidx[a->totalslices]) {
1962     *ratio = (PetscReal)(a->sliidx[a->totalslices] - a->nz) / a->sliidx[a->totalslices];
1963   } else {
1964     *ratio = 0.0;
1965   }
1966   PetscFunctionReturn(PETSC_SUCCESS);
1967 }
1968 
MatSeqSELLGetMaxSliceWidth_SeqSELL(Mat mat,PetscInt * slicewidth)1969 static PetscErrorCode MatSeqSELLGetMaxSliceWidth_SeqSELL(Mat mat, PetscInt *slicewidth)
1970 {
1971   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1972   PetscInt     i, current_slicewidth;
1973 
1974   PetscFunctionBegin;
1975   *slicewidth = 0;
1976   for (i = 0; i < a->totalslices; i++) {
1977     current_slicewidth = (a->sliidx[i + 1] - a->sliidx[i]) / a->sliceheight;
1978     if (current_slicewidth > *slicewidth) *slicewidth = current_slicewidth;
1979   }
1980   PetscFunctionReturn(PETSC_SUCCESS);
1981 }
1982 
MatSeqSELLGetAvgSliceWidth_SeqSELL(Mat mat,PetscReal * slicewidth)1983 static PetscErrorCode MatSeqSELLGetAvgSliceWidth_SeqSELL(Mat mat, PetscReal *slicewidth)
1984 {
1985   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1986 
1987   PetscFunctionBegin;
1988   *slicewidth = 0;
1989   if (a->totalslices) *slicewidth = (PetscReal)a->sliidx[a->totalslices] / a->sliceheight / a->totalslices;
1990   PetscFunctionReturn(PETSC_SUCCESS);
1991 }
1992 
MatSeqSELLGetVarSliceSize_SeqSELL(Mat mat,PetscReal * variance)1993 static PetscErrorCode MatSeqSELLGetVarSliceSize_SeqSELL(Mat mat, PetscReal *variance)
1994 {
1995   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1996   PetscReal    mean;
1997   PetscInt     i, totalslices = a->totalslices, *sliidx = a->sliidx;
1998 
1999   PetscFunctionBegin;
2000   *variance = 0;
2001   if (totalslices) {
2002     mean = (PetscReal)sliidx[totalslices] / totalslices;
2003     for (i = 1; i <= totalslices; i++) *variance += ((PetscReal)(sliidx[i] - sliidx[i - 1]) - mean) * ((PetscReal)(sliidx[i] - sliidx[i - 1]) - mean) / totalslices;
2004   }
2005   PetscFunctionReturn(PETSC_SUCCESS);
2006 }
2007 
MatSeqSELLSetSliceHeight_SeqSELL(Mat A,PetscInt sliceheight)2008 static PetscErrorCode MatSeqSELLSetSliceHeight_SeqSELL(Mat A, PetscInt sliceheight)
2009 {
2010   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2011 
2012   PetscFunctionBegin;
2013   if (A->preallocated) PetscFunctionReturn(PETSC_SUCCESS);
2014   PetscCheck(a->sliceheight <= 0 || a->sliceheight == sliceheight, PETSC_COMM_SELF, PETSC_ERR_SUP, "Cannot change slice height %" PetscInt_FMT " to %" PetscInt_FMT, a->sliceheight, sliceheight);
2015   a->sliceheight = sliceheight;
2016 #if defined(PETSC_HAVE_CUPM)
2017   PetscCheck(PetscMax(DEVICE_MEM_ALIGN, sliceheight) % PetscMin(DEVICE_MEM_ALIGN, sliceheight) == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The slice height is not compatible with DEVICE_MEM_ALIGN (one must be divisible by the other) %" PetscInt_FMT, sliceheight);
2018 #endif
2019   PetscFunctionReturn(PETSC_SUCCESS);
2020 }
2021 
2022 /*@
2023   MatSeqSELLGetFillRatio - returns a ratio that indicates the irregularity of the matrix.
2024 
2025   Not Collective
2026 
2027   Input Parameter:
2028 . A - a MATSEQSELL matrix
2029 
2030   Output Parameter:
2031 . ratio - ratio of number of padded zeros to number of allocated elements
2032 
2033   Level: intermediate
2034 
2035 .seealso: `MATSEQSELL`, `MatSeqSELLGetAvgSliceWidth()`
2036 @*/
MatSeqSELLGetFillRatio(Mat A,PetscReal * ratio)2037 PetscErrorCode MatSeqSELLGetFillRatio(Mat A, PetscReal *ratio)
2038 {
2039   PetscFunctionBegin;
2040   PetscUseMethod(A, "MatSeqSELLGetFillRatio_C", (Mat, PetscReal *), (A, ratio));
2041   PetscFunctionReturn(PETSC_SUCCESS);
2042 }
2043 
2044 /*@
2045   MatSeqSELLGetMaxSliceWidth - returns the maximum slice width.
2046 
2047   Not Collective
2048 
2049   Input Parameter:
2050 . A - a MATSEQSELL matrix
2051 
2052   Output Parameter:
2053 . slicewidth - maximum slice width
2054 
2055   Level: intermediate
2056 
2057 .seealso: `MATSEQSELL`, `MatSeqSELLGetAvgSliceWidth()`
2058 @*/
MatSeqSELLGetMaxSliceWidth(Mat A,PetscInt * slicewidth)2059 PetscErrorCode MatSeqSELLGetMaxSliceWidth(Mat A, PetscInt *slicewidth)
2060 {
2061   PetscFunctionBegin;
2062   PetscUseMethod(A, "MatSeqSELLGetMaxSliceWidth_C", (Mat, PetscInt *), (A, slicewidth));
2063   PetscFunctionReturn(PETSC_SUCCESS);
2064 }
2065 
2066 /*@
2067   MatSeqSELLGetAvgSliceWidth - returns the average slice width.
2068 
2069   Not Collective
2070 
2071   Input Parameter:
2072 . A - a MATSEQSELL matrix
2073 
2074   Output Parameter:
2075 . slicewidth - average slice width
2076 
2077   Level: intermediate
2078 
2079 .seealso: `MATSEQSELL`, `MatSeqSELLGetMaxSliceWidth()`
2080 @*/
MatSeqSELLGetAvgSliceWidth(Mat A,PetscReal * slicewidth)2081 PetscErrorCode MatSeqSELLGetAvgSliceWidth(Mat A, PetscReal *slicewidth)
2082 {
2083   PetscFunctionBegin;
2084   PetscUseMethod(A, "MatSeqSELLGetAvgSliceWidth_C", (Mat, PetscReal *), (A, slicewidth));
2085   PetscFunctionReturn(PETSC_SUCCESS);
2086 }
2087 
2088 /*@
2089   MatSeqSELLSetSliceHeight - sets the slice height.
2090 
2091   Not Collective
2092 
2093   Input Parameters:
2094 + A           - a MATSEQSELL matrix
2095 - sliceheight - slice height
2096 
2097   Notes:
2098   You cannot change the slice height once it have been set.
2099 
2100   The slice height must be set before MatSetUp() or MatXXXSetPreallocation() is called.
2101 
2102   Level: intermediate
2103 
2104 .seealso: `MATSEQSELL`, `MatSeqSELLGetVarSliceSize()`
2105 @*/
MatSeqSELLSetSliceHeight(Mat A,PetscInt sliceheight)2106 PetscErrorCode MatSeqSELLSetSliceHeight(Mat A, PetscInt sliceheight)
2107 {
2108   PetscFunctionBegin;
2109   PetscUseMethod(A, "MatSeqSELLSetSliceHeight_C", (Mat, PetscInt), (A, sliceheight));
2110   PetscFunctionReturn(PETSC_SUCCESS);
2111 }
2112 
2113 /*@
2114   MatSeqSELLGetVarSliceSize - returns the variance of the slice size.
2115 
2116   Not Collective
2117 
2118   Input Parameter:
2119 . A - a MATSEQSELL matrix
2120 
2121   Output Parameter:
2122 . variance - variance of the slice size
2123 
2124   Level: intermediate
2125 
2126 .seealso: `MATSEQSELL`, `MatSeqSELLSetSliceHeight()`
2127 @*/
MatSeqSELLGetVarSliceSize(Mat A,PetscReal * variance)2128 PetscErrorCode MatSeqSELLGetVarSliceSize(Mat A, PetscReal *variance)
2129 {
2130   PetscFunctionBegin;
2131   PetscUseMethod(A, "MatSeqSELLGetVarSliceSize_C", (Mat, PetscReal *), (A, variance));
2132   PetscFunctionReturn(PETSC_SUCCESS);
2133 }
2134 
2135 #if defined(PETSC_HAVE_CUDA)
2136 PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat);
2137 #endif
2138 #if defined(PETSC_HAVE_HIP)
2139 PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLHIP(Mat);
2140 #endif
2141 
MatCreate_SeqSELL(Mat B)2142 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B)
2143 {
2144   Mat_SeqSELL *b;
2145   PetscMPIInt  size;
2146 
2147   PetscFunctionBegin;
2148   PetscCall(PetscCitationsRegister(citation, &cited));
2149   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
2150   PetscCheck(size <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Comm must be of size 1");
2151 
2152   PetscCall(PetscNew(&b));
2153 
2154   B->data   = (void *)b;
2155   B->ops[0] = MatOps_Values;
2156 
2157   b->row                = NULL;
2158   b->col                = NULL;
2159   b->icol               = NULL;
2160   b->reallocs           = 0;
2161   b->ignorezeroentries  = PETSC_FALSE;
2162   b->roworiented        = PETSC_TRUE;
2163   b->nonew              = 0;
2164   b->diag               = NULL;
2165   b->solve_work         = NULL;
2166   B->spptr              = NULL;
2167   b->saved_values       = NULL;
2168   b->idiag              = NULL;
2169   b->mdiag              = NULL;
2170   b->ssor_work          = NULL;
2171   b->omega              = 1.0;
2172   b->fshift             = 0.0;
2173   b->keepnonzeropattern = PETSC_FALSE;
2174   b->sliceheight        = 0;
2175 
2176   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELL));
2177   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetArray_C", MatSeqSELLGetArray_SeqSELL));
2178   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLRestoreArray_C", MatSeqSELLRestoreArray_SeqSELL));
2179   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqSELL));
2180   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqSELL));
2181   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetPreallocation_C", MatSeqSELLSetPreallocation_SeqSELL));
2182   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqaij_C", MatConvert_SeqSELL_SeqAIJ));
2183 #if defined(PETSC_HAVE_CUDA)
2184   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqsellcuda_C", MatConvert_SeqSELL_SeqSELLCUDA));
2185 #endif
2186 #if defined(PETSC_HAVE_HIP)
2187   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqsellhip_C", MatConvert_SeqSELL_SeqSELLHIP));
2188 #endif
2189   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetFillRatio_C", MatSeqSELLGetFillRatio_SeqSELL));
2190   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetMaxSliceWidth_C", MatSeqSELLGetMaxSliceWidth_SeqSELL));
2191   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetAvgSliceWidth_C", MatSeqSELLGetAvgSliceWidth_SeqSELL));
2192   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetVarSliceSize_C", MatSeqSELLGetVarSliceSize_SeqSELL));
2193   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetSliceHeight_C", MatSeqSELLSetSliceHeight_SeqSELL));
2194 
2195   PetscObjectOptionsBegin((PetscObject)B);
2196   {
2197     PetscInt  newsh = -1;
2198     PetscBool flg;
2199 #if defined(PETSC_HAVE_CUPM)
2200     PetscInt chunksize = 0;
2201 #endif
2202 
2203     PetscCall(PetscOptionsInt("-mat_sell_slice_height", "Set the slice height used to store SELL matrix", "MatSELLSetSliceHeight", newsh, &newsh, &flg));
2204     if (flg) PetscCall(MatSeqSELLSetSliceHeight(B, newsh));
2205 #if defined(PETSC_HAVE_CUPM)
2206     PetscCall(PetscOptionsInt("-mat_sell_chunk_size", "Set the chunksize for load-balanced CUDA/HIP kernels. Choices include 64,128,256,512,1024", NULL, chunksize, &chunksize, &flg));
2207     if (flg) {
2208       PetscCheck(chunksize >= 64 && chunksize <= 1024 && chunksize % 64 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "chunksize must be a number in {64,128,256,512,1024}: value %" PetscInt_FMT, chunksize);
2209       b->chunksize = chunksize;
2210     }
2211 #endif
2212   }
2213   PetscOptionsEnd();
2214   PetscFunctionReturn(PETSC_SUCCESS);
2215 }
2216 
2217 /*
2218  Given a matrix generated with MatGetFactor() duplicates all the information in A into B
2219  */
MatDuplicateNoCreate_SeqSELL(Mat C,Mat A,MatDuplicateOption cpvalues,PetscBool mallocmatspace)2220 static PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace)
2221 {
2222   Mat_SeqSELL *c = (Mat_SeqSELL *)C->data, *a = (Mat_SeqSELL *)A->data;
2223   PetscInt     i, m                           = A->rmap->n;
2224   PetscInt     totalslices = a->totalslices;
2225 
2226   PetscFunctionBegin;
2227   C->factortype = A->factortype;
2228   c->row        = NULL;
2229   c->col        = NULL;
2230   c->icol       = NULL;
2231   c->reallocs   = 0;
2232   C->assembled  = PETSC_TRUE;
2233 
2234   PetscCall(PetscLayoutReference(A->rmap, &C->rmap));
2235   PetscCall(PetscLayoutReference(A->cmap, &C->cmap));
2236 
2237   c->sliceheight = a->sliceheight;
2238   PetscCall(PetscMalloc1(c->sliceheight * totalslices, &c->rlen));
2239   PetscCall(PetscMalloc1(totalslices + 1, &c->sliidx));
2240 
2241   for (i = 0; i < m; i++) c->rlen[i] = a->rlen[i];
2242   for (i = 0; i < totalslices + 1; i++) c->sliidx[i] = a->sliidx[i];
2243 
2244   /* allocate the matrix space */
2245   if (mallocmatspace) {
2246     PetscCall(PetscMalloc2(a->maxallocmat, &c->val, a->maxallocmat, &c->colidx));
2247 
2248     c->singlemalloc = PETSC_TRUE;
2249 
2250     if (m > 0) {
2251       PetscCall(PetscArraycpy(c->colidx, a->colidx, a->maxallocmat));
2252       if (cpvalues == MAT_COPY_VALUES) {
2253         PetscCall(PetscArraycpy(c->val, a->val, a->maxallocmat));
2254       } else {
2255         PetscCall(PetscArrayzero(c->val, a->maxallocmat));
2256       }
2257     }
2258   }
2259 
2260   c->ignorezeroentries  = a->ignorezeroentries;
2261   c->roworiented        = a->roworiented;
2262   c->nonew              = a->nonew;
2263   c->solve_work         = NULL;
2264   c->saved_values       = NULL;
2265   c->idiag              = NULL;
2266   c->ssor_work          = NULL;
2267   c->keepnonzeropattern = a->keepnonzeropattern;
2268   c->free_val           = PETSC_TRUE;
2269   c->free_colidx        = PETSC_TRUE;
2270 
2271   c->maxallocmat  = a->maxallocmat;
2272   c->maxallocrow  = a->maxallocrow;
2273   c->rlenmax      = a->rlenmax;
2274   c->nz           = a->nz;
2275   C->preallocated = PETSC_TRUE;
2276 
2277   c->nonzerorowcnt = a->nonzerorowcnt;
2278   C->nonzerostate  = A->nonzerostate;
2279 
2280   PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist));
2281   PetscFunctionReturn(PETSC_SUCCESS);
2282 }
2283 
MatDuplicate_SeqSELL(Mat A,MatDuplicateOption cpvalues,Mat * B)2284 PetscErrorCode MatDuplicate_SeqSELL(Mat A, MatDuplicateOption cpvalues, Mat *B)
2285 {
2286   PetscFunctionBegin;
2287   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
2288   PetscCall(MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n));
2289   if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2290   PetscCall(MatSetType(*B, ((PetscObject)A)->type_name));
2291   PetscCall(MatDuplicateNoCreate_SeqSELL(*B, A, cpvalues, PETSC_TRUE));
2292   PetscFunctionReturn(PETSC_SUCCESS);
2293 }
2294 
2295 /*MC
2296    MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices,
2297    based on the sliced Ellpack format, {cite}`zhangellpack2018`
2298 
2299    Options Database Key:
2300 . -mat_type seqsell - sets the matrix type to "`MATSEQELL` during a call to `MatSetFromOptions()`
2301 
2302    Level: beginner
2303 
2304 .seealso: `Mat`, `MatCreateSeqSELL()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ`
2305 M*/
2306 
2307 /*MC
2308    MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices, {cite}`zhangellpack2018`
2309 
2310    This matrix type is identical to `MATSEQSELL` when constructed with a single process communicator,
2311    and `MATMPISELL` otherwise.  As a result, for single process communicators,
2312   `MatSeqSELLSetPreallocation()` is supported, and similarly `MatMPISELLSetPreallocation()` is supported
2313   for communicators controlling multiple processes.  It is recommended that you call both of
2314   the above preallocation routines for simplicity.
2315 
2316    Options Database Key:
2317 . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions()
2318 
2319   Level: beginner
2320 
2321   Notes:
2322   This format is only supported for real scalars, double precision, and 32-bit indices (the defaults).
2323 
2324   It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of
2325   non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement.
2326 
2327   Developer Notes:
2328   On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance.
2329 
2330   The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8
2331 .vb
2332                             (2 0  3 4)
2333    Consider the matrix A =  (5 0  6 0)
2334                             (0 0  7 8)
2335                             (0 0  9 9)
2336 
2337    symbolically the Ellpack format can be written as
2338 
2339         (2 3 4 |)           (0 2 3 |)
2340    v =  (5 6 0 |)  colidx = (0 2 2 |)
2341         --------            ---------
2342         (7 8 |)             (2 3 |)
2343         (9 9 |)             (2 3 |)
2344 
2345     The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case).
2346     Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and
2347     zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice.
2348 
2349     The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9)  and for colidx is (0 0 2 2 3 2 2 2 3 3)
2350 
2351 .ve
2352 
2353     See `MatMult_SeqSELL()` for how this format is used with the SIMD operations to achieve high performance.
2354 
2355 .seealso: `Mat`, `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSELL()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ`
2356 M*/
2357 
2358 /*@
2359   MatCreateSeqSELL - Creates a sparse matrix in `MATSEQSELL` format.
2360 
2361   Collective
2362 
2363   Input Parameters:
2364 + comm    - MPI communicator, set to `PETSC_COMM_SELF`
2365 . m       - number of rows
2366 . n       - number of columns
2367 . rlenmax - maximum number of nonzeros in a row, ignored if `rlen` is provided
2368 - rlen    - array containing the number of nonzeros in the various rows (possibly different for each row) or NULL
2369 
2370   Output Parameter:
2371 . A - the matrix
2372 
2373   Level: intermediate
2374 
2375   Notes:
2376   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
2377   MatXXXXSetPreallocation() paradigm instead of this routine directly.
2378   [MatXXXXSetPreallocation() is, for example, `MatSeqSELLSetPreallocation()`]
2379 
2380   Specify the preallocated storage with either `rlenmax` or `rlen` (not both).
2381   Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory
2382   allocation.
2383 
2384 .seealso: `Mat`, `MATSEQSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATMPISELL`
2385  @*/
MatCreateSeqSELL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt rlenmax,const PetscInt rlen[],Mat * A)2386 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt rlenmax, const PetscInt rlen[], Mat *A)
2387 {
2388   PetscFunctionBegin;
2389   PetscCall(MatCreate(comm, A));
2390   PetscCall(MatSetSizes(*A, m, n, m, n));
2391   PetscCall(MatSetType(*A, MATSEQSELL));
2392   PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A, rlenmax, rlen));
2393   PetscFunctionReturn(PETSC_SUCCESS);
2394 }
2395 
MatEqual_SeqSELL(Mat A,Mat B,PetscBool * flg)2396 PetscErrorCode MatEqual_SeqSELL(Mat A, Mat B, PetscBool *flg)
2397 {
2398   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data, *b = (Mat_SeqSELL *)B->data;
2399   PetscInt     totalslices = a->totalslices;
2400 
2401   PetscFunctionBegin;
2402   /* If the  matrix dimensions are not equal,or no of nonzeros */
2403   if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) || (a->nz != b->nz) || (a->rlenmax != b->rlenmax)) {
2404     *flg = PETSC_FALSE;
2405     PetscFunctionReturn(PETSC_SUCCESS);
2406   }
2407   /* if the a->colidx are the same */
2408   PetscCall(PetscArraycmp(a->colidx, b->colidx, a->sliidx[totalslices], flg));
2409   if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
2410   /* if a->val are the same */
2411   PetscCall(PetscArraycmp(a->val, b->val, a->sliidx[totalslices], flg));
2412   PetscFunctionReturn(PETSC_SUCCESS);
2413 }
2414 
MatConjugate_SeqSELL(Mat A)2415 PetscErrorCode MatConjugate_SeqSELL(Mat A)
2416 {
2417   Mat_SeqSELL *a   = (Mat_SeqSELL *)A->data;
2418   PetscScalar *val = a->val;
2419 
2420   PetscFunctionBegin;
2421   for (PetscInt i = 0; i < a->sliidx[a->totalslices]; i++) val[i] = PetscConj(val[i]);
2422 #if defined(PETSC_HAVE_CUPM)
2423   if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU;
2424 #endif
2425   PetscFunctionReturn(PETSC_SUCCESS);
2426 }
2427