1 /*
2 Defines the basic matrix operations for the SELL matrix storage format.
3 */
4 #include <../src/mat/impls/sell/seq/sell.h> /*I "petscmat.h" I*/
5 #include <petscblaslapack.h>
6 #include <petsc/private/kernels/blocktranspose.h>
7
8 static PetscBool cited = PETSC_FALSE;
9 static const char citation[] = "@inproceedings{ZhangELLPACK2018,\n"
10 " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n"
11 " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n"
12 " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n"
13 " year = 2018\n"
14 "}\n";
15
16 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
17
18 #include <immintrin.h>
19
20 #if !defined(_MM_SCALE_8)
21 #define _MM_SCALE_8 8
22 #endif
23
24 #if defined(__AVX512F__)
25 /* these do not work
26 vec_idx = _mm512_loadunpackhi_epi32(vec_idx,acolidx);
27 vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval);
28 */
29 #define AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
30 /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \
31 vec_idx = _mm256_loadu_si256((__m256i const *)acolidx); \
32 vec_vals = _mm512_loadu_pd(aval); \
33 vec_x = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); \
34 vec_y = _mm512_fmadd_pd(vec_x, vec_vals, vec_y)
35 #elif defined(__AVX2__) && defined(__FMA__)
36 #define AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
37 vec_vals = _mm256_loadu_pd(aval); \
38 vec_idx = _mm_loadu_si128((__m128i const *)acolidx); /* SSE2 */ \
39 vec_x = _mm256_i32gather_pd(x, vec_idx, _MM_SCALE_8); \
40 vec_y = _mm256_fmadd_pd(vec_x, vec_vals, vec_y)
41 #endif
42 #endif /* PETSC_HAVE_IMMINTRIN_H */
43
44 /*@
45 MatSeqSELLSetPreallocation - For good matrix assembly performance
46 the user should preallocate the matrix storage by setting the parameter `nz`
47 (or the array `nnz`).
48
49 Collective
50
51 Input Parameters:
52 + B - The `MATSEQSELL` matrix
53 . rlenmax - number of nonzeros per row (same for all rows), ignored if `rlen` is provided
54 - rlen - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
55
56 Level: intermediate
57
58 Notes:
59 Specify the preallocated storage with either `rlenmax` or `rlen` (not both).
60 Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory
61 allocation.
62
63 You can call `MatGetInfo()` to get information on how effective the preallocation was;
64 for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
65 You can also run with the option `-info` and look for messages with the string
66 malloc in them to see if additional memory allocation was needed.
67
68 Developer Notes:
69 Use `rlenmax` of `MAT_SKIP_ALLOCATION` to not allocate any space for the matrix
70 entries or columns indices.
71
72 The maximum number of nonzeos in any row should be as accurate as possible.
73 If it is underestimated, you will get bad performance due to reallocation
74 (`MatSeqXSELLReallocateSELL()`).
75
76 .seealso: `Mat`, `MATSEQSELL`, `MATSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()`
77 @*/
MatSeqSELLSetPreallocation(Mat B,PetscInt rlenmax,const PetscInt rlen[])78 PetscErrorCode MatSeqSELLSetPreallocation(Mat B, PetscInt rlenmax, const PetscInt rlen[])
79 {
80 PetscFunctionBegin;
81 PetscValidHeaderSpecific(B, MAT_CLASSID, 1);
82 PetscValidType(B, 1);
83 PetscTryMethod(B, "MatSeqSELLSetPreallocation_C", (Mat, PetscInt, const PetscInt[]), (B, rlenmax, rlen));
84 PetscFunctionReturn(PETSC_SUCCESS);
85 }
86
MatSeqSELLSetPreallocation_SeqSELL(Mat B,PetscInt maxallocrow,const PetscInt rlen[])87 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B, PetscInt maxallocrow, const PetscInt rlen[])
88 {
89 Mat_SeqSELL *b;
90 PetscInt i, j, totalslices;
91 #if defined(PETSC_HAVE_CUPM)
92 PetscInt rlenmax = 0;
93 #endif
94 PetscBool skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE;
95
96 PetscFunctionBegin;
97 if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE;
98 if (maxallocrow == MAT_SKIP_ALLOCATION) {
99 skipallocation = PETSC_TRUE;
100 maxallocrow = 0;
101 }
102
103 PetscCall(PetscLayoutSetUp(B->rmap));
104 PetscCall(PetscLayoutSetUp(B->cmap));
105
106 /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */
107 if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5;
108 PetscCheck(maxallocrow >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "maxallocrow cannot be less than 0: value %" PetscInt_FMT, maxallocrow);
109 if (rlen) {
110 for (i = 0; i < B->rmap->n; i++) {
111 PetscCheck(rlen[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, rlen[i]);
112 PetscCheck(rlen[i] <= B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT, i, rlen[i], B->cmap->n);
113 }
114 }
115
116 B->preallocated = PETSC_TRUE;
117
118 b = (Mat_SeqSELL *)B->data;
119
120 if (!b->sliceheight) { /* not set yet */
121 #if defined(PETSC_HAVE_CUPM)
122 b->sliceheight = 16;
123 #else
124 b->sliceheight = 8;
125 #endif
126 }
127 totalslices = PetscCeilInt(B->rmap->n, b->sliceheight);
128 b->totalslices = totalslices;
129 if (!skipallocation) {
130 if (B->rmap->n % b->sliceheight) PetscCall(PetscInfo(B, "Padding rows to the SEQSELL matrix because the number of rows is not the multiple of the slice height (value %" PetscInt_FMT ")\n", B->rmap->n));
131
132 if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */
133 PetscCall(PetscMalloc1(totalslices + 1, &b->sliidx));
134 }
135 if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */
136 if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10;
137 else if (maxallocrow < 0) maxallocrow = 1;
138 #if defined(PETSC_HAVE_CUPM)
139 rlenmax = maxallocrow;
140 /* Pad the slice to DEVICE_MEM_ALIGN */
141 while (b->sliceheight * maxallocrow % DEVICE_MEM_ALIGN) maxallocrow++;
142 #endif
143 for (i = 0; i <= totalslices; i++) b->sliidx[i] = b->sliceheight * i * maxallocrow;
144 } else {
145 #if defined(PETSC_HAVE_CUPM)
146 PetscInt mul = DEVICE_MEM_ALIGN / b->sliceheight;
147 #endif
148 maxallocrow = 0;
149 b->sliidx[0] = 0;
150 for (i = 1; i < totalslices; i++) {
151 b->sliidx[i] = 0;
152 for (j = 0; j < b->sliceheight; j++) b->sliidx[i] = PetscMax(b->sliidx[i], rlen[b->sliceheight * (i - 1) + j]);
153 #if defined(PETSC_HAVE_CUPM)
154 if (mul != 0) { /* Pad the slice to DEVICE_MEM_ALIGN if sliceheight < DEVICE_MEM_ALIGN */
155 rlenmax = PetscMax(b->sliidx[i], rlenmax);
156 b->sliidx[i] = ((b->sliidx[i] - 1) / mul + 1) * mul;
157 }
158 #endif
159 maxallocrow = PetscMax(b->sliidx[i], maxallocrow);
160 PetscCall(PetscIntSumError(b->sliidx[i - 1], b->sliceheight * b->sliidx[i], &b->sliidx[i]));
161 }
162 /* last slice */
163 b->sliidx[totalslices] = 0;
164 for (j = b->sliceheight * (totalslices - 1); j < B->rmap->n; j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices], rlen[j]);
165 #if defined(PETSC_HAVE_CUPM)
166 if (mul != 0) {
167 rlenmax = PetscMax(b->sliidx[i], rlenmax);
168 b->sliidx[totalslices] = ((b->sliidx[totalslices] - 1) / mul + 1) * mul;
169 }
170 #endif
171 maxallocrow = PetscMax(b->sliidx[totalslices], maxallocrow);
172 b->sliidx[totalslices] = b->sliidx[totalslices - 1] + b->sliceheight * b->sliidx[totalslices];
173 }
174
175 /* allocate space for val, colidx, rlen */
176 /* FIXME: should B's old memory be unlogged? */
177 PetscCall(MatSeqXSELLFreeSELL(B, &b->val, &b->colidx));
178 /* FIXME: assuming an element of the bit array takes 8 bits */
179 PetscCall(PetscMalloc2(b->sliidx[totalslices], &b->val, b->sliidx[totalslices], &b->colidx));
180 /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */
181 PetscCall(PetscCalloc1(b->sliceheight * totalslices, &b->rlen));
182
183 b->singlemalloc = PETSC_TRUE;
184 b->free_val = PETSC_TRUE;
185 b->free_colidx = PETSC_TRUE;
186 } else {
187 b->free_val = PETSC_FALSE;
188 b->free_colidx = PETSC_FALSE;
189 }
190
191 b->nz = 0;
192 b->maxallocrow = maxallocrow;
193 #if defined(PETSC_HAVE_CUPM)
194 b->rlenmax = rlenmax;
195 #else
196 b->rlenmax = maxallocrow;
197 #endif
198 b->maxallocmat = b->sliidx[totalslices];
199 B->info.nz_unneeded = (double)b->maxallocmat;
200 if (realalloc) PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));
201 PetscFunctionReturn(PETSC_SUCCESS);
202 }
203
MatGetRow_SeqSELL(Mat A,PetscInt row,PetscInt * nz,PetscInt ** idx,PetscScalar ** v)204 static PetscErrorCode MatGetRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
205 {
206 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
207 PetscInt shift;
208
209 PetscFunctionBegin;
210 PetscCheck(row >= 0 && row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range", row);
211 if (nz) *nz = a->rlen[row];
212 shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight);
213 if (!a->getrowcols) PetscCall(PetscMalloc2(a->rlenmax, &a->getrowcols, a->rlenmax, &a->getrowvals));
214 if (idx) {
215 PetscInt j;
216 for (j = 0; j < a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift + a->sliceheight * j];
217 *idx = a->getrowcols;
218 }
219 if (v) {
220 PetscInt j;
221 for (j = 0; j < a->rlen[row]; j++) a->getrowvals[j] = a->val[shift + a->sliceheight * j];
222 *v = a->getrowvals;
223 }
224 PetscFunctionReturn(PETSC_SUCCESS);
225 }
226
MatRestoreRow_SeqSELL(Mat A,PetscInt row,PetscInt * nz,PetscInt ** idx,PetscScalar ** v)227 static PetscErrorCode MatRestoreRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
228 {
229 PetscFunctionBegin;
230 PetscFunctionReturn(PETSC_SUCCESS);
231 }
232
MatConvert_SeqSELL_SeqAIJ(Mat A,MatType newtype,MatReuse reuse,Mat * newmat)233 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
234 {
235 Mat B;
236 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
237 PetscInt i;
238
239 PetscFunctionBegin;
240 if (reuse == MAT_REUSE_MATRIX) {
241 B = *newmat;
242 PetscCall(MatZeroEntries(B));
243 } else {
244 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
245 PetscCall(MatSetSizes(B, A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N));
246 PetscCall(MatSetType(B, MATSEQAIJ));
247 PetscCall(MatSeqAIJSetPreallocation(B, 0, a->rlen));
248 }
249
250 for (i = 0; i < A->rmap->n; i++) {
251 PetscInt nz = 0, *cols = NULL;
252 PetscScalar *vals = NULL;
253
254 PetscCall(MatGetRow_SeqSELL(A, i, &nz, &cols, &vals));
255 PetscCall(MatSetValues(B, 1, &i, nz, cols, vals, INSERT_VALUES));
256 PetscCall(MatRestoreRow_SeqSELL(A, i, &nz, &cols, &vals));
257 }
258
259 PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
260 PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
261 B->rmap->bs = A->rmap->bs;
262
263 if (reuse == MAT_INPLACE_MATRIX) {
264 PetscCall(MatHeaderReplace(A, &B));
265 } else {
266 *newmat = B;
267 }
268 PetscFunctionReturn(PETSC_SUCCESS);
269 }
270
271 #include <../src/mat/impls/aij/seq/aij.h>
272
MatConvert_SeqAIJ_SeqSELL(Mat A,MatType newtype,MatReuse reuse,Mat * newmat)273 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
274 {
275 Mat B;
276 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
277 PetscInt *ai = a->i, m = A->rmap->N, n = A->cmap->N, i, *rowlengths, row, ncols;
278 const PetscInt *cols;
279 const PetscScalar *vals;
280
281 PetscFunctionBegin;
282 if (reuse == MAT_REUSE_MATRIX) {
283 B = *newmat;
284 } else {
285 if (PetscDefined(USE_DEBUG) || !a->ilen) {
286 PetscCall(PetscMalloc1(m, &rowlengths));
287 for (i = 0; i < m; i++) rowlengths[i] = ai[i + 1] - ai[i];
288 }
289 if (PetscDefined(USE_DEBUG) && a->ilen) {
290 PetscBool eq;
291 PetscCall(PetscArraycmp(rowlengths, a->ilen, m, &eq));
292 PetscCheck(eq, PETSC_COMM_SELF, PETSC_ERR_PLIB, "SeqAIJ ilen array incorrect");
293 PetscCall(PetscFree(rowlengths));
294 rowlengths = a->ilen;
295 } else if (a->ilen) rowlengths = a->ilen;
296 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
297 PetscCall(MatSetSizes(B, m, n, m, n));
298 PetscCall(MatSetType(B, MATSEQSELL));
299 PetscCall(MatSeqSELLSetPreallocation(B, 0, rowlengths));
300 if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths));
301 }
302
303 for (row = 0; row < m; row++) {
304 PetscCall(MatGetRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
305 PetscCall(MatSetValues_SeqSELL(B, 1, &row, ncols, cols, vals, INSERT_VALUES));
306 PetscCall(MatRestoreRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
307 }
308 PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
309 PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
310 B->rmap->bs = A->rmap->bs;
311
312 if (reuse == MAT_INPLACE_MATRIX) {
313 PetscCall(MatHeaderReplace(A, &B));
314 } else {
315 *newmat = B;
316 }
317 PetscFunctionReturn(PETSC_SUCCESS);
318 }
319
MatMult_SeqSELL(Mat A,Vec xx,Vec yy)320 PetscErrorCode MatMult_SeqSELL(Mat A, Vec xx, Vec yy)
321 {
322 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
323 PetscScalar *y;
324 const PetscScalar *x;
325 const MatScalar *aval = a->val;
326 PetscInt totalslices = a->totalslices;
327 const PetscInt *acolidx = a->colidx;
328 PetscInt i, j;
329 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
330 __m512d vec_x, vec_y, vec_vals;
331 __m256i vec_idx;
332 __mmask8 mask;
333 __m512d vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
334 __m256i vec_idx2, vec_idx3, vec_idx4;
335 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
336 __m128i vec_idx;
337 __m256d vec_x, vec_y, vec_y2, vec_vals;
338 MatScalar yval;
339 PetscInt r, rows_left, row, nnz_in_row;
340 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
341 __m128d vec_x_tmp;
342 __m256d vec_x, vec_y, vec_y2, vec_vals;
343 MatScalar yval;
344 PetscInt r, rows_left, row, nnz_in_row;
345 #else
346 PetscInt k, sliceheight = a->sliceheight;
347 PetscScalar *sum;
348 #endif
349
350 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
351 #pragma disjoint(*x, *y, *aval)
352 #endif
353
354 PetscFunctionBegin;
355 PetscCall(VecGetArrayRead(xx, &x));
356 PetscCall(VecGetArray(yy, &y));
357 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
358 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
359 for (i = 0; i < totalslices; i++) { /* loop over slices */
360 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
361 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
362
363 vec_y = _mm512_setzero_pd();
364 vec_y2 = _mm512_setzero_pd();
365 vec_y3 = _mm512_setzero_pd();
366 vec_y4 = _mm512_setzero_pd();
367
368 j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */
369 switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
370 case 3:
371 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
372 acolidx += 8;
373 aval += 8;
374 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
375 acolidx += 8;
376 aval += 8;
377 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
378 acolidx += 8;
379 aval += 8;
380 j += 3;
381 break;
382 case 2:
383 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
384 acolidx += 8;
385 aval += 8;
386 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
387 acolidx += 8;
388 aval += 8;
389 j += 2;
390 break;
391 case 1:
392 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
393 acolidx += 8;
394 aval += 8;
395 j += 1;
396 break;
397 }
398 #pragma novector
399 for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
400 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
401 acolidx += 8;
402 aval += 8;
403 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
404 acolidx += 8;
405 aval += 8;
406 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
407 acolidx += 8;
408 aval += 8;
409 AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
410 acolidx += 8;
411 aval += 8;
412 }
413
414 vec_y = _mm512_add_pd(vec_y, vec_y2);
415 vec_y = _mm512_add_pd(vec_y, vec_y3);
416 vec_y = _mm512_add_pd(vec_y, vec_y4);
417 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
418 mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
419 _mm512_mask_storeu_pd(&y[8 * i], mask, vec_y);
420 } else {
421 _mm512_storeu_pd(&y[8 * i], vec_y);
422 }
423 }
424 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
425 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
426 for (i = 0; i < totalslices; i++) { /* loop over full slices */
427 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
428 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
429
430 /* last slice may have padding rows. Don't use vectorization. */
431 if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
432 rows_left = A->rmap->n - 8 * i;
433 for (r = 0; r < rows_left; ++r) {
434 yval = (MatScalar)0;
435 row = 8 * i + r;
436 nnz_in_row = a->rlen[row];
437 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
438 y[row] = yval;
439 }
440 break;
441 }
442
443 vec_y = _mm256_setzero_pd();
444 vec_y2 = _mm256_setzero_pd();
445
446 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
447 #pragma novector
448 #pragma unroll(2)
449 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
450 AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
451 aval += 4;
452 acolidx += 4;
453 AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y2);
454 aval += 4;
455 acolidx += 4;
456 }
457
458 _mm256_storeu_pd(y + i * 8, vec_y);
459 _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
460 }
461 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
462 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
463 for (i = 0; i < totalslices; i++) { /* loop over full slices */
464 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
465 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
466
467 vec_y = _mm256_setzero_pd();
468 vec_y2 = _mm256_setzero_pd();
469
470 /* last slice may have padding rows. Don't use vectorization. */
471 if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
472 rows_left = A->rmap->n - 8 * i;
473 for (r = 0; r < rows_left; ++r) {
474 yval = (MatScalar)0;
475 row = 8 * i + r;
476 nnz_in_row = a->rlen[row];
477 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
478 y[row] = yval;
479 }
480 break;
481 }
482
483 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
484 #pragma novector
485 #pragma unroll(2)
486 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
487 vec_vals = _mm256_loadu_pd(aval);
488 vec_x_tmp = _mm_setzero_pd();
489 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
490 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
491 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
492 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
493 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
494 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
495 vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
496 aval += 4;
497
498 vec_vals = _mm256_loadu_pd(aval);
499 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
500 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
501 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
502 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
503 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
504 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
505 vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
506 aval += 4;
507 }
508
509 _mm256_storeu_pd(y + i * 8, vec_y);
510 _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
511 }
512 #else
513 PetscCall(PetscMalloc1(sliceheight, &sum));
514 for (i = 0; i < totalslices; i++) { /* loop over slices */
515 for (j = 0; j < sliceheight; j++) {
516 sum[j] = 0.0;
517 for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]];
518 }
519 if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { /* if last slice has padding rows */
520 for (j = 0; j < (A->rmap->n % sliceheight); j++) y[sliceheight * i + j] = sum[j];
521 } else {
522 for (j = 0; j < sliceheight; j++) y[sliceheight * i + j] = sum[j];
523 }
524 }
525 PetscCall(PetscFree(sum));
526 #endif
527
528 PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); /* theoretical minimal FLOPs */
529 PetscCall(VecRestoreArrayRead(xx, &x));
530 PetscCall(VecRestoreArray(yy, &y));
531 PetscFunctionReturn(PETSC_SUCCESS);
532 }
533
534 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h>
MatMultAdd_SeqSELL(Mat A,Vec xx,Vec yy,Vec zz)535 PetscErrorCode MatMultAdd_SeqSELL(Mat A, Vec xx, Vec yy, Vec zz)
536 {
537 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
538 PetscScalar *y, *z;
539 const PetscScalar *x;
540 const MatScalar *aval = a->val;
541 PetscInt totalslices = a->totalslices;
542 const PetscInt *acolidx = a->colidx;
543 PetscInt i, j;
544 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
545 __m512d vec_x, vec_y, vec_vals;
546 __m256i vec_idx;
547 __mmask8 mask = 0;
548 __m512d vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
549 __m256i vec_idx2, vec_idx3, vec_idx4;
550 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
551 __m128d vec_x_tmp;
552 __m256d vec_x, vec_y, vec_y2, vec_vals;
553 MatScalar yval;
554 PetscInt r, row, nnz_in_row;
555 #else
556 PetscInt k, sliceheight = a->sliceheight;
557 PetscScalar *sum;
558 #endif
559
560 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
561 #pragma disjoint(*x, *y, *aval)
562 #endif
563
564 PetscFunctionBegin;
565 if (!a->nz) {
566 PetscCall(VecCopy(yy, zz));
567 PetscFunctionReturn(PETSC_SUCCESS);
568 }
569 PetscCall(VecGetArrayRead(xx, &x));
570 PetscCall(VecGetArrayPair(yy, zz, &y, &z));
571 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
572 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
573 for (i = 0; i < totalslices; i++) { /* loop over slices */
574 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
575 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
576
577 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
578 mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
579 vec_y = _mm512_mask_loadu_pd(vec_y, mask, &y[8 * i]);
580 } else {
581 vec_y = _mm512_loadu_pd(&y[8 * i]);
582 }
583 vec_y2 = _mm512_setzero_pd();
584 vec_y3 = _mm512_setzero_pd();
585 vec_y4 = _mm512_setzero_pd();
586
587 j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */
588 switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
589 case 3:
590 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
591 acolidx += 8;
592 aval += 8;
593 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
594 acolidx += 8;
595 aval += 8;
596 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
597 acolidx += 8;
598 aval += 8;
599 j += 3;
600 break;
601 case 2:
602 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
603 acolidx += 8;
604 aval += 8;
605 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
606 acolidx += 8;
607 aval += 8;
608 j += 2;
609 break;
610 case 1:
611 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
612 acolidx += 8;
613 aval += 8;
614 j += 1;
615 break;
616 }
617 #pragma novector
618 for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
619 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
620 acolidx += 8;
621 aval += 8;
622 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
623 acolidx += 8;
624 aval += 8;
625 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
626 acolidx += 8;
627 aval += 8;
628 AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
629 acolidx += 8;
630 aval += 8;
631 }
632
633 vec_y = _mm512_add_pd(vec_y, vec_y2);
634 vec_y = _mm512_add_pd(vec_y, vec_y3);
635 vec_y = _mm512_add_pd(vec_y, vec_y4);
636 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
637 _mm512_mask_storeu_pd(&z[8 * i], mask, vec_y);
638 } else {
639 _mm512_storeu_pd(&z[8 * i], vec_y);
640 }
641 }
642 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
643 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
644 for (i = 0; i < totalslices; i++) { /* loop over full slices */
645 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
646 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
647
648 /* last slice may have padding rows. Don't use vectorization. */
649 if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
650 for (r = 0; r < (A->rmap->n & 0x07); ++r) {
651 row = 8 * i + r;
652 yval = (MatScalar)0.0;
653 nnz_in_row = a->rlen[row];
654 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
655 z[row] = y[row] + yval;
656 }
657 break;
658 }
659
660 vec_y = _mm256_loadu_pd(y + 8 * i);
661 vec_y2 = _mm256_loadu_pd(y + 8 * i + 4);
662
663 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
664 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
665 vec_vals = _mm256_loadu_pd(aval);
666 vec_x_tmp = _mm_setzero_pd();
667 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
668 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
669 vec_x = _mm256_setzero_pd();
670 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
671 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
672 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
673 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
674 vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
675 aval += 4;
676
677 vec_vals = _mm256_loadu_pd(aval);
678 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
679 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
680 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
681 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
682 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
683 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
684 vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
685 aval += 4;
686 }
687
688 _mm256_storeu_pd(z + i * 8, vec_y);
689 _mm256_storeu_pd(z + i * 8 + 4, vec_y2);
690 }
691 #else
692 PetscCall(PetscMalloc1(sliceheight, &sum));
693 for (i = 0; i < totalslices; i++) { /* loop over slices */
694 for (j = 0; j < sliceheight; j++) {
695 sum[j] = 0.0;
696 for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]];
697 }
698 if (i == totalslices - 1 && (A->rmap->n % sliceheight)) {
699 for (j = 0; j < (A->rmap->n % sliceheight); j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j];
700 } else {
701 for (j = 0; j < sliceheight; j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j];
702 }
703 }
704 PetscCall(PetscFree(sum));
705 #endif
706
707 PetscCall(PetscLogFlops(2.0 * a->nz));
708 PetscCall(VecRestoreArrayRead(xx, &x));
709 PetscCall(VecRestoreArrayPair(yy, zz, &y, &z));
710 PetscFunctionReturn(PETSC_SUCCESS);
711 }
712
MatMultTransposeAdd_SeqSELL(Mat A,Vec xx,Vec zz,Vec yy)713 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A, Vec xx, Vec zz, Vec yy)
714 {
715 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
716 PetscScalar *y;
717 const PetscScalar *x;
718 const MatScalar *aval = a->val;
719 const PetscInt *acolidx = a->colidx;
720 PetscInt i, j, r, row, nnz_in_row, totalslices = a->totalslices, sliceheight = a->sliceheight;
721
722 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
723 #pragma disjoint(*x, *y, *aval)
724 #endif
725
726 PetscFunctionBegin;
727 if (A->symmetric == PETSC_BOOL3_TRUE) {
728 PetscCall(MatMultAdd_SeqSELL(A, xx, zz, yy));
729 PetscFunctionReturn(PETSC_SUCCESS);
730 }
731 if (zz != yy) PetscCall(VecCopy(zz, yy));
732
733 if (a->nz) {
734 PetscCall(VecGetArrayRead(xx, &x));
735 PetscCall(VecGetArray(yy, &y));
736 for (i = 0; i < a->totalslices; i++) { /* loop over slices */
737 if (i == totalslices - 1 && (A->rmap->n % sliceheight)) {
738 for (r = 0; r < (A->rmap->n % sliceheight); ++r) {
739 row = sliceheight * i + r;
740 nnz_in_row = a->rlen[row];
741 for (j = 0; j < nnz_in_row; ++j) y[acolidx[sliceheight * j + r]] += aval[sliceheight * j + r] * x[row];
742 }
743 break;
744 }
745 for (r = 0; r < sliceheight; ++r)
746 for (j = a->sliidx[i] + r; j < a->sliidx[i + 1]; j += sliceheight) y[acolidx[j]] += aval[j] * x[sliceheight * i + r];
747 }
748 PetscCall(PetscLogFlops(2.0 * a->nz));
749 PetscCall(VecRestoreArrayRead(xx, &x));
750 PetscCall(VecRestoreArray(yy, &y));
751 }
752 PetscFunctionReturn(PETSC_SUCCESS);
753 }
754
MatMultTranspose_SeqSELL(Mat A,Vec xx,Vec yy)755 PetscErrorCode MatMultTranspose_SeqSELL(Mat A, Vec xx, Vec yy)
756 {
757 PetscFunctionBegin;
758 if (A->symmetric == PETSC_BOOL3_TRUE) {
759 PetscCall(MatMult_SeqSELL(A, xx, yy));
760 } else {
761 PetscCall(VecSet(yy, 0.0));
762 PetscCall(MatMultTransposeAdd_SeqSELL(A, xx, yy, yy));
763 }
764 PetscFunctionReturn(PETSC_SUCCESS);
765 }
766
MatGetDiagonalMarkers_SeqSELL(Mat A,const PetscInt ** diag,PetscBool * diagDense)767 static PetscErrorCode MatGetDiagonalMarkers_SeqSELL(Mat A, const PetscInt **diag, PetscBool *diagDense)
768 {
769 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
770
771 PetscFunctionBegin;
772 if (A->factortype != MAT_FACTOR_NONE) {
773 PetscAssertPointer(diag, 2);
774 PetscCheck(!diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot check for dense diagonal with factored matrices");
775 *diag = a->diag;
776 PetscFunctionReturn(PETSC_SUCCESS);
777 }
778 PetscCheck(diag || diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "At least one of diag or diagDense must be requested");
779 if (a->diagNonzeroState != A->nonzerostate || (diag && !a->diag)) {
780 const PetscInt m = A->rmap->n;
781 PetscInt shift;
782
783 if (!diag && !a->diag) {
784 a->diagDense = PETSC_TRUE;
785 for (PetscInt i = 0; i < m; i++) {
786 PetscBool found = PETSC_FALSE;
787
788 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
789 for (PetscInt j = 0; j < a->rlen[i]; j++) {
790 if (a->colidx[shift + a->sliceheight * j] == i) {
791 a->diag[i] = shift + a->sliceheight * j;
792 found = PETSC_TRUE;
793 break;
794 }
795 }
796 if (!found) {
797 a->diagDense = PETSC_FALSE;
798 *diagDense = a->diagDense;
799 a->diagNonzeroState = A->nonzerostate;
800 PetscFunctionReturn(PETSC_SUCCESS);
801 }
802 }
803 } else {
804 if (!a->diag) PetscCall(PetscMalloc1(m, &a->diag));
805 a->diagDense = PETSC_TRUE;
806 for (PetscInt i = 0; i < m; i++) {
807 PetscBool found = PETSC_FALSE;
808
809 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
810 a->diag[i] = -1;
811 for (PetscInt j = 0; j < a->rlen[i]; j++) {
812 if (a->colidx[shift + a->sliceheight * j] == i) {
813 a->diag[i] = shift + a->sliceheight * j;
814 found = PETSC_TRUE;
815 break;
816 }
817 }
818 if (!found) a->diagDense = PETSC_FALSE;
819 }
820 }
821 a->diagNonzeroState = A->nonzerostate;
822 }
823 if (diag) *diag = a->diag;
824 if (diagDense) *diagDense = a->diagDense;
825 PetscFunctionReturn(PETSC_SUCCESS);
826 }
827
828 /*
829 Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways
830 */
MatInvertDiagonalForSOR_SeqSELL(Mat A,PetscScalar omega,PetscScalar fshift)831 static PetscErrorCode MatInvertDiagonalForSOR_SeqSELL(Mat A, PetscScalar omega, PetscScalar fshift)
832 {
833 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
834 PetscInt i, m = A->rmap->n;
835 MatScalar *val = a->val;
836 PetscScalar *idiag, *mdiag;
837 const PetscInt *diag;
838 PetscBool diagDense;
839
840 PetscFunctionBegin;
841 if (a->idiagState == ((PetscObject)A)->state && a->omega == omega && a->fshift == fshift) PetscFunctionReturn(PETSC_SUCCESS);
842 PetscCall(MatGetDiagonalMarkers_SeqSELL(A, &diag, &diagDense));
843 PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix must have all diagonal locations to invert them");
844
845 if (!a->idiag) {
846 PetscCall(PetscMalloc3(m, &a->idiag, m, &a->mdiag, m, &a->ssor_work));
847 val = a->val;
848 }
849 mdiag = a->mdiag;
850 idiag = a->idiag;
851
852 if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) {
853 for (i = 0; i < m; i++) {
854 mdiag[i] = val[diag[i]];
855 if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */
856 PetscCheck(PetscRealPart(fshift), PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Zero diagonal on row %" PetscInt_FMT, i);
857 PetscCall(PetscInfo(A, "Zero diagonal on row %" PetscInt_FMT "\n", i));
858 A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
859 A->factorerror_zeropivot_value = 0.0;
860 A->factorerror_zeropivot_row = i;
861 }
862 idiag[i] = 1.0 / val[diag[i]];
863 }
864 PetscCall(PetscLogFlops(m));
865 } else {
866 for (i = 0; i < m; i++) {
867 mdiag[i] = val[diag[i]];
868 idiag[i] = omega / (fshift + val[diag[i]]);
869 }
870 PetscCall(PetscLogFlops(2.0 * m));
871 }
872 a->idiagState = ((PetscObject)A)->state;
873 a->omega = omega;
874 a->fshift = fshift;
875 PetscFunctionReturn(PETSC_SUCCESS);
876 }
877
MatZeroEntries_SeqSELL(Mat A)878 PetscErrorCode MatZeroEntries_SeqSELL(Mat A)
879 {
880 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
881
882 PetscFunctionBegin;
883 PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices]));
884 PetscFunctionReturn(PETSC_SUCCESS);
885 }
886
MatDestroy_SeqSELL(Mat A)887 PetscErrorCode MatDestroy_SeqSELL(Mat A)
888 {
889 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
890
891 PetscFunctionBegin;
892 PetscCall(PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->n, A->cmap->n, a->nz));
893 PetscCall(MatSeqXSELLFreeSELL(A, &a->val, &a->colidx));
894 PetscCall(ISDestroy(&a->row));
895 PetscCall(ISDestroy(&a->col));
896 PetscCall(PetscFree(a->diag));
897 PetscCall(PetscFree(a->rlen));
898 PetscCall(PetscFree(a->sliidx));
899 PetscCall(PetscFree3(a->idiag, a->mdiag, a->ssor_work));
900 PetscCall(PetscFree(a->solve_work));
901 PetscCall(ISDestroy(&a->icol));
902 PetscCall(PetscFree(a->saved_values));
903 PetscCall(PetscFree2(a->getrowcols, a->getrowvals));
904 PetscCall(PetscFree(A->data));
905 #if defined(PETSC_HAVE_CUPM)
906 PetscCall(PetscFree(a->chunk_slice_map));
907 #endif
908
909 PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL));
910 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL));
911 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL));
912 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetPreallocation_C", NULL));
913 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetArray_C", NULL));
914 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLRestoreArray_C", NULL));
915 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqaij_C", NULL));
916 #if defined(PETSC_HAVE_CUDA)
917 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqsellcuda_C", NULL));
918 #endif
919 #if defined(PETSC_HAVE_HIP)
920 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqsellhip_C", NULL));
921 #endif
922 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetFillRatio_C", NULL));
923 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetMaxSliceWidth_C", NULL));
924 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetAvgSliceWidth_C", NULL));
925 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetVarSliceSize_C", NULL));
926 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetSliceHeight_C", NULL));
927 PetscFunctionReturn(PETSC_SUCCESS);
928 }
929
MatSetOption_SeqSELL(Mat A,MatOption op,PetscBool flg)930 PetscErrorCode MatSetOption_SeqSELL(Mat A, MatOption op, PetscBool flg)
931 {
932 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
933
934 PetscFunctionBegin;
935 switch (op) {
936 case MAT_ROW_ORIENTED:
937 a->roworiented = flg;
938 break;
939 case MAT_KEEP_NONZERO_PATTERN:
940 a->keepnonzeropattern = flg;
941 break;
942 case MAT_NEW_NONZERO_LOCATIONS:
943 a->nonew = (flg ? 0 : 1);
944 break;
945 case MAT_NEW_NONZERO_LOCATION_ERR:
946 a->nonew = (flg ? -1 : 0);
947 break;
948 case MAT_NEW_NONZERO_ALLOCATION_ERR:
949 a->nonew = (flg ? -2 : 0);
950 break;
951 case MAT_UNUSED_NONZERO_LOCATION_ERR:
952 a->nounused = (flg ? -1 : 0);
953 break;
954 default:
955 break;
956 }
957 PetscFunctionReturn(PETSC_SUCCESS);
958 }
959
MatGetDiagonal_SeqSELL(Mat A,Vec v)960 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A, Vec v)
961 {
962 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
963 PetscInt i, j, n, shift;
964 PetscScalar *x, zero = 0.0;
965
966 PetscFunctionBegin;
967 PetscCall(VecGetLocalSize(v, &n));
968 PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
969
970 if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) {
971 const PetscInt *diag;
972
973 PetscCall(MatGetDiagonalMarkers_SeqSELL(A, &diag, NULL));
974 PetscCall(VecGetArrayWrite(v, &x));
975 for (i = 0; i < n; i++) x[i] = 1.0 / a->val[diag[i]];
976 PetscCall(VecRestoreArrayWrite(v, &x));
977 PetscFunctionReturn(PETSC_SUCCESS);
978 }
979
980 PetscCall(VecSet(v, zero));
981 PetscCall(VecGetArray(v, &x));
982 for (i = 0; i < n; i++) { /* loop over rows */
983 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
984 x[i] = 0;
985 for (j = 0; j < a->rlen[i]; j++) {
986 if (a->colidx[shift + a->sliceheight * j] == i) {
987 x[i] = a->val[shift + a->sliceheight * j];
988 break;
989 }
990 }
991 }
992 PetscCall(VecRestoreArray(v, &x));
993 PetscFunctionReturn(PETSC_SUCCESS);
994 }
995
MatDiagonalScale_SeqSELL(Mat A,Vec ll,Vec rr)996 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A, Vec ll, Vec rr)
997 {
998 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
999 const PetscScalar *l, *r;
1000 PetscInt i, j, m, n, row;
1001
1002 PetscFunctionBegin;
1003 if (ll) {
1004 /* The local size is used so that VecMPI can be passed to this routine
1005 by MatDiagonalScale_MPISELL */
1006 PetscCall(VecGetLocalSize(ll, &m));
1007 PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1008 PetscCall(VecGetArrayRead(ll, &l));
1009 for (i = 0; i < a->totalslices; i++) { /* loop over slices */
1010 if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */
1011 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) {
1012 if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= l[a->sliceheight * i + row];
1013 }
1014 } else {
1015 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) a->val[j] *= l[a->sliceheight * i + row];
1016 }
1017 }
1018 PetscCall(VecRestoreArrayRead(ll, &l));
1019 PetscCall(PetscLogFlops(a->nz));
1020 }
1021 if (rr) {
1022 PetscCall(VecGetLocalSize(rr, &n));
1023 PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
1024 PetscCall(VecGetArrayRead(rr, &r));
1025 for (i = 0; i < a->totalslices; i++) { /* loop over slices */
1026 if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */
1027 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) % a->sliceheight)) {
1028 if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= r[a->colidx[j]];
1029 }
1030 } else {
1031 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j++) a->val[j] *= r[a->colidx[j]];
1032 }
1033 }
1034 PetscCall(VecRestoreArrayRead(rr, &r));
1035 PetscCall(PetscLogFlops(a->nz));
1036 }
1037 #if defined(PETSC_HAVE_CUPM)
1038 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU;
1039 #endif
1040 PetscFunctionReturn(PETSC_SUCCESS);
1041 }
1042
MatGetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],PetscScalar v[])1043 PetscErrorCode MatGetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[])
1044 {
1045 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1046 PetscInt *cp, i, k, low, high, t, row, col, l;
1047 PetscInt shift;
1048 MatScalar *vp;
1049
1050 PetscFunctionBegin;
1051 for (k = 0; k < m; k++) { /* loop over requested rows */
1052 row = im[k];
1053 if (row < 0) continue;
1054 PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
1055 shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight); /* starting index of the row */
1056 cp = a->colidx + shift; /* pointer to the row */
1057 vp = a->val + shift; /* pointer to the row */
1058 for (l = 0; l < n; l++) { /* loop over requested columns */
1059 col = in[l];
1060 if (col < 0) continue;
1061 PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
1062 high = a->rlen[row];
1063 low = 0; /* assume unsorted */
1064 while (high - low > 5) {
1065 t = (low + high) / 2;
1066 if (*(cp + a->sliceheight * t) > col) high = t;
1067 else low = t;
1068 }
1069 for (i = low; i < high; i++) {
1070 if (*(cp + a->sliceheight * i) > col) break;
1071 if (*(cp + a->sliceheight * i) == col) {
1072 *v++ = *(vp + a->sliceheight * i);
1073 goto finished;
1074 }
1075 }
1076 *v++ = 0.0;
1077 finished:;
1078 }
1079 }
1080 PetscFunctionReturn(PETSC_SUCCESS);
1081 }
1082
MatView_SeqSELL_ASCII(Mat A,PetscViewer viewer)1083 static PetscErrorCode MatView_SeqSELL_ASCII(Mat A, PetscViewer viewer)
1084 {
1085 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1086 PetscInt i, j, m = A->rmap->n, shift;
1087 const char *name;
1088 PetscViewerFormat format;
1089
1090 PetscFunctionBegin;
1091 PetscCall(PetscViewerGetFormat(viewer, &format));
1092 if (format == PETSC_VIEWER_ASCII_MATLAB) {
1093 PetscInt nofinalvalue = 0;
1094 /*
1095 if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) nofinalvalue = 1;
1096 */
1097 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1098 PetscCall(PetscViewerASCIIPrintf(viewer, "%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n", m, A->cmap->n));
1099 PetscCall(PetscViewerASCIIPrintf(viewer, "%% Nonzeros = %" PetscInt_FMT " \n", a->nz));
1100 #if defined(PETSC_USE_COMPLEX)
1101 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",4);\n", a->nz + nofinalvalue));
1102 #else
1103 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",3);\n", a->nz + nofinalvalue));
1104 #endif
1105 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = [\n"));
1106
1107 for (i = 0; i < m; i++) {
1108 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1109 for (j = 0; j < a->rlen[i]; j++) {
1110 #if defined(PETSC_USE_COMPLEX)
1111 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1112 #else
1113 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)a->val[shift + a->sliceheight * j]));
1114 #endif
1115 }
1116 }
1117 /*
1118 if (nofinalvalue) {
1119 #if defined(PETSC_USE_COMPLEX)
1120 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n",m,A->cmap->n,0.,0.));
1121 #else
1122 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n",m,A->cmap->n,0.0));
1123 #endif
1124 }
1125 */
1126 PetscCall(PetscObjectGetName((PetscObject)A, &name));
1127 PetscCall(PetscViewerASCIIPrintf(viewer, "];\n %s = spconvert(zzz);\n", name));
1128 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1129 } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) {
1130 PetscFunctionReturn(PETSC_SUCCESS);
1131 } else if (format == PETSC_VIEWER_ASCII_COMMON) {
1132 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1133 for (i = 0; i < m; i++) {
1134 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1135 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1136 for (j = 0; j < a->rlen[i]; j++) {
1137 #if defined(PETSC_USE_COMPLEX)
1138 if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) {
1139 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1140 } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) {
1141 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1142 } else if (PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) {
1143 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j])));
1144 }
1145 #else
1146 if (a->val[shift + a->sliceheight * j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j]));
1147 #endif
1148 }
1149 PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1150 }
1151 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1152 } else if (format == PETSC_VIEWER_ASCII_DENSE) {
1153 PetscInt cnt = 0, jcnt;
1154 PetscScalar value;
1155 #if defined(PETSC_USE_COMPLEX)
1156 PetscBool realonly = PETSC_TRUE;
1157 for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1158 if (PetscImaginaryPart(a->val[i]) != 0.0) {
1159 realonly = PETSC_FALSE;
1160 break;
1161 }
1162 }
1163 #endif
1164
1165 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1166 for (i = 0; i < m; i++) {
1167 jcnt = 0;
1168 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1169 for (j = 0; j < A->cmap->n; j++) {
1170 if (jcnt < a->rlen[i] && j == a->colidx[shift + a->sliceheight * j]) {
1171 value = a->val[cnt++];
1172 jcnt++;
1173 } else {
1174 value = 0.0;
1175 }
1176 #if defined(PETSC_USE_COMPLEX)
1177 if (realonly) {
1178 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)PetscRealPart(value)));
1179 } else {
1180 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e+%7.5e i ", (double)PetscRealPart(value), (double)PetscImaginaryPart(value)));
1181 }
1182 #else
1183 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)value));
1184 #endif
1185 }
1186 PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1187 }
1188 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1189 } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) {
1190 PetscInt fshift = 1;
1191 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1192 #if defined(PETSC_USE_COMPLEX)
1193 PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate complex general\n"));
1194 #else
1195 PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate real general\n"));
1196 #endif
1197 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz));
1198 for (i = 0; i < m; i++) {
1199 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1200 for (j = 0; j < a->rlen[i]; j++) {
1201 #if defined(PETSC_USE_COMPLEX)
1202 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1203 #else
1204 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)a->val[shift + a->sliceheight * j]));
1205 #endif
1206 }
1207 }
1208 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1209 } else if (format == PETSC_VIEWER_NATIVE) {
1210 for (i = 0; i < a->totalslices; i++) { /* loop over slices */
1211 PetscInt row;
1212 PetscCall(PetscViewerASCIIPrintf(viewer, "slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n", i, a->sliidx[i], a->sliidx[i + 1]));
1213 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) {
1214 #if defined(PETSC_USE_COMPLEX)
1215 if (PetscImaginaryPart(a->val[j]) > 0.0) {
1216 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1217 } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1218 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), -(double)PetscImaginaryPart(a->val[j])));
1219 } else {
1220 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j])));
1221 }
1222 #else
1223 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)a->val[j]));
1224 #endif
1225 }
1226 }
1227 } else {
1228 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1229 if (A->factortype) {
1230 for (i = 0; i < m; i++) {
1231 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1232 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1233 /* L part */
1234 for (j = shift; j < a->diag[i]; j += a->sliceheight) {
1235 #if defined(PETSC_USE_COMPLEX)
1236 if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0) {
1237 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1238 } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0) {
1239 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j]))));
1240 } else {
1241 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j])));
1242 }
1243 #else
1244 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]));
1245 #endif
1246 }
1247 /* diagonal */
1248 j = a->diag[i];
1249 #if defined(PETSC_USE_COMPLEX)
1250 if (PetscImaginaryPart(a->val[j]) > 0.0) {
1251 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)PetscImaginaryPart(1.0 / a->val[j])));
1252 } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1253 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)(-PetscImaginaryPart(1.0 / a->val[j]))));
1254 } else {
1255 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j])));
1256 }
1257 #else
1258 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)(1 / a->val[j])));
1259 #endif
1260
1261 /* U part */
1262 for (j = a->diag[i] + 1; j < shift + a->sliceheight * a->rlen[i]; j += a->sliceheight) {
1263 #if defined(PETSC_USE_COMPLEX)
1264 if (PetscImaginaryPart(a->val[j]) > 0.0) {
1265 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1266 } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1267 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j]))));
1268 } else {
1269 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j])));
1270 }
1271 #else
1272 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]));
1273 #endif
1274 }
1275 PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1276 }
1277 } else {
1278 for (i = 0; i < m; i++) {
1279 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1280 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1281 for (j = 0; j < a->rlen[i]; j++) {
1282 #if defined(PETSC_USE_COMPLEX)
1283 if (PetscImaginaryPart(a->val[j]) > 0.0) {
1284 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1285 } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1286 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1287 } else {
1288 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j])));
1289 }
1290 #else
1291 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j]));
1292 #endif
1293 }
1294 PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1295 }
1296 }
1297 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1298 }
1299 PetscCall(PetscViewerFlush(viewer));
1300 PetscFunctionReturn(PETSC_SUCCESS);
1301 }
1302
1303 #include <petscdraw.h>
MatView_SeqSELL_Draw_Zoom(PetscDraw draw,void * Aa)1304 static PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw, void *Aa)
1305 {
1306 Mat A = (Mat)Aa;
1307 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1308 PetscInt i, j, m = A->rmap->n, shift;
1309 int color;
1310 PetscReal xl, yl, xr, yr, x_l, x_r, y_l, y_r;
1311 PetscViewer viewer;
1312 PetscViewerFormat format;
1313
1314 PetscFunctionBegin;
1315 PetscCall(PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer));
1316 PetscCall(PetscViewerGetFormat(viewer, &format));
1317 PetscCall(PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr));
1318
1319 /* loop over matrix elements drawing boxes */
1320
1321 if (format != PETSC_VIEWER_DRAW_CONTOUR) {
1322 PetscDrawCollectiveBegin(draw);
1323 /* Blue for negative, Cyan for zero and Red for positive */
1324 color = PETSC_DRAW_BLUE;
1325 for (i = 0; i < m; i++) {
1326 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1327 y_l = m - i - 1.0;
1328 y_r = y_l + 1.0;
1329 for (j = 0; j < a->rlen[i]; j++) {
1330 x_l = a->colidx[shift + a->sliceheight * j];
1331 x_r = x_l + 1.0;
1332 if (PetscRealPart(a->val[shift + a->sliceheight * j]) >= 0.) continue;
1333 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1334 }
1335 }
1336 color = PETSC_DRAW_CYAN;
1337 for (i = 0; i < m; i++) {
1338 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1339 y_l = m - i - 1.0;
1340 y_r = y_l + 1.0;
1341 for (j = 0; j < a->rlen[i]; j++) {
1342 x_l = a->colidx[shift + a->sliceheight * j];
1343 x_r = x_l + 1.0;
1344 if (a->val[shift + a->sliceheight * j] != 0.) continue;
1345 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1346 }
1347 }
1348 color = PETSC_DRAW_RED;
1349 for (i = 0; i < m; i++) {
1350 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1351 y_l = m - i - 1.0;
1352 y_r = y_l + 1.0;
1353 for (j = 0; j < a->rlen[i]; j++) {
1354 x_l = a->colidx[shift + a->sliceheight * j];
1355 x_r = x_l + 1.0;
1356 if (PetscRealPart(a->val[shift + a->sliceheight * j]) <= 0.) continue;
1357 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1358 }
1359 }
1360 PetscDrawCollectiveEnd(draw);
1361 } else {
1362 /* use contour shading to indicate magnitude of values */
1363 /* first determine max of all nonzero values */
1364 PetscReal minv = 0.0, maxv = 0.0;
1365 PetscInt count = 0;
1366 PetscDraw popup;
1367 for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1368 if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]);
1369 }
1370 if (minv >= maxv) maxv = minv + PETSC_SMALL;
1371 PetscCall(PetscDrawGetPopup(draw, &popup));
1372 PetscCall(PetscDrawScalePopup(popup, minv, maxv));
1373
1374 PetscDrawCollectiveBegin(draw);
1375 for (i = 0; i < m; i++) {
1376 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1377 y_l = m - i - 1.0;
1378 y_r = y_l + 1.0;
1379 for (j = 0; j < a->rlen[i]; j++) {
1380 x_l = a->colidx[shift + a->sliceheight * j];
1381 x_r = x_l + 1.0;
1382 color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]), minv, maxv);
1383 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1384 count++;
1385 }
1386 }
1387 PetscDrawCollectiveEnd(draw);
1388 }
1389 PetscFunctionReturn(PETSC_SUCCESS);
1390 }
1391
1392 #include <petscdraw.h>
MatView_SeqSELL_Draw(Mat A,PetscViewer viewer)1393 static PetscErrorCode MatView_SeqSELL_Draw(Mat A, PetscViewer viewer)
1394 {
1395 PetscDraw draw;
1396 PetscReal xr, yr, xl, yl, h, w;
1397 PetscBool isnull;
1398
1399 PetscFunctionBegin;
1400 PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw));
1401 PetscCall(PetscDrawIsNull(draw, &isnull));
1402 if (isnull) PetscFunctionReturn(PETSC_SUCCESS);
1403
1404 xr = A->cmap->n;
1405 yr = A->rmap->n;
1406 h = yr / 10.0;
1407 w = xr / 10.0;
1408 xr += w;
1409 yr += h;
1410 xl = -w;
1411 yl = -h;
1412 PetscCall(PetscDrawSetCoordinates(draw, xl, yl, xr, yr));
1413 PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer));
1414 PetscCall(PetscDrawZoom(draw, MatView_SeqSELL_Draw_Zoom, A));
1415 PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL));
1416 PetscCall(PetscDrawSave(draw));
1417 PetscFunctionReturn(PETSC_SUCCESS);
1418 }
1419
MatView_SeqSELL(Mat A,PetscViewer viewer)1420 PetscErrorCode MatView_SeqSELL(Mat A, PetscViewer viewer)
1421 {
1422 PetscBool isascii, isbinary, isdraw;
1423
1424 PetscFunctionBegin;
1425 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &isascii));
1426 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
1427 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
1428 if (isascii) {
1429 PetscCall(MatView_SeqSELL_ASCII(A, viewer));
1430 } else if (isbinary) {
1431 /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */
1432 } else if (isdraw) PetscCall(MatView_SeqSELL_Draw(A, viewer));
1433 PetscFunctionReturn(PETSC_SUCCESS);
1434 }
1435
MatAssemblyEnd_SeqSELL(Mat A,MatAssemblyType mode)1436 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A, MatAssemblyType mode)
1437 {
1438 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1439 PetscInt i, shift, row_in_slice, row, nrow, *cp, lastcol, j, k;
1440 MatScalar *vp;
1441 #if defined(PETSC_HAVE_CUPM)
1442 PetscInt totalchunks = 0;
1443 #endif
1444
1445 PetscFunctionBegin;
1446 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
1447 /* To do: compress out the unused elements */
1448 PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n", A->rmap->n, A->cmap->n, a->maxallocmat, a->sliidx[a->totalslices], a->nz, a->sliidx[a->totalslices] - a->nz));
1449 PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n", a->reallocs));
1450 PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rlenmax));
1451 a->nonzerorowcnt = 0;
1452 /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */
1453 for (i = 0; i < a->totalslices; ++i) {
1454 shift = a->sliidx[i]; /* starting index of the slice */
1455 cp = PetscSafePointerPlusOffset(a->colidx, shift); /* pointer to the column indices of the slice */
1456 vp = PetscSafePointerPlusOffset(a->val, shift); /* pointer to the nonzero values of the slice */
1457 for (row_in_slice = 0; row_in_slice < a->sliceheight; ++row_in_slice) { /* loop over rows in the slice */
1458 row = a->sliceheight * i + row_in_slice;
1459 nrow = a->rlen[row]; /* number of nonzeros in row */
1460 /*
1461 Search for the nearest nonzero. Normally setting the index to zero may cause extra communication.
1462 But if the entire slice are empty, it is fine to use 0 since the index will not be loaded.
1463 */
1464 lastcol = 0;
1465 if (nrow > 0) { /* nonempty row */
1466 a->nonzerorowcnt++;
1467 lastcol = cp[a->sliceheight * (nrow - 1) + row_in_slice]; /* use the index from the last nonzero at current row */
1468 } else if (!row_in_slice) { /* first row of the correct slice is empty */
1469 for (j = 1; j < a->sliceheight; j++) {
1470 if (a->rlen[a->sliceheight * i + j]) {
1471 lastcol = cp[j];
1472 break;
1473 }
1474 }
1475 } else {
1476 if (a->sliidx[i + 1] != shift) lastcol = cp[row_in_slice - 1]; /* use the index from the previous row */
1477 }
1478
1479 for (k = nrow; k < (a->sliidx[i + 1] - shift) / a->sliceheight; ++k) {
1480 cp[a->sliceheight * k + row_in_slice] = lastcol;
1481 vp[a->sliceheight * k + row_in_slice] = (MatScalar)0;
1482 }
1483 }
1484 }
1485
1486 A->info.mallocs += a->reallocs;
1487 a->reallocs = 0;
1488
1489 #if defined(PETSC_HAVE_CUPM)
1490 if (!a->chunksize && a->totalslices) {
1491 a->chunksize = 64;
1492 while (a->chunksize < 1024 && 2 * a->chunksize <= a->sliidx[a->totalslices] / a->totalslices) a->chunksize *= 2;
1493 totalchunks = 1 + (a->sliidx[a->totalslices] - 1) / a->chunksize;
1494 }
1495 if (totalchunks != a->totalchunks) {
1496 PetscCall(PetscFree(a->chunk_slice_map));
1497 PetscCall(PetscMalloc1(totalchunks, &a->chunk_slice_map));
1498 a->totalchunks = totalchunks;
1499 }
1500 j = 0;
1501 for (i = 0; i < totalchunks; i++) {
1502 while (a->sliidx[j + 1] <= i * a->chunksize && j < a->totalslices) j++;
1503 a->chunk_slice_map[i] = j;
1504 }
1505 #endif
1506 PetscFunctionReturn(PETSC_SUCCESS);
1507 }
1508
MatGetInfo_SeqSELL(Mat A,MatInfoType flag,MatInfo * info)1509 PetscErrorCode MatGetInfo_SeqSELL(Mat A, MatInfoType flag, MatInfo *info)
1510 {
1511 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1512
1513 PetscFunctionBegin;
1514 info->block_size = 1.0;
1515 info->nz_allocated = a->maxallocmat;
1516 info->nz_used = a->sliidx[a->totalslices]; /* include padding zeros */
1517 info->nz_unneeded = (a->maxallocmat - a->sliidx[a->totalslices]);
1518 info->assemblies = A->num_ass;
1519 info->mallocs = A->info.mallocs;
1520 info->memory = 0; /* REVIEW ME */
1521 if (A->factortype) {
1522 info->fill_ratio_given = A->info.fill_ratio_given;
1523 info->fill_ratio_needed = A->info.fill_ratio_needed;
1524 info->factor_mallocs = A->info.factor_mallocs;
1525 } else {
1526 info->fill_ratio_given = 0;
1527 info->fill_ratio_needed = 0;
1528 info->factor_mallocs = 0;
1529 }
1530 PetscFunctionReturn(PETSC_SUCCESS);
1531 }
1532
MatSetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is)1533 PetscErrorCode MatSetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is)
1534 {
1535 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1536 PetscInt shift, i, k, l, low, high, t, ii, row, col, nrow;
1537 PetscInt *cp, nonew = a->nonew, lastcol = -1;
1538 MatScalar *vp, value;
1539 #if defined(PETSC_HAVE_CUPM)
1540 PetscBool inserted = PETSC_FALSE;
1541 PetscInt mul = DEVICE_MEM_ALIGN / a->sliceheight;
1542 #endif
1543
1544 PetscFunctionBegin;
1545 for (k = 0; k < m; k++) { /* loop over added rows */
1546 row = im[k];
1547 if (row < 0) continue;
1548 PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
1549 shift = a->sliidx[row / a->sliceheight] + row % a->sliceheight; /* starting index of the row */
1550 cp = a->colidx + shift; /* pointer to the row */
1551 vp = a->val + shift; /* pointer to the row */
1552 nrow = a->rlen[row];
1553 low = 0;
1554 high = nrow;
1555
1556 for (l = 0; l < n; l++) { /* loop over added columns */
1557 col = in[l];
1558 if (col < 0) continue;
1559 PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
1560 if (a->roworiented) {
1561 value = v[l + k * n];
1562 } else {
1563 value = v[k + l * m];
1564 }
1565 if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue;
1566
1567 /* search in this row for the specified column, i indicates the column to be set */
1568 if (col <= lastcol) low = 0;
1569 else high = nrow;
1570 lastcol = col;
1571 while (high - low > 5) {
1572 t = (low + high) / 2;
1573 if (*(cp + a->sliceheight * t) > col) high = t;
1574 else low = t;
1575 }
1576 for (i = low; i < high; i++) {
1577 if (*(cp + a->sliceheight * i) > col) break;
1578 if (*(cp + a->sliceheight * i) == col) {
1579 if (is == ADD_VALUES) *(vp + a->sliceheight * i) += value;
1580 else *(vp + a->sliceheight * i) = value;
1581 #if defined(PETSC_HAVE_CUPM)
1582 inserted = PETSC_TRUE;
1583 #endif
1584 low = i + 1;
1585 goto noinsert;
1586 }
1587 }
1588 if (value == 0.0 && a->ignorezeroentries) goto noinsert;
1589 if (nonew == 1) goto noinsert;
1590 PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col);
1591 #if defined(PETSC_HAVE_CUPM)
1592 MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar, mul);
1593 #else
1594 /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */
1595 MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar, 1);
1596 #endif
1597 /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */
1598 for (ii = nrow - 1; ii >= i; ii--) {
1599 *(cp + a->sliceheight * (ii + 1)) = *(cp + a->sliceheight * ii);
1600 *(vp + a->sliceheight * (ii + 1)) = *(vp + a->sliceheight * ii);
1601 }
1602 a->rlen[row]++;
1603 *(cp + a->sliceheight * i) = col;
1604 *(vp + a->sliceheight * i) = value;
1605 a->nz++;
1606 #if defined(PETSC_HAVE_CUPM)
1607 inserted = PETSC_TRUE;
1608 #endif
1609 low = i + 1;
1610 high++;
1611 nrow++;
1612 noinsert:;
1613 }
1614 a->rlen[row] = nrow;
1615 }
1616 #if defined(PETSC_HAVE_CUPM)
1617 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && inserted) A->offloadmask = PETSC_OFFLOAD_CPU;
1618 #endif
1619 PetscFunctionReturn(PETSC_SUCCESS);
1620 }
1621
MatCopy_SeqSELL(Mat A,Mat B,MatStructure str)1622 PetscErrorCode MatCopy_SeqSELL(Mat A, Mat B, MatStructure str)
1623 {
1624 PetscFunctionBegin;
1625 /* If the two matrices have the same copy implementation, use fast copy. */
1626 if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
1627 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1628 Mat_SeqSELL *b = (Mat_SeqSELL *)B->data;
1629
1630 PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices], PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Number of nonzeros in two matrices are different");
1631 PetscCall(PetscArraycpy(b->val, a->val, a->sliidx[a->totalslices]));
1632 } else {
1633 PetscCall(MatCopy_Basic(A, B, str));
1634 }
1635 PetscFunctionReturn(PETSC_SUCCESS);
1636 }
1637
MatSetUp_SeqSELL(Mat A)1638 PetscErrorCode MatSetUp_SeqSELL(Mat A)
1639 {
1640 PetscFunctionBegin;
1641 PetscCall(MatSeqSELLSetPreallocation(A, PETSC_DEFAULT, NULL));
1642 PetscFunctionReturn(PETSC_SUCCESS);
1643 }
1644
MatSeqSELLGetArray_SeqSELL(Mat A,PetscScalar * array[])1645 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A, PetscScalar *array[])
1646 {
1647 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1648
1649 PetscFunctionBegin;
1650 *array = a->val;
1651 PetscFunctionReturn(PETSC_SUCCESS);
1652 }
1653
MatSeqSELLRestoreArray_SeqSELL(Mat A,PetscScalar * array[])1654 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A, PetscScalar *array[])
1655 {
1656 PetscFunctionBegin;
1657 PetscFunctionReturn(PETSC_SUCCESS);
1658 }
1659
MatScale_SeqSELL(Mat inA,PetscScalar alpha)1660 PetscErrorCode MatScale_SeqSELL(Mat inA, PetscScalar alpha)
1661 {
1662 Mat_SeqSELL *a = (Mat_SeqSELL *)inA->data;
1663 MatScalar *aval = a->val;
1664 PetscScalar oalpha = alpha;
1665 PetscBLASInt one = 1, size;
1666
1667 PetscFunctionBegin;
1668 PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices], &size));
1669 PetscCallBLAS("BLASscal", BLASscal_(&size, &oalpha, aval, &one));
1670 PetscCall(PetscLogFlops(a->nz));
1671 #if defined(PETSC_HAVE_CUPM)
1672 if (inA->offloadmask != PETSC_OFFLOAD_UNALLOCATED) inA->offloadmask = PETSC_OFFLOAD_CPU;
1673 #endif
1674 PetscFunctionReturn(PETSC_SUCCESS);
1675 }
1676
MatShift_SeqSELL(Mat Y,PetscScalar a)1677 PetscErrorCode MatShift_SeqSELL(Mat Y, PetscScalar a)
1678 {
1679 Mat_SeqSELL *y = (Mat_SeqSELL *)Y->data;
1680
1681 PetscFunctionBegin;
1682 if (!Y->preallocated || !y->nz) PetscCall(MatSeqSELLSetPreallocation(Y, 1, NULL));
1683 PetscCall(MatShift_Basic(Y, a));
1684 PetscFunctionReturn(PETSC_SUCCESS);
1685 }
1686
MatSOR_SeqSELL(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)1687 PetscErrorCode MatSOR_SeqSELL(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
1688 {
1689 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1690 PetscScalar *x, sum, *t;
1691 const MatScalar *idiag = NULL, *mdiag;
1692 const PetscScalar *b, *xb;
1693 PetscInt n, m = A->rmap->n, i, j, shift;
1694 const PetscInt *diag;
1695
1696 PetscFunctionBegin;
1697 its = its * lits;
1698
1699 PetscCall(MatInvertDiagonalForSOR_SeqSELL(A, omega, fshift));
1700 diag = a->diag;
1701 t = a->ssor_work;
1702 idiag = a->idiag;
1703 mdiag = a->mdiag;
1704
1705 PetscCall(VecGetArray(xx, &x));
1706 PetscCall(VecGetArrayRead(bb, &b));
1707 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1708 PetscCheck(flag != SOR_APPLY_UPPER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_UPPER is not implemented");
1709 PetscCheck(flag != SOR_APPLY_LOWER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_LOWER is not implemented");
1710 PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat");
1711
1712 if (flag & SOR_ZERO_INITIAL_GUESS) {
1713 if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1714 for (i = 0; i < m; i++) {
1715 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1716 sum = b[i];
1717 n = (diag[i] - shift) / a->sliceheight;
1718 for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]];
1719 t[i] = sum;
1720 x[i] = sum * idiag[i];
1721 }
1722 xb = t;
1723 PetscCall(PetscLogFlops(a->nz));
1724 } else xb = b;
1725 if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1726 for (i = m - 1; i >= 0; i--) {
1727 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1728 sum = xb[i];
1729 n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1;
1730 for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]];
1731 if (xb == b) {
1732 x[i] = sum * idiag[i];
1733 } else {
1734 x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1735 }
1736 }
1737 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1738 }
1739 its--;
1740 }
1741 while (its--) {
1742 if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1743 for (i = 0; i < m; i++) {
1744 /* lower */
1745 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1746 sum = b[i];
1747 n = (diag[i] - shift) / a->sliceheight;
1748 for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]];
1749 t[i] = sum; /* save application of the lower-triangular part */
1750 /* upper */
1751 n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1;
1752 for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]];
1753 x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1754 }
1755 xb = t;
1756 PetscCall(PetscLogFlops(2.0 * a->nz));
1757 } else xb = b;
1758 if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1759 for (i = m - 1; i >= 0; i--) {
1760 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1761 sum = xb[i];
1762 if (xb == b) {
1763 /* whole matrix (no checkpointing available) */
1764 n = a->rlen[i];
1765 for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]];
1766 x[i] = (1. - omega) * x[i] + (sum + mdiag[i] * x[i]) * idiag[i];
1767 } else { /* lower-triangular part has been saved, so only apply upper-triangular */
1768 n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1;
1769 for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]];
1770 x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1771 }
1772 }
1773 if (xb == b) {
1774 PetscCall(PetscLogFlops(2.0 * a->nz));
1775 } else {
1776 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1777 }
1778 }
1779 }
1780 PetscCall(VecRestoreArray(xx, &x));
1781 PetscCall(VecRestoreArrayRead(bb, &b));
1782 PetscFunctionReturn(PETSC_SUCCESS);
1783 }
1784
1785 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL,
1786 MatGetRow_SeqSELL,
1787 MatRestoreRow_SeqSELL,
1788 MatMult_SeqSELL,
1789 /* 4*/ MatMultAdd_SeqSELL,
1790 MatMultTranspose_SeqSELL,
1791 MatMultTransposeAdd_SeqSELL,
1792 NULL,
1793 NULL,
1794 NULL,
1795 /* 10*/ NULL,
1796 NULL,
1797 NULL,
1798 MatSOR_SeqSELL,
1799 NULL,
1800 /* 15*/ MatGetInfo_SeqSELL,
1801 MatEqual_SeqSELL,
1802 MatGetDiagonal_SeqSELL,
1803 MatDiagonalScale_SeqSELL,
1804 NULL,
1805 /* 20*/ NULL,
1806 MatAssemblyEnd_SeqSELL,
1807 MatSetOption_SeqSELL,
1808 MatZeroEntries_SeqSELL,
1809 /* 24*/ NULL,
1810 NULL,
1811 NULL,
1812 NULL,
1813 NULL,
1814 /* 29*/ MatSetUp_SeqSELL,
1815 NULL,
1816 NULL,
1817 NULL,
1818 NULL,
1819 /* 34*/ MatDuplicate_SeqSELL,
1820 NULL,
1821 NULL,
1822 NULL,
1823 NULL,
1824 /* 39*/ NULL,
1825 NULL,
1826 NULL,
1827 MatGetValues_SeqSELL,
1828 MatCopy_SeqSELL,
1829 /* 44*/ NULL,
1830 MatScale_SeqSELL,
1831 MatShift_SeqSELL,
1832 NULL,
1833 NULL,
1834 /* 49*/ NULL,
1835 NULL,
1836 NULL,
1837 NULL,
1838 NULL,
1839 /* 54*/ MatFDColoringCreate_SeqXAIJ,
1840 NULL,
1841 NULL,
1842 NULL,
1843 NULL,
1844 /* 59*/ NULL,
1845 MatDestroy_SeqSELL,
1846 MatView_SeqSELL,
1847 NULL,
1848 NULL,
1849 /* 64*/ NULL,
1850 NULL,
1851 NULL,
1852 NULL,
1853 NULL,
1854 /* 69*/ NULL,
1855 NULL,
1856 NULL,
1857 MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */
1858 NULL,
1859 /* 74*/ NULL,
1860 NULL,
1861 NULL,
1862 NULL,
1863 NULL,
1864 /* 79*/ NULL,
1865 NULL,
1866 NULL,
1867 NULL,
1868 NULL,
1869 /* 84*/ NULL,
1870 NULL,
1871 NULL,
1872 NULL,
1873 NULL,
1874 /* 89*/ NULL,
1875 NULL,
1876 NULL,
1877 NULL,
1878 MatConjugate_SeqSELL,
1879 /* 94*/ NULL,
1880 NULL,
1881 NULL,
1882 NULL,
1883 NULL,
1884 /* 99*/ NULL,
1885 NULL,
1886 NULL,
1887 NULL,
1888 NULL,
1889 /*104*/ NULL,
1890 NULL,
1891 NULL,
1892 NULL,
1893 NULL,
1894 /*109*/ NULL,
1895 NULL,
1896 NULL,
1897 NULL,
1898 NULL,
1899 /*114*/ NULL,
1900 NULL,
1901 NULL,
1902 NULL,
1903 NULL,
1904 /*119*/ NULL,
1905 NULL,
1906 NULL,
1907 NULL,
1908 NULL,
1909 /*124*/ NULL,
1910 NULL,
1911 NULL,
1912 NULL,
1913 MatFDColoringSetUp_SeqXAIJ,
1914 /*129*/ NULL,
1915 NULL,
1916 NULL,
1917 NULL,
1918 NULL,
1919 /*134*/ NULL,
1920 NULL,
1921 NULL,
1922 NULL,
1923 NULL,
1924 /*139*/ NULL,
1925 NULL,
1926 NULL,
1927 NULL,
1928 NULL};
1929
MatStoreValues_SeqSELL(Mat mat)1930 static PetscErrorCode MatStoreValues_SeqSELL(Mat mat)
1931 {
1932 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1933
1934 PetscFunctionBegin;
1935 PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1936
1937 /* allocate space for values if not already there */
1938 if (!a->saved_values) PetscCall(PetscMalloc1(a->sliidx[a->totalslices] + 1, &a->saved_values));
1939
1940 /* copy values over */
1941 PetscCall(PetscArraycpy(a->saved_values, a->val, a->sliidx[a->totalslices]));
1942 PetscFunctionReturn(PETSC_SUCCESS);
1943 }
1944
MatRetrieveValues_SeqSELL(Mat mat)1945 static PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat)
1946 {
1947 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1948
1949 PetscFunctionBegin;
1950 PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1951 PetscCheck(a->saved_values, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatStoreValues(A);first");
1952 PetscCall(PetscArraycpy(a->val, a->saved_values, a->sliidx[a->totalslices]));
1953 PetscFunctionReturn(PETSC_SUCCESS);
1954 }
1955
MatSeqSELLGetFillRatio_SeqSELL(Mat mat,PetscReal * ratio)1956 static PetscErrorCode MatSeqSELLGetFillRatio_SeqSELL(Mat mat, PetscReal *ratio)
1957 {
1958 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1959
1960 PetscFunctionBegin;
1961 if (a->totalslices && a->sliidx[a->totalslices]) {
1962 *ratio = (PetscReal)(a->sliidx[a->totalslices] - a->nz) / a->sliidx[a->totalslices];
1963 } else {
1964 *ratio = 0.0;
1965 }
1966 PetscFunctionReturn(PETSC_SUCCESS);
1967 }
1968
MatSeqSELLGetMaxSliceWidth_SeqSELL(Mat mat,PetscInt * slicewidth)1969 static PetscErrorCode MatSeqSELLGetMaxSliceWidth_SeqSELL(Mat mat, PetscInt *slicewidth)
1970 {
1971 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1972 PetscInt i, current_slicewidth;
1973
1974 PetscFunctionBegin;
1975 *slicewidth = 0;
1976 for (i = 0; i < a->totalslices; i++) {
1977 current_slicewidth = (a->sliidx[i + 1] - a->sliidx[i]) / a->sliceheight;
1978 if (current_slicewidth > *slicewidth) *slicewidth = current_slicewidth;
1979 }
1980 PetscFunctionReturn(PETSC_SUCCESS);
1981 }
1982
MatSeqSELLGetAvgSliceWidth_SeqSELL(Mat mat,PetscReal * slicewidth)1983 static PetscErrorCode MatSeqSELLGetAvgSliceWidth_SeqSELL(Mat mat, PetscReal *slicewidth)
1984 {
1985 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1986
1987 PetscFunctionBegin;
1988 *slicewidth = 0;
1989 if (a->totalslices) *slicewidth = (PetscReal)a->sliidx[a->totalslices] / a->sliceheight / a->totalslices;
1990 PetscFunctionReturn(PETSC_SUCCESS);
1991 }
1992
MatSeqSELLGetVarSliceSize_SeqSELL(Mat mat,PetscReal * variance)1993 static PetscErrorCode MatSeqSELLGetVarSliceSize_SeqSELL(Mat mat, PetscReal *variance)
1994 {
1995 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1996 PetscReal mean;
1997 PetscInt i, totalslices = a->totalslices, *sliidx = a->sliidx;
1998
1999 PetscFunctionBegin;
2000 *variance = 0;
2001 if (totalslices) {
2002 mean = (PetscReal)sliidx[totalslices] / totalslices;
2003 for (i = 1; i <= totalslices; i++) *variance += ((PetscReal)(sliidx[i] - sliidx[i - 1]) - mean) * ((PetscReal)(sliidx[i] - sliidx[i - 1]) - mean) / totalslices;
2004 }
2005 PetscFunctionReturn(PETSC_SUCCESS);
2006 }
2007
MatSeqSELLSetSliceHeight_SeqSELL(Mat A,PetscInt sliceheight)2008 static PetscErrorCode MatSeqSELLSetSliceHeight_SeqSELL(Mat A, PetscInt sliceheight)
2009 {
2010 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2011
2012 PetscFunctionBegin;
2013 if (A->preallocated) PetscFunctionReturn(PETSC_SUCCESS);
2014 PetscCheck(a->sliceheight <= 0 || a->sliceheight == sliceheight, PETSC_COMM_SELF, PETSC_ERR_SUP, "Cannot change slice height %" PetscInt_FMT " to %" PetscInt_FMT, a->sliceheight, sliceheight);
2015 a->sliceheight = sliceheight;
2016 #if defined(PETSC_HAVE_CUPM)
2017 PetscCheck(PetscMax(DEVICE_MEM_ALIGN, sliceheight) % PetscMin(DEVICE_MEM_ALIGN, sliceheight) == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The slice height is not compatible with DEVICE_MEM_ALIGN (one must be divisible by the other) %" PetscInt_FMT, sliceheight);
2018 #endif
2019 PetscFunctionReturn(PETSC_SUCCESS);
2020 }
2021
2022 /*@
2023 MatSeqSELLGetFillRatio - returns a ratio that indicates the irregularity of the matrix.
2024
2025 Not Collective
2026
2027 Input Parameter:
2028 . A - a MATSEQSELL matrix
2029
2030 Output Parameter:
2031 . ratio - ratio of number of padded zeros to number of allocated elements
2032
2033 Level: intermediate
2034
2035 .seealso: `MATSEQSELL`, `MatSeqSELLGetAvgSliceWidth()`
2036 @*/
MatSeqSELLGetFillRatio(Mat A,PetscReal * ratio)2037 PetscErrorCode MatSeqSELLGetFillRatio(Mat A, PetscReal *ratio)
2038 {
2039 PetscFunctionBegin;
2040 PetscUseMethod(A, "MatSeqSELLGetFillRatio_C", (Mat, PetscReal *), (A, ratio));
2041 PetscFunctionReturn(PETSC_SUCCESS);
2042 }
2043
2044 /*@
2045 MatSeqSELLGetMaxSliceWidth - returns the maximum slice width.
2046
2047 Not Collective
2048
2049 Input Parameter:
2050 . A - a MATSEQSELL matrix
2051
2052 Output Parameter:
2053 . slicewidth - maximum slice width
2054
2055 Level: intermediate
2056
2057 .seealso: `MATSEQSELL`, `MatSeqSELLGetAvgSliceWidth()`
2058 @*/
MatSeqSELLGetMaxSliceWidth(Mat A,PetscInt * slicewidth)2059 PetscErrorCode MatSeqSELLGetMaxSliceWidth(Mat A, PetscInt *slicewidth)
2060 {
2061 PetscFunctionBegin;
2062 PetscUseMethod(A, "MatSeqSELLGetMaxSliceWidth_C", (Mat, PetscInt *), (A, slicewidth));
2063 PetscFunctionReturn(PETSC_SUCCESS);
2064 }
2065
2066 /*@
2067 MatSeqSELLGetAvgSliceWidth - returns the average slice width.
2068
2069 Not Collective
2070
2071 Input Parameter:
2072 . A - a MATSEQSELL matrix
2073
2074 Output Parameter:
2075 . slicewidth - average slice width
2076
2077 Level: intermediate
2078
2079 .seealso: `MATSEQSELL`, `MatSeqSELLGetMaxSliceWidth()`
2080 @*/
MatSeqSELLGetAvgSliceWidth(Mat A,PetscReal * slicewidth)2081 PetscErrorCode MatSeqSELLGetAvgSliceWidth(Mat A, PetscReal *slicewidth)
2082 {
2083 PetscFunctionBegin;
2084 PetscUseMethod(A, "MatSeqSELLGetAvgSliceWidth_C", (Mat, PetscReal *), (A, slicewidth));
2085 PetscFunctionReturn(PETSC_SUCCESS);
2086 }
2087
2088 /*@
2089 MatSeqSELLSetSliceHeight - sets the slice height.
2090
2091 Not Collective
2092
2093 Input Parameters:
2094 + A - a MATSEQSELL matrix
2095 - sliceheight - slice height
2096
2097 Notes:
2098 You cannot change the slice height once it have been set.
2099
2100 The slice height must be set before MatSetUp() or MatXXXSetPreallocation() is called.
2101
2102 Level: intermediate
2103
2104 .seealso: `MATSEQSELL`, `MatSeqSELLGetVarSliceSize()`
2105 @*/
MatSeqSELLSetSliceHeight(Mat A,PetscInt sliceheight)2106 PetscErrorCode MatSeqSELLSetSliceHeight(Mat A, PetscInt sliceheight)
2107 {
2108 PetscFunctionBegin;
2109 PetscUseMethod(A, "MatSeqSELLSetSliceHeight_C", (Mat, PetscInt), (A, sliceheight));
2110 PetscFunctionReturn(PETSC_SUCCESS);
2111 }
2112
2113 /*@
2114 MatSeqSELLGetVarSliceSize - returns the variance of the slice size.
2115
2116 Not Collective
2117
2118 Input Parameter:
2119 . A - a MATSEQSELL matrix
2120
2121 Output Parameter:
2122 . variance - variance of the slice size
2123
2124 Level: intermediate
2125
2126 .seealso: `MATSEQSELL`, `MatSeqSELLSetSliceHeight()`
2127 @*/
MatSeqSELLGetVarSliceSize(Mat A,PetscReal * variance)2128 PetscErrorCode MatSeqSELLGetVarSliceSize(Mat A, PetscReal *variance)
2129 {
2130 PetscFunctionBegin;
2131 PetscUseMethod(A, "MatSeqSELLGetVarSliceSize_C", (Mat, PetscReal *), (A, variance));
2132 PetscFunctionReturn(PETSC_SUCCESS);
2133 }
2134
2135 #if defined(PETSC_HAVE_CUDA)
2136 PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat);
2137 #endif
2138 #if defined(PETSC_HAVE_HIP)
2139 PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLHIP(Mat);
2140 #endif
2141
MatCreate_SeqSELL(Mat B)2142 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B)
2143 {
2144 Mat_SeqSELL *b;
2145 PetscMPIInt size;
2146
2147 PetscFunctionBegin;
2148 PetscCall(PetscCitationsRegister(citation, &cited));
2149 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
2150 PetscCheck(size <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Comm must be of size 1");
2151
2152 PetscCall(PetscNew(&b));
2153
2154 B->data = (void *)b;
2155 B->ops[0] = MatOps_Values;
2156
2157 b->row = NULL;
2158 b->col = NULL;
2159 b->icol = NULL;
2160 b->reallocs = 0;
2161 b->ignorezeroentries = PETSC_FALSE;
2162 b->roworiented = PETSC_TRUE;
2163 b->nonew = 0;
2164 b->diag = NULL;
2165 b->solve_work = NULL;
2166 B->spptr = NULL;
2167 b->saved_values = NULL;
2168 b->idiag = NULL;
2169 b->mdiag = NULL;
2170 b->ssor_work = NULL;
2171 b->omega = 1.0;
2172 b->fshift = 0.0;
2173 b->keepnonzeropattern = PETSC_FALSE;
2174 b->sliceheight = 0;
2175
2176 PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELL));
2177 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetArray_C", MatSeqSELLGetArray_SeqSELL));
2178 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLRestoreArray_C", MatSeqSELLRestoreArray_SeqSELL));
2179 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqSELL));
2180 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqSELL));
2181 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetPreallocation_C", MatSeqSELLSetPreallocation_SeqSELL));
2182 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqaij_C", MatConvert_SeqSELL_SeqAIJ));
2183 #if defined(PETSC_HAVE_CUDA)
2184 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqsellcuda_C", MatConvert_SeqSELL_SeqSELLCUDA));
2185 #endif
2186 #if defined(PETSC_HAVE_HIP)
2187 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqsellhip_C", MatConvert_SeqSELL_SeqSELLHIP));
2188 #endif
2189 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetFillRatio_C", MatSeqSELLGetFillRatio_SeqSELL));
2190 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetMaxSliceWidth_C", MatSeqSELLGetMaxSliceWidth_SeqSELL));
2191 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetAvgSliceWidth_C", MatSeqSELLGetAvgSliceWidth_SeqSELL));
2192 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetVarSliceSize_C", MatSeqSELLGetVarSliceSize_SeqSELL));
2193 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetSliceHeight_C", MatSeqSELLSetSliceHeight_SeqSELL));
2194
2195 PetscObjectOptionsBegin((PetscObject)B);
2196 {
2197 PetscInt newsh = -1;
2198 PetscBool flg;
2199 #if defined(PETSC_HAVE_CUPM)
2200 PetscInt chunksize = 0;
2201 #endif
2202
2203 PetscCall(PetscOptionsInt("-mat_sell_slice_height", "Set the slice height used to store SELL matrix", "MatSELLSetSliceHeight", newsh, &newsh, &flg));
2204 if (flg) PetscCall(MatSeqSELLSetSliceHeight(B, newsh));
2205 #if defined(PETSC_HAVE_CUPM)
2206 PetscCall(PetscOptionsInt("-mat_sell_chunk_size", "Set the chunksize for load-balanced CUDA/HIP kernels. Choices include 64,128,256,512,1024", NULL, chunksize, &chunksize, &flg));
2207 if (flg) {
2208 PetscCheck(chunksize >= 64 && chunksize <= 1024 && chunksize % 64 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "chunksize must be a number in {64,128,256,512,1024}: value %" PetscInt_FMT, chunksize);
2209 b->chunksize = chunksize;
2210 }
2211 #endif
2212 }
2213 PetscOptionsEnd();
2214 PetscFunctionReturn(PETSC_SUCCESS);
2215 }
2216
2217 /*
2218 Given a matrix generated with MatGetFactor() duplicates all the information in A into B
2219 */
MatDuplicateNoCreate_SeqSELL(Mat C,Mat A,MatDuplicateOption cpvalues,PetscBool mallocmatspace)2220 static PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace)
2221 {
2222 Mat_SeqSELL *c = (Mat_SeqSELL *)C->data, *a = (Mat_SeqSELL *)A->data;
2223 PetscInt i, m = A->rmap->n;
2224 PetscInt totalslices = a->totalslices;
2225
2226 PetscFunctionBegin;
2227 C->factortype = A->factortype;
2228 c->row = NULL;
2229 c->col = NULL;
2230 c->icol = NULL;
2231 c->reallocs = 0;
2232 C->assembled = PETSC_TRUE;
2233
2234 PetscCall(PetscLayoutReference(A->rmap, &C->rmap));
2235 PetscCall(PetscLayoutReference(A->cmap, &C->cmap));
2236
2237 c->sliceheight = a->sliceheight;
2238 PetscCall(PetscMalloc1(c->sliceheight * totalslices, &c->rlen));
2239 PetscCall(PetscMalloc1(totalslices + 1, &c->sliidx));
2240
2241 for (i = 0; i < m; i++) c->rlen[i] = a->rlen[i];
2242 for (i = 0; i < totalslices + 1; i++) c->sliidx[i] = a->sliidx[i];
2243
2244 /* allocate the matrix space */
2245 if (mallocmatspace) {
2246 PetscCall(PetscMalloc2(a->maxallocmat, &c->val, a->maxallocmat, &c->colidx));
2247
2248 c->singlemalloc = PETSC_TRUE;
2249
2250 if (m > 0) {
2251 PetscCall(PetscArraycpy(c->colidx, a->colidx, a->maxallocmat));
2252 if (cpvalues == MAT_COPY_VALUES) {
2253 PetscCall(PetscArraycpy(c->val, a->val, a->maxallocmat));
2254 } else {
2255 PetscCall(PetscArrayzero(c->val, a->maxallocmat));
2256 }
2257 }
2258 }
2259
2260 c->ignorezeroentries = a->ignorezeroentries;
2261 c->roworiented = a->roworiented;
2262 c->nonew = a->nonew;
2263 c->solve_work = NULL;
2264 c->saved_values = NULL;
2265 c->idiag = NULL;
2266 c->ssor_work = NULL;
2267 c->keepnonzeropattern = a->keepnonzeropattern;
2268 c->free_val = PETSC_TRUE;
2269 c->free_colidx = PETSC_TRUE;
2270
2271 c->maxallocmat = a->maxallocmat;
2272 c->maxallocrow = a->maxallocrow;
2273 c->rlenmax = a->rlenmax;
2274 c->nz = a->nz;
2275 C->preallocated = PETSC_TRUE;
2276
2277 c->nonzerorowcnt = a->nonzerorowcnt;
2278 C->nonzerostate = A->nonzerostate;
2279
2280 PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist));
2281 PetscFunctionReturn(PETSC_SUCCESS);
2282 }
2283
MatDuplicate_SeqSELL(Mat A,MatDuplicateOption cpvalues,Mat * B)2284 PetscErrorCode MatDuplicate_SeqSELL(Mat A, MatDuplicateOption cpvalues, Mat *B)
2285 {
2286 PetscFunctionBegin;
2287 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
2288 PetscCall(MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n));
2289 if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2290 PetscCall(MatSetType(*B, ((PetscObject)A)->type_name));
2291 PetscCall(MatDuplicateNoCreate_SeqSELL(*B, A, cpvalues, PETSC_TRUE));
2292 PetscFunctionReturn(PETSC_SUCCESS);
2293 }
2294
2295 /*MC
2296 MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices,
2297 based on the sliced Ellpack format, {cite}`zhangellpack2018`
2298
2299 Options Database Key:
2300 . -mat_type seqsell - sets the matrix type to "`MATSEQELL` during a call to `MatSetFromOptions()`
2301
2302 Level: beginner
2303
2304 .seealso: `Mat`, `MatCreateSeqSELL()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ`
2305 M*/
2306
2307 /*MC
2308 MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices, {cite}`zhangellpack2018`
2309
2310 This matrix type is identical to `MATSEQSELL` when constructed with a single process communicator,
2311 and `MATMPISELL` otherwise. As a result, for single process communicators,
2312 `MatSeqSELLSetPreallocation()` is supported, and similarly `MatMPISELLSetPreallocation()` is supported
2313 for communicators controlling multiple processes. It is recommended that you call both of
2314 the above preallocation routines for simplicity.
2315
2316 Options Database Key:
2317 . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions()
2318
2319 Level: beginner
2320
2321 Notes:
2322 This format is only supported for real scalars, double precision, and 32-bit indices (the defaults).
2323
2324 It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of
2325 non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement.
2326
2327 Developer Notes:
2328 On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance.
2329
2330 The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8
2331 .vb
2332 (2 0 3 4)
2333 Consider the matrix A = (5 0 6 0)
2334 (0 0 7 8)
2335 (0 0 9 9)
2336
2337 symbolically the Ellpack format can be written as
2338
2339 (2 3 4 |) (0 2 3 |)
2340 v = (5 6 0 |) colidx = (0 2 2 |)
2341 -------- ---------
2342 (7 8 |) (2 3 |)
2343 (9 9 |) (2 3 |)
2344
2345 The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case).
2346 Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and
2347 zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice.
2348
2349 The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9) and for colidx is (0 0 2 2 3 2 2 2 3 3)
2350
2351 .ve
2352
2353 See `MatMult_SeqSELL()` for how this format is used with the SIMD operations to achieve high performance.
2354
2355 .seealso: `Mat`, `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSELL()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ`
2356 M*/
2357
2358 /*@
2359 MatCreateSeqSELL - Creates a sparse matrix in `MATSEQSELL` format.
2360
2361 Collective
2362
2363 Input Parameters:
2364 + comm - MPI communicator, set to `PETSC_COMM_SELF`
2365 . m - number of rows
2366 . n - number of columns
2367 . rlenmax - maximum number of nonzeros in a row, ignored if `rlen` is provided
2368 - rlen - array containing the number of nonzeros in the various rows (possibly different for each row) or NULL
2369
2370 Output Parameter:
2371 . A - the matrix
2372
2373 Level: intermediate
2374
2375 Notes:
2376 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
2377 MatXXXXSetPreallocation() paradigm instead of this routine directly.
2378 [MatXXXXSetPreallocation() is, for example, `MatSeqSELLSetPreallocation()`]
2379
2380 Specify the preallocated storage with either `rlenmax` or `rlen` (not both).
2381 Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory
2382 allocation.
2383
2384 .seealso: `Mat`, `MATSEQSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATMPISELL`
2385 @*/
MatCreateSeqSELL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt rlenmax,const PetscInt rlen[],Mat * A)2386 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt rlenmax, const PetscInt rlen[], Mat *A)
2387 {
2388 PetscFunctionBegin;
2389 PetscCall(MatCreate(comm, A));
2390 PetscCall(MatSetSizes(*A, m, n, m, n));
2391 PetscCall(MatSetType(*A, MATSEQSELL));
2392 PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A, rlenmax, rlen));
2393 PetscFunctionReturn(PETSC_SUCCESS);
2394 }
2395
MatEqual_SeqSELL(Mat A,Mat B,PetscBool * flg)2396 PetscErrorCode MatEqual_SeqSELL(Mat A, Mat B, PetscBool *flg)
2397 {
2398 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data, *b = (Mat_SeqSELL *)B->data;
2399 PetscInt totalslices = a->totalslices;
2400
2401 PetscFunctionBegin;
2402 /* If the matrix dimensions are not equal,or no of nonzeros */
2403 if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) || (a->nz != b->nz) || (a->rlenmax != b->rlenmax)) {
2404 *flg = PETSC_FALSE;
2405 PetscFunctionReturn(PETSC_SUCCESS);
2406 }
2407 /* if the a->colidx are the same */
2408 PetscCall(PetscArraycmp(a->colidx, b->colidx, a->sliidx[totalslices], flg));
2409 if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
2410 /* if a->val are the same */
2411 PetscCall(PetscArraycmp(a->val, b->val, a->sliidx[totalslices], flg));
2412 PetscFunctionReturn(PETSC_SUCCESS);
2413 }
2414
MatConjugate_SeqSELL(Mat A)2415 PetscErrorCode MatConjugate_SeqSELL(Mat A)
2416 {
2417 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2418 PetscScalar *val = a->val;
2419
2420 PetscFunctionBegin;
2421 for (PetscInt i = 0; i < a->sliidx[a->totalslices]; i++) val[i] = PetscConj(val[i]);
2422 #if defined(PETSC_HAVE_CUPM)
2423 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU;
2424 #endif
2425 PetscFunctionReturn(PETSC_SUCCESS);
2426 }
2427