1 /* 2 Defines the basic matrix operations for the SELL matrix storage format. 3 */ 4 #include <../src/mat/impls/sell/seq/sell.h> /*I "petscmat.h" I*/ 5 #include <petscblaslapack.h> 6 #include <petsc/private/kernels/blocktranspose.h> 7 8 static PetscBool cited = PETSC_FALSE; 9 static const char citation[] = "@inproceedings{ZhangELLPACK2018,\n" 10 " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n" 11 " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n" 12 " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n" 13 " year = 2018\n" 14 "}\n"; 15 16 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 17 18 #include <immintrin.h> 19 20 #if !defined(_MM_SCALE_8) 21 #define _MM_SCALE_8 8 22 #endif 23 24 #if defined(__AVX512F__) 25 /* these do not work 26 vec_idx = _mm512_loadunpackhi_epi32(vec_idx,acolidx); 27 vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval); 28 */ 29 #define AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \ 30 /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \ 31 vec_idx = _mm256_loadu_si256((__m256i const *)acolidx); \ 32 vec_vals = _mm512_loadu_pd(aval); \ 33 vec_x = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); \ 34 vec_y = _mm512_fmadd_pd(vec_x, vec_vals, vec_y) 35 #elif defined(__AVX2__) && defined(__FMA__) 36 #define AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \ 37 vec_vals = _mm256_loadu_pd(aval); \ 38 vec_idx = _mm_loadu_si128((__m128i const *)acolidx); /* SSE2 */ \ 39 vec_x = _mm256_i32gather_pd(x, vec_idx, _MM_SCALE_8); \ 40 vec_y = _mm256_fmadd_pd(vec_x, vec_vals, vec_y) 41 #endif 42 #endif /* PETSC_HAVE_IMMINTRIN_H */ 43 44 /*@ 45 MatSeqSELLSetPreallocation - For good matrix assembly performance 46 the user should preallocate the matrix storage by setting the parameter `nz` 47 (or the array `nnz`). 48 49 Collective 50 51 Input Parameters: 52 + B - The `MATSEQSELL` matrix 53 . rlenmax - number of nonzeros per row (same for all rows), ignored if `rlen` is provided 54 - rlen - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 55 56 Level: intermediate 57 58 Notes: 59 Specify the preallocated storage with either `rlenmax` or `rlen` (not both). 60 Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory 61 allocation. 62 63 You can call `MatGetInfo()` to get information on how effective the preallocation was; 64 for example the fields mallocs,nz_allocated,nz_used,nz_unneeded; 65 You can also run with the option `-info` and look for messages with the string 66 malloc in them to see if additional memory allocation was needed. 67 68 Developer Notes: 69 Use `rlenmax` of `MAT_SKIP_ALLOCATION` to not allocate any space for the matrix 70 entries or columns indices. 71 72 The maximum number of nonzeos in any row should be as accurate as possible. 73 If it is underestimated, you will get bad performance due to reallocation 74 (`MatSeqXSELLReallocateSELL()`). 75 76 .seealso: `Mat`, `MATSEQSELL`, `MATSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()` 77 @*/ 78 PetscErrorCode MatSeqSELLSetPreallocation(Mat B, PetscInt rlenmax, const PetscInt rlen[]) 79 { 80 PetscFunctionBegin; 81 PetscValidHeaderSpecific(B, MAT_CLASSID, 1); 82 PetscValidType(B, 1); 83 PetscTryMethod(B, "MatSeqSELLSetPreallocation_C", (Mat, PetscInt, const PetscInt[]), (B, rlenmax, rlen)); 84 PetscFunctionReturn(PETSC_SUCCESS); 85 } 86 87 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B, PetscInt maxallocrow, const PetscInt rlen[]) 88 { 89 Mat_SeqSELL *b; 90 PetscInt i, j, totalslices; 91 #if defined(PETSC_HAVE_CUPM) 92 PetscInt rlenmax = 0; 93 #endif 94 PetscBool skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE; 95 96 PetscFunctionBegin; 97 if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE; 98 if (maxallocrow == MAT_SKIP_ALLOCATION) { 99 skipallocation = PETSC_TRUE; 100 maxallocrow = 0; 101 } 102 103 PetscCall(PetscLayoutSetUp(B->rmap)); 104 PetscCall(PetscLayoutSetUp(B->cmap)); 105 106 /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */ 107 if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5; 108 PetscCheck(maxallocrow >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "maxallocrow cannot be less than 0: value %" PetscInt_FMT, maxallocrow); 109 if (rlen) { 110 for (i = 0; i < B->rmap->n; i++) { 111 PetscCheck(rlen[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, rlen[i]); 112 PetscCheck(rlen[i] <= B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT, i, rlen[i], B->cmap->n); 113 } 114 } 115 116 B->preallocated = PETSC_TRUE; 117 118 b = (Mat_SeqSELL *)B->data; 119 120 if (!b->sliceheight) { /* not set yet */ 121 #if defined(PETSC_HAVE_CUPM) 122 b->sliceheight = 16; 123 #else 124 b->sliceheight = 8; 125 #endif 126 } 127 totalslices = PetscCeilInt(B->rmap->n, b->sliceheight); 128 b->totalslices = totalslices; 129 if (!skipallocation) { 130 if (B->rmap->n % b->sliceheight) PetscCall(PetscInfo(B, "Padding rows to the SEQSELL matrix because the number of rows is not the multiple of the slice height (value %" PetscInt_FMT ")\n", B->rmap->n)); 131 132 if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */ 133 PetscCall(PetscMalloc1(totalslices + 1, &b->sliidx)); 134 } 135 if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */ 136 if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10; 137 else if (maxallocrow < 0) maxallocrow = 1; 138 #if defined(PETSC_HAVE_CUPM) 139 rlenmax = maxallocrow; 140 /* Pad the slice to DEVICE_MEM_ALIGN */ 141 while (b->sliceheight * maxallocrow % DEVICE_MEM_ALIGN) maxallocrow++; 142 #endif 143 for (i = 0; i <= totalslices; i++) b->sliidx[i] = b->sliceheight * i * maxallocrow; 144 } else { 145 #if defined(PETSC_HAVE_CUPM) 146 PetscInt mul = DEVICE_MEM_ALIGN / b->sliceheight; 147 #endif 148 maxallocrow = 0; 149 b->sliidx[0] = 0; 150 for (i = 1; i < totalslices; i++) { 151 b->sliidx[i] = 0; 152 for (j = 0; j < b->sliceheight; j++) b->sliidx[i] = PetscMax(b->sliidx[i], rlen[b->sliceheight * (i - 1) + j]); 153 #if defined(PETSC_HAVE_CUPM) 154 if (mul != 0) { /* Pad the slice to DEVICE_MEM_ALIGN if sliceheight < DEVICE_MEM_ALIGN */ 155 rlenmax = PetscMax(b->sliidx[i], rlenmax); 156 b->sliidx[i] = ((b->sliidx[i] - 1) / mul + 1) * mul; 157 } 158 #endif 159 maxallocrow = PetscMax(b->sliidx[i], maxallocrow); 160 PetscCall(PetscIntSumError(b->sliidx[i - 1], b->sliceheight * b->sliidx[i], &b->sliidx[i])); 161 } 162 /* last slice */ 163 b->sliidx[totalslices] = 0; 164 for (j = b->sliceheight * (totalslices - 1); j < B->rmap->n; j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices], rlen[j]); 165 #if defined(PETSC_HAVE_CUPM) 166 if (mul != 0) { 167 rlenmax = PetscMax(b->sliidx[i], rlenmax); 168 b->sliidx[totalslices] = ((b->sliidx[totalslices] - 1) / mul + 1) * mul; 169 } 170 #endif 171 maxallocrow = PetscMax(b->sliidx[totalslices], maxallocrow); 172 b->sliidx[totalslices] = b->sliidx[totalslices - 1] + b->sliceheight * b->sliidx[totalslices]; 173 } 174 175 /* allocate space for val, colidx, rlen */ 176 /* FIXME: should B's old memory be unlogged? */ 177 PetscCall(MatSeqXSELLFreeSELL(B, &b->val, &b->colidx)); 178 /* FIXME: assuming an element of the bit array takes 8 bits */ 179 PetscCall(PetscMalloc2(b->sliidx[totalslices], &b->val, b->sliidx[totalslices], &b->colidx)); 180 /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */ 181 PetscCall(PetscCalloc1(b->sliceheight * totalslices, &b->rlen)); 182 183 b->singlemalloc = PETSC_TRUE; 184 b->free_val = PETSC_TRUE; 185 b->free_colidx = PETSC_TRUE; 186 } else { 187 b->free_val = PETSC_FALSE; 188 b->free_colidx = PETSC_FALSE; 189 } 190 191 b->nz = 0; 192 b->maxallocrow = maxallocrow; 193 #if defined(PETSC_HAVE_CUPM) 194 b->rlenmax = rlenmax; 195 #else 196 b->rlenmax = maxallocrow; 197 #endif 198 b->maxallocmat = b->sliidx[totalslices]; 199 B->info.nz_unneeded = (double)b->maxallocmat; 200 if (realalloc) PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE)); 201 PetscFunctionReturn(PETSC_SUCCESS); 202 } 203 204 static PetscErrorCode MatGetRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v) 205 { 206 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 207 PetscInt shift; 208 209 PetscFunctionBegin; 210 PetscCheck(row >= 0 && row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range", row); 211 if (nz) *nz = a->rlen[row]; 212 shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight); 213 if (!a->getrowcols) PetscCall(PetscMalloc2(a->rlenmax, &a->getrowcols, a->rlenmax, &a->getrowvals)); 214 if (idx) { 215 PetscInt j; 216 for (j = 0; j < a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift + a->sliceheight * j]; 217 *idx = a->getrowcols; 218 } 219 if (v) { 220 PetscInt j; 221 for (j = 0; j < a->rlen[row]; j++) a->getrowvals[j] = a->val[shift + a->sliceheight * j]; 222 *v = a->getrowvals; 223 } 224 PetscFunctionReturn(PETSC_SUCCESS); 225 } 226 227 static PetscErrorCode MatRestoreRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v) 228 { 229 PetscFunctionBegin; 230 PetscFunctionReturn(PETSC_SUCCESS); 231 } 232 233 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat) 234 { 235 Mat B; 236 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 237 PetscInt i; 238 239 PetscFunctionBegin; 240 if (reuse == MAT_REUSE_MATRIX) { 241 B = *newmat; 242 PetscCall(MatZeroEntries(B)); 243 } else { 244 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B)); 245 PetscCall(MatSetSizes(B, A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N)); 246 PetscCall(MatSetType(B, MATSEQAIJ)); 247 PetscCall(MatSeqAIJSetPreallocation(B, 0, a->rlen)); 248 } 249 250 for (i = 0; i < A->rmap->n; i++) { 251 PetscInt nz = 0, *cols = NULL; 252 PetscScalar *vals = NULL; 253 254 PetscCall(MatGetRow_SeqSELL(A, i, &nz, &cols, &vals)); 255 PetscCall(MatSetValues(B, 1, &i, nz, cols, vals, INSERT_VALUES)); 256 PetscCall(MatRestoreRow_SeqSELL(A, i, &nz, &cols, &vals)); 257 } 258 259 PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY)); 260 PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY)); 261 B->rmap->bs = A->rmap->bs; 262 263 if (reuse == MAT_INPLACE_MATRIX) { 264 PetscCall(MatHeaderReplace(A, &B)); 265 } else { 266 *newmat = B; 267 } 268 PetscFunctionReturn(PETSC_SUCCESS); 269 } 270 271 #include <../src/mat/impls/aij/seq/aij.h> 272 273 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A, MatType newtype, MatReuse reuse, Mat *newmat) 274 { 275 Mat B; 276 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 277 PetscInt *ai = a->i, m = A->rmap->N, n = A->cmap->N, i, *rowlengths, row, ncols; 278 const PetscInt *cols; 279 const PetscScalar *vals; 280 281 PetscFunctionBegin; 282 if (reuse == MAT_REUSE_MATRIX) { 283 B = *newmat; 284 } else { 285 if (PetscDefined(USE_DEBUG) || !a->ilen) { 286 PetscCall(PetscMalloc1(m, &rowlengths)); 287 for (i = 0; i < m; i++) rowlengths[i] = ai[i + 1] - ai[i]; 288 } 289 if (PetscDefined(USE_DEBUG) && a->ilen) { 290 PetscBool eq; 291 PetscCall(PetscMemcmp(rowlengths, a->ilen, m * sizeof(PetscInt), &eq)); 292 PetscCheck(eq, PETSC_COMM_SELF, PETSC_ERR_PLIB, "SeqAIJ ilen array incorrect"); 293 PetscCall(PetscFree(rowlengths)); 294 rowlengths = a->ilen; 295 } else if (a->ilen) rowlengths = a->ilen; 296 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B)); 297 PetscCall(MatSetSizes(B, m, n, m, n)); 298 PetscCall(MatSetType(B, MATSEQSELL)); 299 PetscCall(MatSeqSELLSetPreallocation(B, 0, rowlengths)); 300 if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths)); 301 } 302 303 for (row = 0; row < m; row++) { 304 PetscCall(MatGetRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals)); 305 PetscCall(MatSetValues_SeqSELL(B, 1, &row, ncols, cols, vals, INSERT_VALUES)); 306 PetscCall(MatRestoreRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals)); 307 } 308 PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY)); 309 PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY)); 310 B->rmap->bs = A->rmap->bs; 311 312 if (reuse == MAT_INPLACE_MATRIX) { 313 PetscCall(MatHeaderReplace(A, &B)); 314 } else { 315 *newmat = B; 316 } 317 PetscFunctionReturn(PETSC_SUCCESS); 318 } 319 320 PetscErrorCode MatMult_SeqSELL(Mat A, Vec xx, Vec yy) 321 { 322 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 323 PetscScalar *y; 324 const PetscScalar *x; 325 const MatScalar *aval = a->val; 326 PetscInt totalslices = a->totalslices; 327 const PetscInt *acolidx = a->colidx; 328 PetscInt i, j; 329 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 330 __m512d vec_x, vec_y, vec_vals; 331 __m256i vec_idx; 332 __mmask8 mask; 333 __m512d vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4; 334 __m256i vec_idx2, vec_idx3, vec_idx4; 335 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 336 __m128i vec_idx; 337 __m256d vec_x, vec_y, vec_y2, vec_vals; 338 MatScalar yval; 339 PetscInt r, rows_left, row, nnz_in_row; 340 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 341 __m128d vec_x_tmp; 342 __m256d vec_x, vec_y, vec_y2, vec_vals; 343 MatScalar yval; 344 PetscInt r, rows_left, row, nnz_in_row; 345 #else 346 PetscInt k, sliceheight = a->sliceheight; 347 PetscScalar *sum; 348 #endif 349 350 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 351 #pragma disjoint(*x, *y, *aval) 352 #endif 353 354 PetscFunctionBegin; 355 PetscCall(VecGetArrayRead(xx, &x)); 356 PetscCall(VecGetArray(yy, &y)); 357 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 358 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 359 for (i = 0; i < totalslices; i++) { /* loop over slices */ 360 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 361 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 362 363 vec_y = _mm512_setzero_pd(); 364 vec_y2 = _mm512_setzero_pd(); 365 vec_y3 = _mm512_setzero_pd(); 366 vec_y4 = _mm512_setzero_pd(); 367 368 j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */ 369 switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) { 370 case 3: 371 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 372 acolidx += 8; 373 aval += 8; 374 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 375 acolidx += 8; 376 aval += 8; 377 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 378 acolidx += 8; 379 aval += 8; 380 j += 3; 381 break; 382 case 2: 383 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 384 acolidx += 8; 385 aval += 8; 386 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 387 acolidx += 8; 388 aval += 8; 389 j += 2; 390 break; 391 case 1: 392 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 393 acolidx += 8; 394 aval += 8; 395 j += 1; 396 break; 397 } 398 #pragma novector 399 for (; j < (a->sliidx[i + 1] >> 3); j += 4) { 400 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 401 acolidx += 8; 402 aval += 8; 403 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 404 acolidx += 8; 405 aval += 8; 406 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 407 acolidx += 8; 408 aval += 8; 409 AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4); 410 acolidx += 8; 411 aval += 8; 412 } 413 414 vec_y = _mm512_add_pd(vec_y, vec_y2); 415 vec_y = _mm512_add_pd(vec_y, vec_y3); 416 vec_y = _mm512_add_pd(vec_y, vec_y4); 417 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 418 mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07))); 419 _mm512_mask_storeu_pd(&y[8 * i], mask, vec_y); 420 } else { 421 _mm512_storeu_pd(&y[8 * i], vec_y); 422 } 423 } 424 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 425 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 426 for (i = 0; i < totalslices; i++) { /* loop over full slices */ 427 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 428 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 429 430 /* last slice may have padding rows. Don't use vectorization. */ 431 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 432 rows_left = A->rmap->n - 8 * i; 433 for (r = 0; r < rows_left; ++r) { 434 yval = (MatScalar)0; 435 row = 8 * i + r; 436 nnz_in_row = a->rlen[row]; 437 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]]; 438 y[row] = yval; 439 } 440 break; 441 } 442 443 vec_y = _mm256_setzero_pd(); 444 vec_y2 = _mm256_setzero_pd(); 445 446 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 447 #pragma novector 448 #pragma unroll(2) 449 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 450 AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 451 aval += 4; 452 acolidx += 4; 453 AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y2); 454 aval += 4; 455 acolidx += 4; 456 } 457 458 _mm256_storeu_pd(y + i * 8, vec_y); 459 _mm256_storeu_pd(y + i * 8 + 4, vec_y2); 460 } 461 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 462 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 463 for (i = 0; i < totalslices; i++) { /* loop over full slices */ 464 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 465 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 466 467 vec_y = _mm256_setzero_pd(); 468 vec_y2 = _mm256_setzero_pd(); 469 470 /* last slice may have padding rows. Don't use vectorization. */ 471 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 472 rows_left = A->rmap->n - 8 * i; 473 for (r = 0; r < rows_left; ++r) { 474 yval = (MatScalar)0; 475 row = 8 * i + r; 476 nnz_in_row = a->rlen[row]; 477 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]]; 478 y[row] = yval; 479 } 480 break; 481 } 482 483 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 484 #pragma novector 485 #pragma unroll(2) 486 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 487 vec_vals = _mm256_loadu_pd(aval); 488 vec_x_tmp = _mm_setzero_pd(); 489 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 490 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 491 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 492 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 493 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 494 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 495 vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y); 496 aval += 4; 497 498 vec_vals = _mm256_loadu_pd(aval); 499 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 500 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 501 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 502 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 503 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 504 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 505 vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2); 506 aval += 4; 507 } 508 509 _mm256_storeu_pd(y + i * 8, vec_y); 510 _mm256_storeu_pd(y + i * 8 + 4, vec_y2); 511 } 512 #else 513 PetscCall(PetscMalloc1(sliceheight, &sum)); 514 for (i = 0; i < totalslices; i++) { /* loop over slices */ 515 for (j = 0; j < sliceheight; j++) { 516 sum[j] = 0.0; 517 for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]]; 518 } 519 if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { /* if last slice has padding rows */ 520 for (j = 0; j < (A->rmap->n % sliceheight); j++) y[sliceheight * i + j] = sum[j]; 521 } else { 522 for (j = 0; j < sliceheight; j++) y[sliceheight * i + j] = sum[j]; 523 } 524 } 525 PetscCall(PetscFree(sum)); 526 #endif 527 528 PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); /* theoretical minimal FLOPs */ 529 PetscCall(VecRestoreArrayRead(xx, &x)); 530 PetscCall(VecRestoreArray(yy, &y)); 531 PetscFunctionReturn(PETSC_SUCCESS); 532 } 533 534 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h> 535 PetscErrorCode MatMultAdd_SeqSELL(Mat A, Vec xx, Vec yy, Vec zz) 536 { 537 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 538 PetscScalar *y, *z; 539 const PetscScalar *x; 540 const MatScalar *aval = a->val; 541 PetscInt totalslices = a->totalslices; 542 const PetscInt *acolidx = a->colidx; 543 PetscInt i, j; 544 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 545 __m512d vec_x, vec_y, vec_vals; 546 __m256i vec_idx; 547 __mmask8 mask = 0; 548 __m512d vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4; 549 __m256i vec_idx2, vec_idx3, vec_idx4; 550 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 551 __m128d vec_x_tmp; 552 __m256d vec_x, vec_y, vec_y2, vec_vals; 553 MatScalar yval; 554 PetscInt r, row, nnz_in_row; 555 #else 556 PetscInt k, sliceheight = a->sliceheight; 557 PetscScalar *sum; 558 #endif 559 560 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 561 #pragma disjoint(*x, *y, *aval) 562 #endif 563 564 PetscFunctionBegin; 565 if (!a->nz) { 566 PetscCall(VecCopy(yy, zz)); 567 PetscFunctionReturn(PETSC_SUCCESS); 568 } 569 PetscCall(VecGetArrayRead(xx, &x)); 570 PetscCall(VecGetArrayPair(yy, zz, &y, &z)); 571 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 572 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 573 for (i = 0; i < totalslices; i++) { /* loop over slices */ 574 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 575 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 576 577 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 578 mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07))); 579 vec_y = _mm512_mask_loadu_pd(vec_y, mask, &y[8 * i]); 580 } else { 581 vec_y = _mm512_loadu_pd(&y[8 * i]); 582 } 583 vec_y2 = _mm512_setzero_pd(); 584 vec_y3 = _mm512_setzero_pd(); 585 vec_y4 = _mm512_setzero_pd(); 586 587 j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */ 588 switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) { 589 case 3: 590 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 591 acolidx += 8; 592 aval += 8; 593 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 594 acolidx += 8; 595 aval += 8; 596 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 597 acolidx += 8; 598 aval += 8; 599 j += 3; 600 break; 601 case 2: 602 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 603 acolidx += 8; 604 aval += 8; 605 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 606 acolidx += 8; 607 aval += 8; 608 j += 2; 609 break; 610 case 1: 611 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 612 acolidx += 8; 613 aval += 8; 614 j += 1; 615 break; 616 } 617 #pragma novector 618 for (; j < (a->sliidx[i + 1] >> 3); j += 4) { 619 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 620 acolidx += 8; 621 aval += 8; 622 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 623 acolidx += 8; 624 aval += 8; 625 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 626 acolidx += 8; 627 aval += 8; 628 AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4); 629 acolidx += 8; 630 aval += 8; 631 } 632 633 vec_y = _mm512_add_pd(vec_y, vec_y2); 634 vec_y = _mm512_add_pd(vec_y, vec_y3); 635 vec_y = _mm512_add_pd(vec_y, vec_y4); 636 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 637 _mm512_mask_storeu_pd(&z[8 * i], mask, vec_y); 638 } else { 639 _mm512_storeu_pd(&z[8 * i], vec_y); 640 } 641 } 642 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 643 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 644 for (i = 0; i < totalslices; i++) { /* loop over full slices */ 645 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 646 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 647 648 /* last slice may have padding rows. Don't use vectorization. */ 649 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 650 for (r = 0; r < (A->rmap->n & 0x07); ++r) { 651 row = 8 * i + r; 652 yval = (MatScalar)0.0; 653 nnz_in_row = a->rlen[row]; 654 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]]; 655 z[row] = y[row] + yval; 656 } 657 break; 658 } 659 660 vec_y = _mm256_loadu_pd(y + 8 * i); 661 vec_y2 = _mm256_loadu_pd(y + 8 * i + 4); 662 663 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 664 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 665 vec_vals = _mm256_loadu_pd(aval); 666 vec_x_tmp = _mm_setzero_pd(); 667 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 668 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 669 vec_x = _mm256_setzero_pd(); 670 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 671 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 672 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 673 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 674 vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y); 675 aval += 4; 676 677 vec_vals = _mm256_loadu_pd(aval); 678 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 679 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 680 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 681 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 682 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 683 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 684 vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2); 685 aval += 4; 686 } 687 688 _mm256_storeu_pd(z + i * 8, vec_y); 689 _mm256_storeu_pd(z + i * 8 + 4, vec_y2); 690 } 691 #else 692 PetscCall(PetscMalloc1(sliceheight, &sum)); 693 for (i = 0; i < totalslices; i++) { /* loop over slices */ 694 for (j = 0; j < sliceheight; j++) { 695 sum[j] = 0.0; 696 for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]]; 697 } 698 if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { 699 for (j = 0; j < (A->rmap->n % sliceheight); j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j]; 700 } else { 701 for (j = 0; j < sliceheight; j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j]; 702 } 703 } 704 PetscCall(PetscFree(sum)); 705 #endif 706 707 PetscCall(PetscLogFlops(2.0 * a->nz)); 708 PetscCall(VecRestoreArrayRead(xx, &x)); 709 PetscCall(VecRestoreArrayPair(yy, zz, &y, &z)); 710 PetscFunctionReturn(PETSC_SUCCESS); 711 } 712 713 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A, Vec xx, Vec zz, Vec yy) 714 { 715 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 716 PetscScalar *y; 717 const PetscScalar *x; 718 const MatScalar *aval = a->val; 719 const PetscInt *acolidx = a->colidx; 720 PetscInt i, j, r, row, nnz_in_row, totalslices = a->totalslices, sliceheight = a->sliceheight; 721 722 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 723 #pragma disjoint(*x, *y, *aval) 724 #endif 725 726 PetscFunctionBegin; 727 if (A->symmetric == PETSC_BOOL3_TRUE) { 728 PetscCall(MatMultAdd_SeqSELL(A, xx, zz, yy)); 729 PetscFunctionReturn(PETSC_SUCCESS); 730 } 731 if (zz != yy) PetscCall(VecCopy(zz, yy)); 732 733 if (a->nz) { 734 PetscCall(VecGetArrayRead(xx, &x)); 735 PetscCall(VecGetArray(yy, &y)); 736 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 737 if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { 738 for (r = 0; r < (A->rmap->n % sliceheight); ++r) { 739 row = sliceheight * i + r; 740 nnz_in_row = a->rlen[row]; 741 for (j = 0; j < nnz_in_row; ++j) y[acolidx[sliceheight * j + r]] += aval[sliceheight * j + r] * x[row]; 742 } 743 break; 744 } 745 for (r = 0; r < sliceheight; ++r) 746 for (j = a->sliidx[i] + r; j < a->sliidx[i + 1]; j += sliceheight) y[acolidx[j]] += aval[j] * x[sliceheight * i + r]; 747 } 748 PetscCall(PetscLogFlops(2.0 * a->nz)); 749 PetscCall(VecRestoreArrayRead(xx, &x)); 750 PetscCall(VecRestoreArray(yy, &y)); 751 } 752 PetscFunctionReturn(PETSC_SUCCESS); 753 } 754 755 PetscErrorCode MatMultTranspose_SeqSELL(Mat A, Vec xx, Vec yy) 756 { 757 PetscFunctionBegin; 758 if (A->symmetric == PETSC_BOOL3_TRUE) { 759 PetscCall(MatMult_SeqSELL(A, xx, yy)); 760 } else { 761 PetscCall(VecSet(yy, 0.0)); 762 PetscCall(MatMultTransposeAdd_SeqSELL(A, xx, yy, yy)); 763 } 764 PetscFunctionReturn(PETSC_SUCCESS); 765 } 766 767 /* 768 Checks for missing diagonals 769 */ 770 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A, PetscBool *missing, PetscInt *d) 771 { 772 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 773 PetscInt *diag, i; 774 775 PetscFunctionBegin; 776 *missing = PETSC_FALSE; 777 if (A->rmap->n > 0 && !a->colidx) { 778 *missing = PETSC_TRUE; 779 if (d) *d = 0; 780 PetscCall(PetscInfo(A, "Matrix has no entries therefore is missing diagonal\n")); 781 } else { 782 diag = a->diag; 783 for (i = 0; i < A->rmap->n; i++) { 784 if (diag[i] == -1) { 785 *missing = PETSC_TRUE; 786 if (d) *d = i; 787 PetscCall(PetscInfo(A, "Matrix is missing diagonal number %" PetscInt_FMT "\n", i)); 788 break; 789 } 790 } 791 } 792 PetscFunctionReturn(PETSC_SUCCESS); 793 } 794 795 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A) 796 { 797 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 798 PetscInt i, j, m = A->rmap->n, shift; 799 800 PetscFunctionBegin; 801 if (!a->diag) { 802 PetscCall(PetscMalloc1(m, &a->diag)); 803 a->free_diag = PETSC_TRUE; 804 } 805 for (i = 0; i < m; i++) { /* loop over rows */ 806 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 807 a->diag[i] = -1; 808 for (j = 0; j < a->rlen[i]; j++) { 809 if (a->colidx[shift + a->sliceheight * j] == i) { 810 a->diag[i] = shift + a->sliceheight * j; 811 break; 812 } 813 } 814 } 815 PetscFunctionReturn(PETSC_SUCCESS); 816 } 817 818 /* 819 Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways 820 */ 821 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A, PetscScalar omega, PetscScalar fshift) 822 { 823 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 824 PetscInt i, *diag, m = A->rmap->n; 825 MatScalar *val = a->val; 826 PetscScalar *idiag, *mdiag; 827 828 PetscFunctionBegin; 829 if (a->idiagvalid) PetscFunctionReturn(PETSC_SUCCESS); 830 PetscCall(MatMarkDiagonal_SeqSELL(A)); 831 diag = a->diag; 832 if (!a->idiag) { 833 PetscCall(PetscMalloc3(m, &a->idiag, m, &a->mdiag, m, &a->ssor_work)); 834 val = a->val; 835 } 836 mdiag = a->mdiag; 837 idiag = a->idiag; 838 839 if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) { 840 for (i = 0; i < m; i++) { 841 mdiag[i] = val[diag[i]]; 842 if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */ 843 PetscCheck(PetscRealPart(fshift), PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Zero diagonal on row %" PetscInt_FMT, i); 844 PetscCall(PetscInfo(A, "Zero diagonal on row %" PetscInt_FMT "\n", i)); 845 A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 846 A->factorerror_zeropivot_value = 0.0; 847 A->factorerror_zeropivot_row = i; 848 } 849 idiag[i] = 1.0 / val[diag[i]]; 850 } 851 PetscCall(PetscLogFlops(m)); 852 } else { 853 for (i = 0; i < m; i++) { 854 mdiag[i] = val[diag[i]]; 855 idiag[i] = omega / (fshift + val[diag[i]]); 856 } 857 PetscCall(PetscLogFlops(2.0 * m)); 858 } 859 a->idiagvalid = PETSC_TRUE; 860 PetscFunctionReturn(PETSC_SUCCESS); 861 } 862 863 PetscErrorCode MatZeroEntries_SeqSELL(Mat A) 864 { 865 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 866 867 PetscFunctionBegin; 868 PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices])); 869 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 870 PetscFunctionReturn(PETSC_SUCCESS); 871 } 872 873 PetscErrorCode MatDestroy_SeqSELL(Mat A) 874 { 875 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 876 877 PetscFunctionBegin; 878 PetscCall(PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->n, A->cmap->n, a->nz)); 879 PetscCall(MatSeqXSELLFreeSELL(A, &a->val, &a->colidx)); 880 PetscCall(ISDestroy(&a->row)); 881 PetscCall(ISDestroy(&a->col)); 882 PetscCall(PetscFree(a->diag)); 883 PetscCall(PetscFree(a->rlen)); 884 PetscCall(PetscFree(a->sliidx)); 885 PetscCall(PetscFree3(a->idiag, a->mdiag, a->ssor_work)); 886 PetscCall(PetscFree(a->solve_work)); 887 PetscCall(ISDestroy(&a->icol)); 888 PetscCall(PetscFree(a->saved_values)); 889 PetscCall(PetscFree2(a->getrowcols, a->getrowvals)); 890 PetscCall(PetscFree(A->data)); 891 #if defined(PETSC_HAVE_CUPM) 892 PetscCall(PetscFree(a->chunk_slice_map)); 893 #endif 894 895 PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL)); 896 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL)); 897 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL)); 898 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetPreallocation_C", NULL)); 899 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetArray_C", NULL)); 900 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLRestoreArray_C", NULL)); 901 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqaij_C", NULL)); 902 #if defined(PETSC_HAVE_CUDA) 903 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqsellcuda_C", NULL)); 904 #endif 905 #if defined(PETSC_HAVE_HIP) 906 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqsellhip_C", NULL)); 907 #endif 908 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetFillRatio_C", NULL)); 909 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetMaxSliceWidth_C", NULL)); 910 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetAvgSliceWidth_C", NULL)); 911 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetVarSliceSize_C", NULL)); 912 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetSliceHeight_C", NULL)); 913 PetscFunctionReturn(PETSC_SUCCESS); 914 } 915 916 PetscErrorCode MatSetOption_SeqSELL(Mat A, MatOption op, PetscBool flg) 917 { 918 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 919 920 PetscFunctionBegin; 921 switch (op) { 922 case MAT_ROW_ORIENTED: 923 a->roworiented = flg; 924 break; 925 case MAT_KEEP_NONZERO_PATTERN: 926 a->keepnonzeropattern = flg; 927 break; 928 case MAT_NEW_NONZERO_LOCATIONS: 929 a->nonew = (flg ? 0 : 1); 930 break; 931 case MAT_NEW_NONZERO_LOCATION_ERR: 932 a->nonew = (flg ? -1 : 0); 933 break; 934 case MAT_NEW_NONZERO_ALLOCATION_ERR: 935 a->nonew = (flg ? -2 : 0); 936 break; 937 case MAT_UNUSED_NONZERO_LOCATION_ERR: 938 a->nounused = (flg ? -1 : 0); 939 break; 940 default: 941 break; 942 } 943 PetscFunctionReturn(PETSC_SUCCESS); 944 } 945 946 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A, Vec v) 947 { 948 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 949 PetscInt i, j, n, shift; 950 PetscScalar *x, zero = 0.0; 951 952 PetscFunctionBegin; 953 PetscCall(VecGetLocalSize(v, &n)); 954 PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 955 956 if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) { 957 PetscInt *diag = a->diag; 958 PetscCall(VecGetArray(v, &x)); 959 for (i = 0; i < n; i++) x[i] = 1.0 / a->val[diag[i]]; 960 PetscCall(VecRestoreArray(v, &x)); 961 PetscFunctionReturn(PETSC_SUCCESS); 962 } 963 964 PetscCall(VecSet(v, zero)); 965 PetscCall(VecGetArray(v, &x)); 966 for (i = 0; i < n; i++) { /* loop over rows */ 967 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 968 x[i] = 0; 969 for (j = 0; j < a->rlen[i]; j++) { 970 if (a->colidx[shift + a->sliceheight * j] == i) { 971 x[i] = a->val[shift + a->sliceheight * j]; 972 break; 973 } 974 } 975 } 976 PetscCall(VecRestoreArray(v, &x)); 977 PetscFunctionReturn(PETSC_SUCCESS); 978 } 979 980 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A, Vec ll, Vec rr) 981 { 982 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 983 const PetscScalar *l, *r; 984 PetscInt i, j, m, n, row; 985 986 PetscFunctionBegin; 987 if (ll) { 988 /* The local size is used so that VecMPI can be passed to this routine 989 by MatDiagonalScale_MPISELL */ 990 PetscCall(VecGetLocalSize(ll, &m)); 991 PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 992 PetscCall(VecGetArrayRead(ll, &l)); 993 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 994 if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */ 995 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) { 996 if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= l[a->sliceheight * i + row]; 997 } 998 } else { 999 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) a->val[j] *= l[a->sliceheight * i + row]; 1000 } 1001 } 1002 PetscCall(VecRestoreArrayRead(ll, &l)); 1003 PetscCall(PetscLogFlops(a->nz)); 1004 } 1005 if (rr) { 1006 PetscCall(VecGetLocalSize(rr, &n)); 1007 PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 1008 PetscCall(VecGetArrayRead(rr, &r)); 1009 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 1010 if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */ 1011 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) % a->sliceheight)) { 1012 if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= r[a->colidx[j]]; 1013 } 1014 } else { 1015 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j++) a->val[j] *= r[a->colidx[j]]; 1016 } 1017 } 1018 PetscCall(VecRestoreArrayRead(rr, &r)); 1019 PetscCall(PetscLogFlops(a->nz)); 1020 } 1021 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 1022 #if defined(PETSC_HAVE_CUPM) 1023 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU; 1024 #endif 1025 PetscFunctionReturn(PETSC_SUCCESS); 1026 } 1027 1028 PetscErrorCode MatGetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[]) 1029 { 1030 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1031 PetscInt *cp, i, k, low, high, t, row, col, l; 1032 PetscInt shift; 1033 MatScalar *vp; 1034 1035 PetscFunctionBegin; 1036 for (k = 0; k < m; k++) { /* loop over requested rows */ 1037 row = im[k]; 1038 if (row < 0) continue; 1039 PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1); 1040 shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight); /* starting index of the row */ 1041 cp = a->colidx + shift; /* pointer to the row */ 1042 vp = a->val + shift; /* pointer to the row */ 1043 for (l = 0; l < n; l++) { /* loop over requested columns */ 1044 col = in[l]; 1045 if (col < 0) continue; 1046 PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1); 1047 high = a->rlen[row]; 1048 low = 0; /* assume unsorted */ 1049 while (high - low > 5) { 1050 t = (low + high) / 2; 1051 if (*(cp + a->sliceheight * t) > col) high = t; 1052 else low = t; 1053 } 1054 for (i = low; i < high; i++) { 1055 if (*(cp + a->sliceheight * i) > col) break; 1056 if (*(cp + a->sliceheight * i) == col) { 1057 *v++ = *(vp + a->sliceheight * i); 1058 goto finished; 1059 } 1060 } 1061 *v++ = 0.0; 1062 finished:; 1063 } 1064 } 1065 PetscFunctionReturn(PETSC_SUCCESS); 1066 } 1067 1068 static PetscErrorCode MatView_SeqSELL_ASCII(Mat A, PetscViewer viewer) 1069 { 1070 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1071 PetscInt i, j, m = A->rmap->n, shift; 1072 const char *name; 1073 PetscViewerFormat format; 1074 1075 PetscFunctionBegin; 1076 PetscCall(PetscViewerGetFormat(viewer, &format)); 1077 if (format == PETSC_VIEWER_ASCII_MATLAB) { 1078 PetscInt nofinalvalue = 0; 1079 /* 1080 if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) nofinalvalue = 1; 1081 */ 1082 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1083 PetscCall(PetscViewerASCIIPrintf(viewer, "%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n", m, A->cmap->n)); 1084 PetscCall(PetscViewerASCIIPrintf(viewer, "%% Nonzeros = %" PetscInt_FMT " \n", a->nz)); 1085 #if defined(PETSC_USE_COMPLEX) 1086 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",4);\n", a->nz + nofinalvalue)); 1087 #else 1088 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",3);\n", a->nz + nofinalvalue)); 1089 #endif 1090 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = [\n")); 1091 1092 for (i = 0; i < m; i++) { 1093 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1094 for (j = 0; j < a->rlen[i]; j++) { 1095 #if defined(PETSC_USE_COMPLEX) 1096 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1097 #else 1098 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)a->val[shift + a->sliceheight * j])); 1099 #endif 1100 } 1101 } 1102 /* 1103 if (nofinalvalue) { 1104 #if defined(PETSC_USE_COMPLEX) 1105 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n",m,A->cmap->n,0.,0.)); 1106 #else 1107 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n",m,A->cmap->n,0.0)); 1108 #endif 1109 } 1110 */ 1111 PetscCall(PetscObjectGetName((PetscObject)A, &name)); 1112 PetscCall(PetscViewerASCIIPrintf(viewer, "];\n %s = spconvert(zzz);\n", name)); 1113 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1114 } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) { 1115 PetscFunctionReturn(PETSC_SUCCESS); 1116 } else if (format == PETSC_VIEWER_ASCII_COMMON) { 1117 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1118 for (i = 0; i < m; i++) { 1119 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1120 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1121 for (j = 0; j < a->rlen[i]; j++) { 1122 #if defined(PETSC_USE_COMPLEX) 1123 if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) { 1124 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1125 } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) { 1126 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1127 } else if (PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) { 1128 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]))); 1129 } 1130 #else 1131 if (a->val[shift + a->sliceheight * j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j])); 1132 #endif 1133 } 1134 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1135 } 1136 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1137 } else if (format == PETSC_VIEWER_ASCII_DENSE) { 1138 PetscInt cnt = 0, jcnt; 1139 PetscScalar value; 1140 #if defined(PETSC_USE_COMPLEX) 1141 PetscBool realonly = PETSC_TRUE; 1142 for (i = 0; i < a->sliidx[a->totalslices]; i++) { 1143 if (PetscImaginaryPart(a->val[i]) != 0.0) { 1144 realonly = PETSC_FALSE; 1145 break; 1146 } 1147 } 1148 #endif 1149 1150 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1151 for (i = 0; i < m; i++) { 1152 jcnt = 0; 1153 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1154 for (j = 0; j < A->cmap->n; j++) { 1155 if (jcnt < a->rlen[i] && j == a->colidx[shift + a->sliceheight * j]) { 1156 value = a->val[cnt++]; 1157 jcnt++; 1158 } else { 1159 value = 0.0; 1160 } 1161 #if defined(PETSC_USE_COMPLEX) 1162 if (realonly) { 1163 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)PetscRealPart(value))); 1164 } else { 1165 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e+%7.5e i ", (double)PetscRealPart(value), (double)PetscImaginaryPart(value))); 1166 } 1167 #else 1168 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)value)); 1169 #endif 1170 } 1171 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1172 } 1173 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1174 } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) { 1175 PetscInt fshift = 1; 1176 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1177 #if defined(PETSC_USE_COMPLEX) 1178 PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate complex general\n")); 1179 #else 1180 PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate real general\n")); 1181 #endif 1182 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz)); 1183 for (i = 0; i < m; i++) { 1184 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1185 for (j = 0; j < a->rlen[i]; j++) { 1186 #if defined(PETSC_USE_COMPLEX) 1187 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1188 #else 1189 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)a->val[shift + a->sliceheight * j])); 1190 #endif 1191 } 1192 } 1193 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1194 } else if (format == PETSC_VIEWER_NATIVE) { 1195 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 1196 PetscInt row; 1197 PetscCall(PetscViewerASCIIPrintf(viewer, "slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n", i, a->sliidx[i], a->sliidx[i + 1])); 1198 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) { 1199 #if defined(PETSC_USE_COMPLEX) 1200 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1201 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1202 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1203 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), -(double)PetscImaginaryPart(a->val[j]))); 1204 } else { 1205 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]))); 1206 } 1207 #else 1208 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)a->val[j])); 1209 #endif 1210 } 1211 } 1212 } else { 1213 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1214 if (A->factortype) { 1215 for (i = 0; i < m; i++) { 1216 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1217 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1218 /* L part */ 1219 for (j = shift; j < a->diag[i]; j += a->sliceheight) { 1220 #if defined(PETSC_USE_COMPLEX) 1221 if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0) { 1222 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1223 } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0) { 1224 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j])))); 1225 } else { 1226 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j]))); 1227 } 1228 #else 1229 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j])); 1230 #endif 1231 } 1232 /* diagonal */ 1233 j = a->diag[i]; 1234 #if defined(PETSC_USE_COMPLEX) 1235 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1236 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)PetscImaginaryPart(1.0 / a->val[j]))); 1237 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1238 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)(-PetscImaginaryPart(1.0 / a->val[j])))); 1239 } else { 1240 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]))); 1241 } 1242 #else 1243 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)(1 / a->val[j]))); 1244 #endif 1245 1246 /* U part */ 1247 for (j = a->diag[i] + 1; j < shift + a->sliceheight * a->rlen[i]; j += a->sliceheight) { 1248 #if defined(PETSC_USE_COMPLEX) 1249 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1250 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1251 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1252 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j])))); 1253 } else { 1254 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j]))); 1255 } 1256 #else 1257 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j])); 1258 #endif 1259 } 1260 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1261 } 1262 } else { 1263 for (i = 0; i < m; i++) { 1264 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1265 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1266 for (j = 0; j < a->rlen[i]; j++) { 1267 #if defined(PETSC_USE_COMPLEX) 1268 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1269 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1270 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1271 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1272 } else { 1273 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]))); 1274 } 1275 #else 1276 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j])); 1277 #endif 1278 } 1279 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1280 } 1281 } 1282 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1283 } 1284 PetscCall(PetscViewerFlush(viewer)); 1285 PetscFunctionReturn(PETSC_SUCCESS); 1286 } 1287 1288 #include <petscdraw.h> 1289 static PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw, void *Aa) 1290 { 1291 Mat A = (Mat)Aa; 1292 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1293 PetscInt i, j, m = A->rmap->n, shift; 1294 int color; 1295 PetscReal xl, yl, xr, yr, x_l, x_r, y_l, y_r; 1296 PetscViewer viewer; 1297 PetscViewerFormat format; 1298 1299 PetscFunctionBegin; 1300 PetscCall(PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer)); 1301 PetscCall(PetscViewerGetFormat(viewer, &format)); 1302 PetscCall(PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr)); 1303 1304 /* loop over matrix elements drawing boxes */ 1305 1306 if (format != PETSC_VIEWER_DRAW_CONTOUR) { 1307 PetscDrawCollectiveBegin(draw); 1308 /* Blue for negative, Cyan for zero and Red for positive */ 1309 color = PETSC_DRAW_BLUE; 1310 for (i = 0; i < m; i++) { 1311 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1312 y_l = m - i - 1.0; 1313 y_r = y_l + 1.0; 1314 for (j = 0; j < a->rlen[i]; j++) { 1315 x_l = a->colidx[shift + a->sliceheight * j]; 1316 x_r = x_l + 1.0; 1317 if (PetscRealPart(a->val[shift + a->sliceheight * j]) >= 0.) continue; 1318 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1319 } 1320 } 1321 color = PETSC_DRAW_CYAN; 1322 for (i = 0; i < m; i++) { 1323 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1324 y_l = m - i - 1.0; 1325 y_r = y_l + 1.0; 1326 for (j = 0; j < a->rlen[i]; j++) { 1327 x_l = a->colidx[shift + a->sliceheight * j]; 1328 x_r = x_l + 1.0; 1329 if (a->val[shift + a->sliceheight * j] != 0.) continue; 1330 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1331 } 1332 } 1333 color = PETSC_DRAW_RED; 1334 for (i = 0; i < m; i++) { 1335 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1336 y_l = m - i - 1.0; 1337 y_r = y_l + 1.0; 1338 for (j = 0; j < a->rlen[i]; j++) { 1339 x_l = a->colidx[shift + a->sliceheight * j]; 1340 x_r = x_l + 1.0; 1341 if (PetscRealPart(a->val[shift + a->sliceheight * j]) <= 0.) continue; 1342 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1343 } 1344 } 1345 PetscDrawCollectiveEnd(draw); 1346 } else { 1347 /* use contour shading to indicate magnitude of values */ 1348 /* first determine max of all nonzero values */ 1349 PetscReal minv = 0.0, maxv = 0.0; 1350 PetscInt count = 0; 1351 PetscDraw popup; 1352 for (i = 0; i < a->sliidx[a->totalslices]; i++) { 1353 if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]); 1354 } 1355 if (minv >= maxv) maxv = minv + PETSC_SMALL; 1356 PetscCall(PetscDrawGetPopup(draw, &popup)); 1357 PetscCall(PetscDrawScalePopup(popup, minv, maxv)); 1358 1359 PetscDrawCollectiveBegin(draw); 1360 for (i = 0; i < m; i++) { 1361 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1362 y_l = m - i - 1.0; 1363 y_r = y_l + 1.0; 1364 for (j = 0; j < a->rlen[i]; j++) { 1365 x_l = a->colidx[shift + a->sliceheight * j]; 1366 x_r = x_l + 1.0; 1367 color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]), minv, maxv); 1368 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1369 count++; 1370 } 1371 } 1372 PetscDrawCollectiveEnd(draw); 1373 } 1374 PetscFunctionReturn(PETSC_SUCCESS); 1375 } 1376 1377 #include <petscdraw.h> 1378 static PetscErrorCode MatView_SeqSELL_Draw(Mat A, PetscViewer viewer) 1379 { 1380 PetscDraw draw; 1381 PetscReal xr, yr, xl, yl, h, w; 1382 PetscBool isnull; 1383 1384 PetscFunctionBegin; 1385 PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw)); 1386 PetscCall(PetscDrawIsNull(draw, &isnull)); 1387 if (isnull) PetscFunctionReturn(PETSC_SUCCESS); 1388 1389 xr = A->cmap->n; 1390 yr = A->rmap->n; 1391 h = yr / 10.0; 1392 w = xr / 10.0; 1393 xr += w; 1394 yr += h; 1395 xl = -w; 1396 yl = -h; 1397 PetscCall(PetscDrawSetCoordinates(draw, xl, yl, xr, yr)); 1398 PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer)); 1399 PetscCall(PetscDrawZoom(draw, MatView_SeqSELL_Draw_Zoom, A)); 1400 PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL)); 1401 PetscCall(PetscDrawSave(draw)); 1402 PetscFunctionReturn(PETSC_SUCCESS); 1403 } 1404 1405 PetscErrorCode MatView_SeqSELL(Mat A, PetscViewer viewer) 1406 { 1407 PetscBool isascii, isbinary, isdraw; 1408 1409 PetscFunctionBegin; 1410 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &isascii)); 1411 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary)); 1412 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw)); 1413 if (isascii) { 1414 PetscCall(MatView_SeqSELL_ASCII(A, viewer)); 1415 } else if (isbinary) { 1416 /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */ 1417 } else if (isdraw) PetscCall(MatView_SeqSELL_Draw(A, viewer)); 1418 PetscFunctionReturn(PETSC_SUCCESS); 1419 } 1420 1421 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A, MatAssemblyType mode) 1422 { 1423 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1424 PetscInt i, shift, row_in_slice, row, nrow, *cp, lastcol, j, k; 1425 MatScalar *vp; 1426 #if defined(PETSC_HAVE_CUPM) 1427 PetscInt totalchunks = 0; 1428 #endif 1429 1430 PetscFunctionBegin; 1431 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 1432 /* To do: compress out the unused elements */ 1433 PetscCall(MatMarkDiagonal_SeqSELL(A)); 1434 PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n", A->rmap->n, A->cmap->n, a->maxallocmat, a->sliidx[a->totalslices], a->nz, a->sliidx[a->totalslices] - a->nz)); 1435 PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n", a->reallocs)); 1436 PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rlenmax)); 1437 a->nonzerorowcnt = 0; 1438 /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */ 1439 for (i = 0; i < a->totalslices; ++i) { 1440 shift = a->sliidx[i]; /* starting index of the slice */ 1441 cp = PetscSafePointerPlusOffset(a->colidx, shift); /* pointer to the column indices of the slice */ 1442 vp = PetscSafePointerPlusOffset(a->val, shift); /* pointer to the nonzero values of the slice */ 1443 for (row_in_slice = 0; row_in_slice < a->sliceheight; ++row_in_slice) { /* loop over rows in the slice */ 1444 row = a->sliceheight * i + row_in_slice; 1445 nrow = a->rlen[row]; /* number of nonzeros in row */ 1446 /* 1447 Search for the nearest nonzero. Normally setting the index to zero may cause extra communication. 1448 But if the entire slice are empty, it is fine to use 0 since the index will not be loaded. 1449 */ 1450 lastcol = 0; 1451 if (nrow > 0) { /* nonempty row */ 1452 a->nonzerorowcnt++; 1453 lastcol = cp[a->sliceheight * (nrow - 1) + row_in_slice]; /* use the index from the last nonzero at current row */ 1454 } else if (!row_in_slice) { /* first row of the correct slice is empty */ 1455 for (j = 1; j < a->sliceheight; j++) { 1456 if (a->rlen[a->sliceheight * i + j]) { 1457 lastcol = cp[j]; 1458 break; 1459 } 1460 } 1461 } else { 1462 if (a->sliidx[i + 1] != shift) lastcol = cp[row_in_slice - 1]; /* use the index from the previous row */ 1463 } 1464 1465 for (k = nrow; k < (a->sliidx[i + 1] - shift) / a->sliceheight; ++k) { 1466 cp[a->sliceheight * k + row_in_slice] = lastcol; 1467 vp[a->sliceheight * k + row_in_slice] = (MatScalar)0; 1468 } 1469 } 1470 } 1471 1472 A->info.mallocs += a->reallocs; 1473 a->reallocs = 0; 1474 1475 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 1476 #if defined(PETSC_HAVE_CUPM) 1477 if (!a->chunksize && a->totalslices) { 1478 a->chunksize = 64; 1479 while (a->chunksize < 1024 && 2 * a->chunksize <= a->sliidx[a->totalslices] / a->totalslices) a->chunksize *= 2; 1480 totalchunks = 1 + (a->sliidx[a->totalslices] - 1) / a->chunksize; 1481 } 1482 if (totalchunks != a->totalchunks) { 1483 PetscCall(PetscFree(a->chunk_slice_map)); 1484 PetscCall(PetscMalloc1(totalchunks, &a->chunk_slice_map)); 1485 a->totalchunks = totalchunks; 1486 } 1487 j = 0; 1488 for (i = 0; i < totalchunks; i++) { 1489 while (a->sliidx[j + 1] <= i * a->chunksize && j < a->totalslices) j++; 1490 a->chunk_slice_map[i] = j; 1491 } 1492 #endif 1493 PetscFunctionReturn(PETSC_SUCCESS); 1494 } 1495 1496 PetscErrorCode MatGetInfo_SeqSELL(Mat A, MatInfoType flag, MatInfo *info) 1497 { 1498 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1499 1500 PetscFunctionBegin; 1501 info->block_size = 1.0; 1502 info->nz_allocated = a->maxallocmat; 1503 info->nz_used = a->sliidx[a->totalslices]; /* include padding zeros */ 1504 info->nz_unneeded = (a->maxallocmat - a->sliidx[a->totalslices]); 1505 info->assemblies = A->num_ass; 1506 info->mallocs = A->info.mallocs; 1507 info->memory = 0; /* REVIEW ME */ 1508 if (A->factortype) { 1509 info->fill_ratio_given = A->info.fill_ratio_given; 1510 info->fill_ratio_needed = A->info.fill_ratio_needed; 1511 info->factor_mallocs = A->info.factor_mallocs; 1512 } else { 1513 info->fill_ratio_given = 0; 1514 info->fill_ratio_needed = 0; 1515 info->factor_mallocs = 0; 1516 } 1517 PetscFunctionReturn(PETSC_SUCCESS); 1518 } 1519 1520 PetscErrorCode MatSetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is) 1521 { 1522 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1523 PetscInt shift, i, k, l, low, high, t, ii, row, col, nrow; 1524 PetscInt *cp, nonew = a->nonew, lastcol = -1; 1525 MatScalar *vp, value; 1526 #if defined(PETSC_HAVE_CUPM) 1527 PetscBool inserted = PETSC_FALSE; 1528 PetscInt mul = DEVICE_MEM_ALIGN / a->sliceheight; 1529 #endif 1530 1531 PetscFunctionBegin; 1532 for (k = 0; k < m; k++) { /* loop over added rows */ 1533 row = im[k]; 1534 if (row < 0) continue; 1535 PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1); 1536 shift = a->sliidx[row / a->sliceheight] + row % a->sliceheight; /* starting index of the row */ 1537 cp = a->colidx + shift; /* pointer to the row */ 1538 vp = a->val + shift; /* pointer to the row */ 1539 nrow = a->rlen[row]; 1540 low = 0; 1541 high = nrow; 1542 1543 for (l = 0; l < n; l++) { /* loop over added columns */ 1544 col = in[l]; 1545 if (col < 0) continue; 1546 PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1); 1547 if (a->roworiented) { 1548 value = v[l + k * n]; 1549 } else { 1550 value = v[k + l * m]; 1551 } 1552 if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue; 1553 1554 /* search in this row for the specified column, i indicates the column to be set */ 1555 if (col <= lastcol) low = 0; 1556 else high = nrow; 1557 lastcol = col; 1558 while (high - low > 5) { 1559 t = (low + high) / 2; 1560 if (*(cp + a->sliceheight * t) > col) high = t; 1561 else low = t; 1562 } 1563 for (i = low; i < high; i++) { 1564 if (*(cp + a->sliceheight * i) > col) break; 1565 if (*(cp + a->sliceheight * i) == col) { 1566 if (is == ADD_VALUES) *(vp + a->sliceheight * i) += value; 1567 else *(vp + a->sliceheight * i) = value; 1568 #if defined(PETSC_HAVE_CUPM) 1569 inserted = PETSC_TRUE; 1570 #endif 1571 low = i + 1; 1572 goto noinsert; 1573 } 1574 } 1575 if (value == 0.0 && a->ignorezeroentries) goto noinsert; 1576 if (nonew == 1) goto noinsert; 1577 PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col); 1578 #if defined(PETSC_HAVE_CUPM) 1579 MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar, mul); 1580 #else 1581 /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */ 1582 MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar, 1); 1583 #endif 1584 /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */ 1585 for (ii = nrow - 1; ii >= i; ii--) { 1586 *(cp + a->sliceheight * (ii + 1)) = *(cp + a->sliceheight * ii); 1587 *(vp + a->sliceheight * (ii + 1)) = *(vp + a->sliceheight * ii); 1588 } 1589 a->rlen[row]++; 1590 *(cp + a->sliceheight * i) = col; 1591 *(vp + a->sliceheight * i) = value; 1592 a->nz++; 1593 #if defined(PETSC_HAVE_CUPM) 1594 inserted = PETSC_TRUE; 1595 #endif 1596 low = i + 1; 1597 high++; 1598 nrow++; 1599 noinsert:; 1600 } 1601 a->rlen[row] = nrow; 1602 } 1603 #if defined(PETSC_HAVE_CUPM) 1604 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && inserted) A->offloadmask = PETSC_OFFLOAD_CPU; 1605 #endif 1606 PetscFunctionReturn(PETSC_SUCCESS); 1607 } 1608 1609 PetscErrorCode MatCopy_SeqSELL(Mat A, Mat B, MatStructure str) 1610 { 1611 PetscFunctionBegin; 1612 /* If the two matrices have the same copy implementation, use fast copy. */ 1613 if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) { 1614 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1615 Mat_SeqSELL *b = (Mat_SeqSELL *)B->data; 1616 1617 PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices], PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Number of nonzeros in two matrices are different"); 1618 PetscCall(PetscArraycpy(b->val, a->val, a->sliidx[a->totalslices])); 1619 } else { 1620 PetscCall(MatCopy_Basic(A, B, str)); 1621 } 1622 PetscFunctionReturn(PETSC_SUCCESS); 1623 } 1624 1625 PetscErrorCode MatSetUp_SeqSELL(Mat A) 1626 { 1627 PetscFunctionBegin; 1628 PetscCall(MatSeqSELLSetPreallocation(A, PETSC_DEFAULT, NULL)); 1629 PetscFunctionReturn(PETSC_SUCCESS); 1630 } 1631 1632 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A, PetscScalar *array[]) 1633 { 1634 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1635 1636 PetscFunctionBegin; 1637 *array = a->val; 1638 PetscFunctionReturn(PETSC_SUCCESS); 1639 } 1640 1641 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A, PetscScalar *array[]) 1642 { 1643 PetscFunctionBegin; 1644 PetscFunctionReturn(PETSC_SUCCESS); 1645 } 1646 1647 PetscErrorCode MatScale_SeqSELL(Mat inA, PetscScalar alpha) 1648 { 1649 Mat_SeqSELL *a = (Mat_SeqSELL *)inA->data; 1650 MatScalar *aval = a->val; 1651 PetscScalar oalpha = alpha; 1652 PetscBLASInt one = 1, size; 1653 1654 PetscFunctionBegin; 1655 PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices], &size)); 1656 PetscCallBLAS("BLASscal", BLASscal_(&size, &oalpha, aval, &one)); 1657 PetscCall(PetscLogFlops(a->nz)); 1658 PetscCall(MatSeqSELLInvalidateDiagonal(inA)); 1659 #if defined(PETSC_HAVE_CUPM) 1660 if (inA->offloadmask != PETSC_OFFLOAD_UNALLOCATED) inA->offloadmask = PETSC_OFFLOAD_CPU; 1661 #endif 1662 PetscFunctionReturn(PETSC_SUCCESS); 1663 } 1664 1665 PetscErrorCode MatShift_SeqSELL(Mat Y, PetscScalar a) 1666 { 1667 Mat_SeqSELL *y = (Mat_SeqSELL *)Y->data; 1668 1669 PetscFunctionBegin; 1670 if (!Y->preallocated || !y->nz) PetscCall(MatSeqSELLSetPreallocation(Y, 1, NULL)); 1671 PetscCall(MatShift_Basic(Y, a)); 1672 PetscFunctionReturn(PETSC_SUCCESS); 1673 } 1674 1675 PetscErrorCode MatSOR_SeqSELL(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx) 1676 { 1677 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1678 PetscScalar *x, sum, *t; 1679 const MatScalar *idiag = NULL, *mdiag; 1680 const PetscScalar *b, *xb; 1681 PetscInt n, m = A->rmap->n, i, j, shift; 1682 const PetscInt *diag; 1683 1684 PetscFunctionBegin; 1685 its = its * lits; 1686 1687 if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */ 1688 if (!a->idiagvalid) PetscCall(MatInvertDiagonal_SeqSELL(A, omega, fshift)); 1689 a->fshift = fshift; 1690 a->omega = omega; 1691 1692 diag = a->diag; 1693 t = a->ssor_work; 1694 idiag = a->idiag; 1695 mdiag = a->mdiag; 1696 1697 PetscCall(VecGetArray(xx, &x)); 1698 PetscCall(VecGetArrayRead(bb, &b)); 1699 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 1700 PetscCheck(flag != SOR_APPLY_UPPER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_UPPER is not implemented"); 1701 PetscCheck(flag != SOR_APPLY_LOWER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_LOWER is not implemented"); 1702 PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat"); 1703 1704 if (flag & SOR_ZERO_INITIAL_GUESS) { 1705 if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) { 1706 for (i = 0; i < m; i++) { 1707 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1708 sum = b[i]; 1709 n = (diag[i] - shift) / a->sliceheight; 1710 for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]]; 1711 t[i] = sum; 1712 x[i] = sum * idiag[i]; 1713 } 1714 xb = t; 1715 PetscCall(PetscLogFlops(a->nz)); 1716 } else xb = b; 1717 if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1718 for (i = m - 1; i >= 0; i--) { 1719 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1720 sum = xb[i]; 1721 n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1; 1722 for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]]; 1723 if (xb == b) { 1724 x[i] = sum * idiag[i]; 1725 } else { 1726 x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */ 1727 } 1728 } 1729 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */ 1730 } 1731 its--; 1732 } 1733 while (its--) { 1734 if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) { 1735 for (i = 0; i < m; i++) { 1736 /* lower */ 1737 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1738 sum = b[i]; 1739 n = (diag[i] - shift) / a->sliceheight; 1740 for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]]; 1741 t[i] = sum; /* save application of the lower-triangular part */ 1742 /* upper */ 1743 n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1; 1744 for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]]; 1745 x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */ 1746 } 1747 xb = t; 1748 PetscCall(PetscLogFlops(2.0 * a->nz)); 1749 } else xb = b; 1750 if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1751 for (i = m - 1; i >= 0; i--) { 1752 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1753 sum = xb[i]; 1754 if (xb == b) { 1755 /* whole matrix (no checkpointing available) */ 1756 n = a->rlen[i]; 1757 for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]]; 1758 x[i] = (1. - omega) * x[i] + (sum + mdiag[i] * x[i]) * idiag[i]; 1759 } else { /* lower-triangular part has been saved, so only apply upper-triangular */ 1760 n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1; 1761 for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]]; 1762 x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */ 1763 } 1764 } 1765 if (xb == b) { 1766 PetscCall(PetscLogFlops(2.0 * a->nz)); 1767 } else { 1768 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */ 1769 } 1770 } 1771 } 1772 PetscCall(VecRestoreArray(xx, &x)); 1773 PetscCall(VecRestoreArrayRead(bb, &b)); 1774 PetscFunctionReturn(PETSC_SUCCESS); 1775 } 1776 1777 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL, 1778 MatGetRow_SeqSELL, 1779 MatRestoreRow_SeqSELL, 1780 MatMult_SeqSELL, 1781 /* 4*/ MatMultAdd_SeqSELL, 1782 MatMultTranspose_SeqSELL, 1783 MatMultTransposeAdd_SeqSELL, 1784 NULL, 1785 NULL, 1786 NULL, 1787 /* 10*/ NULL, 1788 NULL, 1789 NULL, 1790 MatSOR_SeqSELL, 1791 NULL, 1792 /* 15*/ MatGetInfo_SeqSELL, 1793 MatEqual_SeqSELL, 1794 MatGetDiagonal_SeqSELL, 1795 MatDiagonalScale_SeqSELL, 1796 NULL, 1797 /* 20*/ NULL, 1798 MatAssemblyEnd_SeqSELL, 1799 MatSetOption_SeqSELL, 1800 MatZeroEntries_SeqSELL, 1801 /* 24*/ NULL, 1802 NULL, 1803 NULL, 1804 NULL, 1805 NULL, 1806 /* 29*/ MatSetUp_SeqSELL, 1807 NULL, 1808 NULL, 1809 NULL, 1810 NULL, 1811 /* 34*/ MatDuplicate_SeqSELL, 1812 NULL, 1813 NULL, 1814 NULL, 1815 NULL, 1816 /* 39*/ NULL, 1817 NULL, 1818 NULL, 1819 MatGetValues_SeqSELL, 1820 MatCopy_SeqSELL, 1821 /* 44*/ NULL, 1822 MatScale_SeqSELL, 1823 MatShift_SeqSELL, 1824 NULL, 1825 NULL, 1826 /* 49*/ NULL, 1827 NULL, 1828 NULL, 1829 NULL, 1830 NULL, 1831 /* 54*/ MatFDColoringCreate_SeqXAIJ, 1832 NULL, 1833 NULL, 1834 NULL, 1835 NULL, 1836 /* 59*/ NULL, 1837 MatDestroy_SeqSELL, 1838 MatView_SeqSELL, 1839 NULL, 1840 NULL, 1841 /* 64*/ NULL, 1842 NULL, 1843 NULL, 1844 NULL, 1845 NULL, 1846 /* 69*/ NULL, 1847 NULL, 1848 NULL, 1849 MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */ 1850 NULL, 1851 /* 74*/ NULL, 1852 NULL, 1853 NULL, 1854 NULL, 1855 NULL, 1856 /* 79*/ NULL, 1857 NULL, 1858 NULL, 1859 NULL, 1860 NULL, 1861 /* 84*/ NULL, 1862 NULL, 1863 NULL, 1864 NULL, 1865 NULL, 1866 /* 89*/ NULL, 1867 NULL, 1868 NULL, 1869 NULL, 1870 MatConjugate_SeqSELL, 1871 /* 94*/ NULL, 1872 NULL, 1873 NULL, 1874 NULL, 1875 NULL, 1876 /* 99*/ NULL, 1877 NULL, 1878 NULL, 1879 NULL, 1880 NULL, 1881 /*104*/ MatMissingDiagonal_SeqSELL, 1882 NULL, 1883 NULL, 1884 NULL, 1885 NULL, 1886 /*109*/ NULL, 1887 NULL, 1888 NULL, 1889 NULL, 1890 NULL, 1891 /*114*/ NULL, 1892 NULL, 1893 NULL, 1894 NULL, 1895 NULL, 1896 /*119*/ NULL, 1897 NULL, 1898 NULL, 1899 NULL, 1900 NULL, 1901 /*124*/ NULL, 1902 NULL, 1903 NULL, 1904 NULL, 1905 NULL, 1906 /*129*/ MatFDColoringSetUp_SeqXAIJ, 1907 NULL, 1908 NULL, 1909 NULL, 1910 NULL, 1911 /*134*/ NULL, 1912 NULL, 1913 NULL, 1914 NULL, 1915 NULL, 1916 /*139*/ NULL, 1917 NULL, 1918 NULL, 1919 NULL, 1920 NULL, 1921 NULL}; 1922 1923 static PetscErrorCode MatStoreValues_SeqSELL(Mat mat) 1924 { 1925 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1926 1927 PetscFunctionBegin; 1928 PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first"); 1929 1930 /* allocate space for values if not already there */ 1931 if (!a->saved_values) PetscCall(PetscMalloc1(a->sliidx[a->totalslices] + 1, &a->saved_values)); 1932 1933 /* copy values over */ 1934 PetscCall(PetscArraycpy(a->saved_values, a->val, a->sliidx[a->totalslices])); 1935 PetscFunctionReturn(PETSC_SUCCESS); 1936 } 1937 1938 static PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat) 1939 { 1940 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1941 1942 PetscFunctionBegin; 1943 PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first"); 1944 PetscCheck(a->saved_values, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatStoreValues(A);first"); 1945 PetscCall(PetscArraycpy(a->val, a->saved_values, a->sliidx[a->totalslices])); 1946 PetscFunctionReturn(PETSC_SUCCESS); 1947 } 1948 1949 static PetscErrorCode MatSeqSELLGetFillRatio_SeqSELL(Mat mat, PetscReal *ratio) 1950 { 1951 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1952 1953 PetscFunctionBegin; 1954 if (a->totalslices && a->sliidx[a->totalslices]) { 1955 *ratio = (PetscReal)(a->sliidx[a->totalslices] - a->nz) / a->sliidx[a->totalslices]; 1956 } else { 1957 *ratio = 0.0; 1958 } 1959 PetscFunctionReturn(PETSC_SUCCESS); 1960 } 1961 1962 static PetscErrorCode MatSeqSELLGetMaxSliceWidth_SeqSELL(Mat mat, PetscInt *slicewidth) 1963 { 1964 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1965 PetscInt i, current_slicewidth; 1966 1967 PetscFunctionBegin; 1968 *slicewidth = 0; 1969 for (i = 0; i < a->totalslices; i++) { 1970 current_slicewidth = (a->sliidx[i + 1] - a->sliidx[i]) / a->sliceheight; 1971 if (current_slicewidth > *slicewidth) *slicewidth = current_slicewidth; 1972 } 1973 PetscFunctionReturn(PETSC_SUCCESS); 1974 } 1975 1976 static PetscErrorCode MatSeqSELLGetAvgSliceWidth_SeqSELL(Mat mat, PetscReal *slicewidth) 1977 { 1978 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1979 1980 PetscFunctionBegin; 1981 *slicewidth = 0; 1982 if (a->totalslices) *slicewidth = (PetscReal)a->sliidx[a->totalslices] / a->sliceheight / a->totalslices; 1983 PetscFunctionReturn(PETSC_SUCCESS); 1984 } 1985 1986 static PetscErrorCode MatSeqSELLGetVarSliceSize_SeqSELL(Mat mat, PetscReal *variance) 1987 { 1988 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1989 PetscReal mean; 1990 PetscInt i, totalslices = a->totalslices, *sliidx = a->sliidx; 1991 1992 PetscFunctionBegin; 1993 *variance = 0; 1994 if (totalslices) { 1995 mean = (PetscReal)sliidx[totalslices] / totalslices; 1996 for (i = 1; i <= totalslices; i++) *variance += ((PetscReal)(sliidx[i] - sliidx[i - 1]) - mean) * ((PetscReal)(sliidx[i] - sliidx[i - 1]) - mean) / totalslices; 1997 } 1998 PetscFunctionReturn(PETSC_SUCCESS); 1999 } 2000 2001 static PetscErrorCode MatSeqSELLSetSliceHeight_SeqSELL(Mat A, PetscInt sliceheight) 2002 { 2003 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 2004 2005 PetscFunctionBegin; 2006 if (A->preallocated) PetscFunctionReturn(PETSC_SUCCESS); 2007 PetscCheck(a->sliceheight <= 0 || a->sliceheight == sliceheight, PETSC_COMM_SELF, PETSC_ERR_SUP, "Cannot change slice height %" PetscInt_FMT " to %" PetscInt_FMT, a->sliceheight, sliceheight); 2008 a->sliceheight = sliceheight; 2009 #if defined(PETSC_HAVE_CUPM) 2010 PetscCheck(PetscMax(DEVICE_MEM_ALIGN, sliceheight) % PetscMin(DEVICE_MEM_ALIGN, sliceheight) == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The slice height is not compatible with DEVICE_MEM_ALIGN (one must be divisible by the other) %" PetscInt_FMT, sliceheight); 2011 #endif 2012 PetscFunctionReturn(PETSC_SUCCESS); 2013 } 2014 2015 /*@ 2016 MatSeqSELLGetFillRatio - returns a ratio that indicates the irregularity of the matrix. 2017 2018 Not Collective 2019 2020 Input Parameter: 2021 . A - a MATSEQSELL matrix 2022 2023 Output Parameter: 2024 . ratio - ratio of number of padded zeros to number of allocated elements 2025 2026 Level: intermediate 2027 2028 .seealso: `MATSEQSELL`, `MatSeqSELLGetAvgSliceWidth()` 2029 @*/ 2030 PetscErrorCode MatSeqSELLGetFillRatio(Mat A, PetscReal *ratio) 2031 { 2032 PetscFunctionBegin; 2033 PetscUseMethod(A, "MatSeqSELLGetFillRatio_C", (Mat, PetscReal *), (A, ratio)); 2034 PetscFunctionReturn(PETSC_SUCCESS); 2035 } 2036 2037 /*@ 2038 MatSeqSELLGetMaxSliceWidth - returns the maximum slice width. 2039 2040 Not Collective 2041 2042 Input Parameter: 2043 . A - a MATSEQSELL matrix 2044 2045 Output Parameter: 2046 . slicewidth - maximum slice width 2047 2048 Level: intermediate 2049 2050 .seealso: `MATSEQSELL`, `MatSeqSELLGetAvgSliceWidth()` 2051 @*/ 2052 PetscErrorCode MatSeqSELLGetMaxSliceWidth(Mat A, PetscInt *slicewidth) 2053 { 2054 PetscFunctionBegin; 2055 PetscUseMethod(A, "MatSeqSELLGetMaxSliceWidth_C", (Mat, PetscInt *), (A, slicewidth)); 2056 PetscFunctionReturn(PETSC_SUCCESS); 2057 } 2058 2059 /*@ 2060 MatSeqSELLGetAvgSliceWidth - returns the average slice width. 2061 2062 Not Collective 2063 2064 Input Parameter: 2065 . A - a MATSEQSELL matrix 2066 2067 Output Parameter: 2068 . slicewidth - average slice width 2069 2070 Level: intermediate 2071 2072 .seealso: `MATSEQSELL`, `MatSeqSELLGetMaxSliceWidth()` 2073 @*/ 2074 PetscErrorCode MatSeqSELLGetAvgSliceWidth(Mat A, PetscReal *slicewidth) 2075 { 2076 PetscFunctionBegin; 2077 PetscUseMethod(A, "MatSeqSELLGetAvgSliceWidth_C", (Mat, PetscReal *), (A, slicewidth)); 2078 PetscFunctionReturn(PETSC_SUCCESS); 2079 } 2080 2081 /*@ 2082 MatSeqSELLSetSliceHeight - sets the slice height. 2083 2084 Not Collective 2085 2086 Input Parameters: 2087 + A - a MATSEQSELL matrix 2088 - sliceheight - slice height 2089 2090 Notes: 2091 You cannot change the slice height once it have been set. 2092 2093 The slice height must be set before MatSetUp() or MatXXXSetPreallocation() is called. 2094 2095 Level: intermediate 2096 2097 .seealso: `MATSEQSELL`, `MatSeqSELLGetVarSliceSize()` 2098 @*/ 2099 PetscErrorCode MatSeqSELLSetSliceHeight(Mat A, PetscInt sliceheight) 2100 { 2101 PetscFunctionBegin; 2102 PetscUseMethod(A, "MatSeqSELLSetSliceHeight_C", (Mat, PetscInt), (A, sliceheight)); 2103 PetscFunctionReturn(PETSC_SUCCESS); 2104 } 2105 2106 /*@ 2107 MatSeqSELLGetVarSliceSize - returns the variance of the slice size. 2108 2109 Not Collective 2110 2111 Input Parameter: 2112 . A - a MATSEQSELL matrix 2113 2114 Output Parameter: 2115 . variance - variance of the slice size 2116 2117 Level: intermediate 2118 2119 .seealso: `MATSEQSELL`, `MatSeqSELLSetSliceHeight()` 2120 @*/ 2121 PetscErrorCode MatSeqSELLGetVarSliceSize(Mat A, PetscReal *variance) 2122 { 2123 PetscFunctionBegin; 2124 PetscUseMethod(A, "MatSeqSELLGetVarSliceSize_C", (Mat, PetscReal *), (A, variance)); 2125 PetscFunctionReturn(PETSC_SUCCESS); 2126 } 2127 2128 #if defined(PETSC_HAVE_CUDA) 2129 PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat); 2130 #endif 2131 #if defined(PETSC_HAVE_HIP) 2132 PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLHIP(Mat); 2133 #endif 2134 2135 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B) 2136 { 2137 Mat_SeqSELL *b; 2138 PetscMPIInt size; 2139 2140 PetscFunctionBegin; 2141 PetscCall(PetscCitationsRegister(citation, &cited)); 2142 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size)); 2143 PetscCheck(size <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Comm must be of size 1"); 2144 2145 PetscCall(PetscNew(&b)); 2146 2147 B->data = (void *)b; 2148 B->ops[0] = MatOps_Values; 2149 2150 b->row = NULL; 2151 b->col = NULL; 2152 b->icol = NULL; 2153 b->reallocs = 0; 2154 b->ignorezeroentries = PETSC_FALSE; 2155 b->roworiented = PETSC_TRUE; 2156 b->nonew = 0; 2157 b->diag = NULL; 2158 b->solve_work = NULL; 2159 B->spptr = NULL; 2160 b->saved_values = NULL; 2161 b->idiag = NULL; 2162 b->mdiag = NULL; 2163 b->ssor_work = NULL; 2164 b->omega = 1.0; 2165 b->fshift = 0.0; 2166 b->idiagvalid = PETSC_FALSE; 2167 b->keepnonzeropattern = PETSC_FALSE; 2168 b->sliceheight = 0; 2169 2170 PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELL)); 2171 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetArray_C", MatSeqSELLGetArray_SeqSELL)); 2172 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLRestoreArray_C", MatSeqSELLRestoreArray_SeqSELL)); 2173 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqSELL)); 2174 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqSELL)); 2175 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetPreallocation_C", MatSeqSELLSetPreallocation_SeqSELL)); 2176 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqaij_C", MatConvert_SeqSELL_SeqAIJ)); 2177 #if defined(PETSC_HAVE_CUDA) 2178 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqsellcuda_C", MatConvert_SeqSELL_SeqSELLCUDA)); 2179 #endif 2180 #if defined(PETSC_HAVE_HIP) 2181 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqsellhip_C", MatConvert_SeqSELL_SeqSELLHIP)); 2182 #endif 2183 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetFillRatio_C", MatSeqSELLGetFillRatio_SeqSELL)); 2184 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetMaxSliceWidth_C", MatSeqSELLGetMaxSliceWidth_SeqSELL)); 2185 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetAvgSliceWidth_C", MatSeqSELLGetAvgSliceWidth_SeqSELL)); 2186 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetVarSliceSize_C", MatSeqSELLGetVarSliceSize_SeqSELL)); 2187 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetSliceHeight_C", MatSeqSELLSetSliceHeight_SeqSELL)); 2188 2189 PetscObjectOptionsBegin((PetscObject)B); 2190 { 2191 PetscInt newsh = -1; 2192 PetscBool flg; 2193 #if defined(PETSC_HAVE_CUPM) 2194 PetscInt chunksize = 0; 2195 #endif 2196 2197 PetscCall(PetscOptionsInt("-mat_sell_slice_height", "Set the slice height used to store SELL matrix", "MatSELLSetSliceHeight", newsh, &newsh, &flg)); 2198 if (flg) PetscCall(MatSeqSELLSetSliceHeight(B, newsh)); 2199 #if defined(PETSC_HAVE_CUPM) 2200 PetscCall(PetscOptionsInt("-mat_sell_chunk_size", "Set the chunksize for load-balanced CUDA/HIP kernels. Choices include 64,128,256,512,1024", NULL, chunksize, &chunksize, &flg)); 2201 if (flg) { 2202 PetscCheck(chunksize >= 64 && chunksize <= 1024 && chunksize % 64 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "chunksize must be a number in {64,128,256,512,1024}: value %" PetscInt_FMT, chunksize); 2203 b->chunksize = chunksize; 2204 } 2205 #endif 2206 } 2207 PetscOptionsEnd(); 2208 PetscFunctionReturn(PETSC_SUCCESS); 2209 } 2210 2211 /* 2212 Given a matrix generated with MatGetFactor() duplicates all the information in A into B 2213 */ 2214 static PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace) 2215 { 2216 Mat_SeqSELL *c = (Mat_SeqSELL *)C->data, *a = (Mat_SeqSELL *)A->data; 2217 PetscInt i, m = A->rmap->n; 2218 PetscInt totalslices = a->totalslices; 2219 2220 PetscFunctionBegin; 2221 C->factortype = A->factortype; 2222 c->row = NULL; 2223 c->col = NULL; 2224 c->icol = NULL; 2225 c->reallocs = 0; 2226 C->assembled = PETSC_TRUE; 2227 2228 PetscCall(PetscLayoutReference(A->rmap, &C->rmap)); 2229 PetscCall(PetscLayoutReference(A->cmap, &C->cmap)); 2230 2231 c->sliceheight = a->sliceheight; 2232 PetscCall(PetscMalloc1(c->sliceheight * totalslices, &c->rlen)); 2233 PetscCall(PetscMalloc1(totalslices + 1, &c->sliidx)); 2234 2235 for (i = 0; i < m; i++) c->rlen[i] = a->rlen[i]; 2236 for (i = 0; i < totalslices + 1; i++) c->sliidx[i] = a->sliidx[i]; 2237 2238 /* allocate the matrix space */ 2239 if (mallocmatspace) { 2240 PetscCall(PetscMalloc2(a->maxallocmat, &c->val, a->maxallocmat, &c->colidx)); 2241 2242 c->singlemalloc = PETSC_TRUE; 2243 2244 if (m > 0) { 2245 PetscCall(PetscArraycpy(c->colidx, a->colidx, a->maxallocmat)); 2246 if (cpvalues == MAT_COPY_VALUES) { 2247 PetscCall(PetscArraycpy(c->val, a->val, a->maxallocmat)); 2248 } else { 2249 PetscCall(PetscArrayzero(c->val, a->maxallocmat)); 2250 } 2251 } 2252 } 2253 2254 c->ignorezeroentries = a->ignorezeroentries; 2255 c->roworiented = a->roworiented; 2256 c->nonew = a->nonew; 2257 if (a->diag) { 2258 PetscCall(PetscMalloc1(m, &c->diag)); 2259 for (i = 0; i < m; i++) c->diag[i] = a->diag[i]; 2260 } else c->diag = NULL; 2261 2262 c->solve_work = NULL; 2263 c->saved_values = NULL; 2264 c->idiag = NULL; 2265 c->ssor_work = NULL; 2266 c->keepnonzeropattern = a->keepnonzeropattern; 2267 c->free_val = PETSC_TRUE; 2268 c->free_colidx = PETSC_TRUE; 2269 2270 c->maxallocmat = a->maxallocmat; 2271 c->maxallocrow = a->maxallocrow; 2272 c->rlenmax = a->rlenmax; 2273 c->nz = a->nz; 2274 C->preallocated = PETSC_TRUE; 2275 2276 c->nonzerorowcnt = a->nonzerorowcnt; 2277 C->nonzerostate = A->nonzerostate; 2278 2279 PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist)); 2280 PetscFunctionReturn(PETSC_SUCCESS); 2281 } 2282 2283 PetscErrorCode MatDuplicate_SeqSELL(Mat A, MatDuplicateOption cpvalues, Mat *B) 2284 { 2285 PetscFunctionBegin; 2286 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 2287 PetscCall(MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n)); 2288 if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 2289 PetscCall(MatSetType(*B, ((PetscObject)A)->type_name)); 2290 PetscCall(MatDuplicateNoCreate_SeqSELL(*B, A, cpvalues, PETSC_TRUE)); 2291 PetscFunctionReturn(PETSC_SUCCESS); 2292 } 2293 2294 /*MC 2295 MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices, 2296 based on the sliced Ellpack format, {cite}`zhangellpack2018` 2297 2298 Options Database Key: 2299 . -mat_type seqsell - sets the matrix type to "`MATSEQELL` during a call to `MatSetFromOptions()` 2300 2301 Level: beginner 2302 2303 .seealso: `Mat`, `MatCreateSeqSELL()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ` 2304 M*/ 2305 2306 /*MC 2307 MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices, {cite}`zhangellpack2018` 2308 2309 This matrix type is identical to `MATSEQSELL` when constructed with a single process communicator, 2310 and `MATMPISELL` otherwise. As a result, for single process communicators, 2311 `MatSeqSELLSetPreallocation()` is supported, and similarly `MatMPISELLSetPreallocation()` is supported 2312 for communicators controlling multiple processes. It is recommended that you call both of 2313 the above preallocation routines for simplicity. 2314 2315 Options Database Key: 2316 . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions() 2317 2318 Level: beginner 2319 2320 Notes: 2321 This format is only supported for real scalars, double precision, and 32-bit indices (the defaults). 2322 2323 It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of 2324 non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement. 2325 2326 Developer Notes: 2327 On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance. 2328 2329 The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8 2330 .vb 2331 (2 0 3 4) 2332 Consider the matrix A = (5 0 6 0) 2333 (0 0 7 8) 2334 (0 0 9 9) 2335 2336 symbolically the Ellpack format can be written as 2337 2338 (2 3 4 |) (0 2 3 |) 2339 v = (5 6 0 |) colidx = (0 2 2 |) 2340 -------- --------- 2341 (7 8 |) (2 3 |) 2342 (9 9 |) (2 3 |) 2343 2344 The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case). 2345 Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and 2346 zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice. 2347 2348 The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9) and for colidx is (0 0 2 2 3 2 2 2 3 3) 2349 2350 .ve 2351 2352 See `MatMult_SeqSELL()` for how this format is used with the SIMD operations to achieve high performance. 2353 2354 .seealso: `Mat`, `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSELL()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ` 2355 M*/ 2356 2357 /*@ 2358 MatCreateSeqSELL - Creates a sparse matrix in `MATSEQSELL` format. 2359 2360 Collective 2361 2362 Input Parameters: 2363 + comm - MPI communicator, set to `PETSC_COMM_SELF` 2364 . m - number of rows 2365 . n - number of columns 2366 . rlenmax - maximum number of nonzeros in a row, ignored if `rlen` is provided 2367 - rlen - array containing the number of nonzeros in the various rows (possibly different for each row) or NULL 2368 2369 Output Parameter: 2370 . A - the matrix 2371 2372 Level: intermediate 2373 2374 Notes: 2375 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 2376 MatXXXXSetPreallocation() paradigm instead of this routine directly. 2377 [MatXXXXSetPreallocation() is, for example, `MatSeqSELLSetPreallocation()`] 2378 2379 Specify the preallocated storage with either `rlenmax` or `rlen` (not both). 2380 Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory 2381 allocation. 2382 2383 .seealso: `Mat`, `MATSEQSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATMPISELL` 2384 @*/ 2385 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt rlenmax, const PetscInt rlen[], Mat *A) 2386 { 2387 PetscFunctionBegin; 2388 PetscCall(MatCreate(comm, A)); 2389 PetscCall(MatSetSizes(*A, m, n, m, n)); 2390 PetscCall(MatSetType(*A, MATSEQSELL)); 2391 PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A, rlenmax, rlen)); 2392 PetscFunctionReturn(PETSC_SUCCESS); 2393 } 2394 2395 PetscErrorCode MatEqual_SeqSELL(Mat A, Mat B, PetscBool *flg) 2396 { 2397 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data, *b = (Mat_SeqSELL *)B->data; 2398 PetscInt totalslices = a->totalslices; 2399 2400 PetscFunctionBegin; 2401 /* If the matrix dimensions are not equal,or no of nonzeros */ 2402 if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) || (a->nz != b->nz) || (a->rlenmax != b->rlenmax)) { 2403 *flg = PETSC_FALSE; 2404 PetscFunctionReturn(PETSC_SUCCESS); 2405 } 2406 /* if the a->colidx are the same */ 2407 PetscCall(PetscArraycmp(a->colidx, b->colidx, a->sliidx[totalslices], flg)); 2408 if (!*flg) PetscFunctionReturn(PETSC_SUCCESS); 2409 /* if a->val are the same */ 2410 PetscCall(PetscArraycmp(a->val, b->val, a->sliidx[totalslices], flg)); 2411 PetscFunctionReturn(PETSC_SUCCESS); 2412 } 2413 2414 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A) 2415 { 2416 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 2417 2418 PetscFunctionBegin; 2419 a->idiagvalid = PETSC_FALSE; 2420 PetscFunctionReturn(PETSC_SUCCESS); 2421 } 2422 2423 PetscErrorCode MatConjugate_SeqSELL(Mat A) 2424 { 2425 #if defined(PETSC_USE_COMPLEX) 2426 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 2427 PetscInt i; 2428 PetscScalar *val = a->val; 2429 2430 PetscFunctionBegin; 2431 for (i = 0; i < a->sliidx[a->totalslices]; i++) val[i] = PetscConj(val[i]); 2432 #if defined(PETSC_HAVE_CUPM) 2433 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU; 2434 #endif 2435 #else 2436 PetscFunctionBegin; 2437 #endif 2438 PetscFunctionReturn(PETSC_SUCCESS); 2439 } 2440