1 /* 2 Defines the basic matrix operations for the SELL matrix storage format. 3 */ 4 #include <../src/mat/impls/sell/seq/sell.h> /*I "petscmat.h" I*/ 5 #include <petscblaslapack.h> 6 #include <petsc/private/kernels/blocktranspose.h> 7 8 static PetscBool cited = PETSC_FALSE; 9 static const char citation[] = "@inproceedings{ZhangELLPACK2018,\n" 10 " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n" 11 " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n" 12 " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n" 13 " year = 2018\n" 14 "}\n"; 15 16 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 17 18 #include <immintrin.h> 19 20 #if !defined(_MM_SCALE_8) 21 #define _MM_SCALE_8 8 22 #endif 23 24 #if defined(__AVX512F__) 25 /* these do not work 26 vec_idx = _mm512_loadunpackhi_epi32(vec_idx,acolidx); 27 vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval); 28 */ 29 #define AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \ 30 /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \ 31 vec_idx = _mm256_loadu_si256((__m256i const *)acolidx); \ 32 vec_vals = _mm512_loadu_pd(aval); \ 33 vec_x = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); \ 34 vec_y = _mm512_fmadd_pd(vec_x, vec_vals, vec_y) 35 #elif defined(__AVX2__) && defined(__FMA__) 36 #define AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \ 37 vec_vals = _mm256_loadu_pd(aval); \ 38 vec_idx = _mm_loadu_si128((__m128i const *)acolidx); /* SSE2 */ \ 39 vec_x = _mm256_i32gather_pd(x, vec_idx, _MM_SCALE_8); \ 40 vec_y = _mm256_fmadd_pd(vec_x, vec_vals, vec_y) 41 #endif 42 #endif /* PETSC_HAVE_IMMINTRIN_H */ 43 44 /*@ 45 MatSeqSELLSetPreallocation - For good matrix assembly performance 46 the user should preallocate the matrix storage by setting the parameter `nz` 47 (or the array `nnz`). 48 49 Collective 50 51 Input Parameters: 52 + B - The `MATSEQSELL` matrix 53 . rlenmax - number of nonzeros per row (same for all rows), ignored if `rlen` is provided 54 - rlen - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 55 56 Level: intermediate 57 58 Notes: 59 Specify the preallocated storage with either `rlenmax` or `rlen` (not both). 60 Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory 61 allocation. 62 63 You can call `MatGetInfo()` to get information on how effective the preallocation was; 64 for example the fields mallocs,nz_allocated,nz_used,nz_unneeded; 65 You can also run with the option `-info` and look for messages with the string 66 malloc in them to see if additional memory allocation was needed. 67 68 Developer Notes: 69 Use `rlenmax` of `MAT_SKIP_ALLOCATION` to not allocate any space for the matrix 70 entries or columns indices. 71 72 The maximum number of nonzeos in any row should be as accurate as possible. 73 If it is underestimated, you will get bad performance due to reallocation 74 (`MatSeqXSELLReallocateSELL()`). 75 76 .seealso: `Mat`, `MATSEQSELL`, `MATSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()` 77 @*/ 78 PetscErrorCode MatSeqSELLSetPreallocation(Mat B, PetscInt rlenmax, const PetscInt rlen[]) 79 { 80 PetscFunctionBegin; 81 PetscValidHeaderSpecific(B, MAT_CLASSID, 1); 82 PetscValidType(B, 1); 83 PetscTryMethod(B, "MatSeqSELLSetPreallocation_C", (Mat, PetscInt, const PetscInt[]), (B, rlenmax, rlen)); 84 PetscFunctionReturn(PETSC_SUCCESS); 85 } 86 87 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B, PetscInt maxallocrow, const PetscInt rlen[]) 88 { 89 Mat_SeqSELL *b; 90 PetscInt i, j, totalslices; 91 #if defined(PETSC_HAVE_CUPM) 92 PetscInt rlenmax = 0; 93 #endif 94 PetscBool skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE; 95 96 PetscFunctionBegin; 97 if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE; 98 if (maxallocrow == MAT_SKIP_ALLOCATION) { 99 skipallocation = PETSC_TRUE; 100 maxallocrow = 0; 101 } 102 103 PetscCall(PetscLayoutSetUp(B->rmap)); 104 PetscCall(PetscLayoutSetUp(B->cmap)); 105 106 /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */ 107 if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5; 108 PetscCheck(maxallocrow >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "maxallocrow cannot be less than 0: value %" PetscInt_FMT, maxallocrow); 109 if (rlen) { 110 for (i = 0; i < B->rmap->n; i++) { 111 PetscCheck(rlen[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, rlen[i]); 112 PetscCheck(rlen[i] <= B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT, i, rlen[i], B->cmap->n); 113 } 114 } 115 116 B->preallocated = PETSC_TRUE; 117 118 b = (Mat_SeqSELL *)B->data; 119 120 if (!b->sliceheight) { /* not set yet */ 121 #if defined(PETSC_HAVE_CUPM) 122 b->sliceheight = 16; 123 #else 124 b->sliceheight = 8; 125 #endif 126 } 127 totalslices = PetscCeilInt(B->rmap->n, b->sliceheight); 128 b->totalslices = totalslices; 129 if (!skipallocation) { 130 if (B->rmap->n % b->sliceheight) PetscCall(PetscInfo(B, "Padding rows to the SEQSELL matrix because the number of rows is not the multiple of the slice height (value %" PetscInt_FMT ")\n", B->rmap->n)); 131 132 if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */ 133 PetscCall(PetscMalloc1(totalslices + 1, &b->sliidx)); 134 } 135 if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */ 136 if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10; 137 else if (maxallocrow < 0) maxallocrow = 1; 138 #if defined(PETSC_HAVE_CUPM) 139 rlenmax = maxallocrow; 140 /* Pad the slice to DEVICE_MEM_ALIGN */ 141 while (b->sliceheight * maxallocrow % DEVICE_MEM_ALIGN) maxallocrow++; 142 #endif 143 for (i = 0; i <= totalslices; i++) b->sliidx[i] = b->sliceheight * i * maxallocrow; 144 } else { 145 #if defined(PETSC_HAVE_CUPM) 146 PetscInt mul = DEVICE_MEM_ALIGN / b->sliceheight; 147 #endif 148 maxallocrow = 0; 149 b->sliidx[0] = 0; 150 for (i = 1; i < totalslices; i++) { 151 b->sliidx[i] = 0; 152 for (j = 0; j < b->sliceheight; j++) { b->sliidx[i] = PetscMax(b->sliidx[i], rlen[b->sliceheight * (i - 1) + j]); } 153 #if defined(PETSC_HAVE_CUPM) 154 if (mul != 0) { /* Pad the slice to DEVICE_MEM_ALIGN if sliceheight < DEVICE_MEM_ALIGN */ 155 rlenmax = PetscMax(b->sliidx[i], rlenmax); 156 b->sliidx[i] = ((b->sliidx[i] - 1) / mul + 1) * mul; 157 } 158 #endif 159 maxallocrow = PetscMax(b->sliidx[i], maxallocrow); 160 PetscCall(PetscIntSumError(b->sliidx[i - 1], b->sliceheight * b->sliidx[i], &b->sliidx[i])); 161 } 162 /* last slice */ 163 b->sliidx[totalslices] = 0; 164 for (j = b->sliceheight * (totalslices - 1); j < B->rmap->n; j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices], rlen[j]); 165 #if defined(PETSC_HAVE_CUPM) 166 if (mul != 0) { 167 rlenmax = PetscMax(b->sliidx[i], rlenmax); 168 b->sliidx[totalslices] = ((b->sliidx[totalslices] - 1) / mul + 1) * mul; 169 } 170 #endif 171 maxallocrow = PetscMax(b->sliidx[totalslices], maxallocrow); 172 b->sliidx[totalslices] = b->sliidx[totalslices - 1] + b->sliceheight * b->sliidx[totalslices]; 173 } 174 175 /* allocate space for val, colidx, rlen */ 176 /* FIXME: should B's old memory be unlogged? */ 177 PetscCall(MatSeqXSELLFreeSELL(B, &b->val, &b->colidx)); 178 /* FIXME: assuming an element of the bit array takes 8 bits */ 179 PetscCall(PetscMalloc2(b->sliidx[totalslices], &b->val, b->sliidx[totalslices], &b->colidx)); 180 /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */ 181 PetscCall(PetscCalloc1(b->sliceheight * totalslices, &b->rlen)); 182 183 b->singlemalloc = PETSC_TRUE; 184 b->free_val = PETSC_TRUE; 185 b->free_colidx = PETSC_TRUE; 186 } else { 187 b->free_val = PETSC_FALSE; 188 b->free_colidx = PETSC_FALSE; 189 } 190 191 b->nz = 0; 192 b->maxallocrow = maxallocrow; 193 #if defined(PETSC_HAVE_CUPM) 194 b->rlenmax = rlenmax; 195 #else 196 b->rlenmax = maxallocrow; 197 #endif 198 b->maxallocmat = b->sliidx[totalslices]; 199 B->info.nz_unneeded = (double)b->maxallocmat; 200 if (realalloc) PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE)); 201 PetscFunctionReturn(PETSC_SUCCESS); 202 } 203 204 static PetscErrorCode MatGetRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v) 205 { 206 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 207 PetscInt shift; 208 209 PetscFunctionBegin; 210 PetscCheck(row >= 0 && row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range", row); 211 if (nz) *nz = a->rlen[row]; 212 shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight); 213 if (!a->getrowcols) { PetscCall(PetscMalloc2(a->rlenmax, &a->getrowcols, a->rlenmax, &a->getrowvals)); } 214 if (idx) { 215 PetscInt j; 216 for (j = 0; j < a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift + a->sliceheight * j]; 217 *idx = a->getrowcols; 218 } 219 if (v) { 220 PetscInt j; 221 for (j = 0; j < a->rlen[row]; j++) a->getrowvals[j] = a->val[shift + a->sliceheight * j]; 222 *v = a->getrowvals; 223 } 224 PetscFunctionReturn(PETSC_SUCCESS); 225 } 226 227 static PetscErrorCode MatRestoreRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v) 228 { 229 PetscFunctionBegin; 230 PetscFunctionReturn(PETSC_SUCCESS); 231 } 232 233 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat) 234 { 235 Mat B; 236 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 237 PetscInt i; 238 239 PetscFunctionBegin; 240 if (reuse == MAT_REUSE_MATRIX) { 241 B = *newmat; 242 PetscCall(MatZeroEntries(B)); 243 } else { 244 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B)); 245 PetscCall(MatSetSizes(B, A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N)); 246 PetscCall(MatSetType(B, MATSEQAIJ)); 247 PetscCall(MatSeqAIJSetPreallocation(B, 0, a->rlen)); 248 } 249 250 for (i = 0; i < A->rmap->n; i++) { 251 PetscInt nz = 0, *cols = NULL; 252 PetscScalar *vals = NULL; 253 254 PetscCall(MatGetRow_SeqSELL(A, i, &nz, &cols, &vals)); 255 PetscCall(MatSetValues(B, 1, &i, nz, cols, vals, INSERT_VALUES)); 256 PetscCall(MatRestoreRow_SeqSELL(A, i, &nz, &cols, &vals)); 257 } 258 259 PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY)); 260 PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY)); 261 B->rmap->bs = A->rmap->bs; 262 263 if (reuse == MAT_INPLACE_MATRIX) { 264 PetscCall(MatHeaderReplace(A, &B)); 265 } else { 266 *newmat = B; 267 } 268 PetscFunctionReturn(PETSC_SUCCESS); 269 } 270 271 #include <../src/mat/impls/aij/seq/aij.h> 272 273 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A, MatType newtype, MatReuse reuse, Mat *newmat) 274 { 275 Mat B; 276 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 277 PetscInt *ai = a->i, m = A->rmap->N, n = A->cmap->N, i, *rowlengths, row, ncols; 278 const PetscInt *cols; 279 const PetscScalar *vals; 280 281 PetscFunctionBegin; 282 if (reuse == MAT_REUSE_MATRIX) { 283 B = *newmat; 284 } else { 285 if (PetscDefined(USE_DEBUG) || !a->ilen) { 286 PetscCall(PetscMalloc1(m, &rowlengths)); 287 for (i = 0; i < m; i++) rowlengths[i] = ai[i + 1] - ai[i]; 288 } 289 if (PetscDefined(USE_DEBUG) && a->ilen) { 290 PetscBool eq; 291 PetscCall(PetscMemcmp(rowlengths, a->ilen, m * sizeof(PetscInt), &eq)); 292 PetscCheck(eq, PETSC_COMM_SELF, PETSC_ERR_PLIB, "SeqAIJ ilen array incorrect"); 293 PetscCall(PetscFree(rowlengths)); 294 rowlengths = a->ilen; 295 } else if (a->ilen) rowlengths = a->ilen; 296 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B)); 297 PetscCall(MatSetSizes(B, m, n, m, n)); 298 PetscCall(MatSetType(B, MATSEQSELL)); 299 PetscCall(MatSeqSELLSetPreallocation(B, 0, rowlengths)); 300 if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths)); 301 } 302 303 for (row = 0; row < m; row++) { 304 PetscCall(MatGetRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals)); 305 PetscCall(MatSetValues_SeqSELL(B, 1, &row, ncols, cols, vals, INSERT_VALUES)); 306 PetscCall(MatRestoreRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals)); 307 } 308 PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY)); 309 PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY)); 310 B->rmap->bs = A->rmap->bs; 311 312 if (reuse == MAT_INPLACE_MATRIX) { 313 PetscCall(MatHeaderReplace(A, &B)); 314 } else { 315 *newmat = B; 316 } 317 PetscFunctionReturn(PETSC_SUCCESS); 318 } 319 320 PetscErrorCode MatMult_SeqSELL(Mat A, Vec xx, Vec yy) 321 { 322 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 323 PetscScalar *y; 324 const PetscScalar *x; 325 const MatScalar *aval = a->val; 326 PetscInt totalslices = a->totalslices; 327 const PetscInt *acolidx = a->colidx; 328 PetscInt i, j; 329 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 330 __m512d vec_x, vec_y, vec_vals; 331 __m256i vec_idx; 332 __mmask8 mask; 333 __m512d vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4; 334 __m256i vec_idx2, vec_idx3, vec_idx4; 335 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 336 __m128i vec_idx; 337 __m256d vec_x, vec_y, vec_y2, vec_vals; 338 MatScalar yval; 339 PetscInt r, rows_left, row, nnz_in_row; 340 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 341 __m128d vec_x_tmp; 342 __m256d vec_x, vec_y, vec_y2, vec_vals; 343 MatScalar yval; 344 PetscInt r, rows_left, row, nnz_in_row; 345 #else 346 PetscInt k, sliceheight = a->sliceheight; 347 PetscScalar *sum; 348 #endif 349 350 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 351 #pragma disjoint(*x, *y, *aval) 352 #endif 353 354 PetscFunctionBegin; 355 PetscCall(VecGetArrayRead(xx, &x)); 356 PetscCall(VecGetArray(yy, &y)); 357 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 358 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 359 for (i = 0; i < totalslices; i++) { /* loop over slices */ 360 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 361 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 362 363 vec_y = _mm512_setzero_pd(); 364 vec_y2 = _mm512_setzero_pd(); 365 vec_y3 = _mm512_setzero_pd(); 366 vec_y4 = _mm512_setzero_pd(); 367 368 j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */ 369 switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) { 370 case 3: 371 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 372 acolidx += 8; 373 aval += 8; 374 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 375 acolidx += 8; 376 aval += 8; 377 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 378 acolidx += 8; 379 aval += 8; 380 j += 3; 381 break; 382 case 2: 383 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 384 acolidx += 8; 385 aval += 8; 386 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 387 acolidx += 8; 388 aval += 8; 389 j += 2; 390 break; 391 case 1: 392 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 393 acolidx += 8; 394 aval += 8; 395 j += 1; 396 break; 397 } 398 #pragma novector 399 for (; j < (a->sliidx[i + 1] >> 3); j += 4) { 400 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 401 acolidx += 8; 402 aval += 8; 403 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 404 acolidx += 8; 405 aval += 8; 406 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 407 acolidx += 8; 408 aval += 8; 409 AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4); 410 acolidx += 8; 411 aval += 8; 412 } 413 414 vec_y = _mm512_add_pd(vec_y, vec_y2); 415 vec_y = _mm512_add_pd(vec_y, vec_y3); 416 vec_y = _mm512_add_pd(vec_y, vec_y4); 417 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 418 mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07))); 419 _mm512_mask_storeu_pd(&y[8 * i], mask, vec_y); 420 } else { 421 _mm512_storeu_pd(&y[8 * i], vec_y); 422 } 423 } 424 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 425 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 426 for (i = 0; i < totalslices; i++) { /* loop over full slices */ 427 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 428 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 429 430 /* last slice may have padding rows. Don't use vectorization. */ 431 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 432 rows_left = A->rmap->n - 8 * i; 433 for (r = 0; r < rows_left; ++r) { 434 yval = (MatScalar)0; 435 row = 8 * i + r; 436 nnz_in_row = a->rlen[row]; 437 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]]; 438 y[row] = yval; 439 } 440 break; 441 } 442 443 vec_y = _mm256_setzero_pd(); 444 vec_y2 = _mm256_setzero_pd(); 445 446 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 447 #pragma novector 448 #pragma unroll(2) 449 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 450 AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 451 aval += 4; 452 acolidx += 4; 453 AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y2); 454 aval += 4; 455 acolidx += 4; 456 } 457 458 _mm256_storeu_pd(y + i * 8, vec_y); 459 _mm256_storeu_pd(y + i * 8 + 4, vec_y2); 460 } 461 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 462 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 463 for (i = 0; i < totalslices; i++) { /* loop over full slices */ 464 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 465 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 466 467 vec_y = _mm256_setzero_pd(); 468 vec_y2 = _mm256_setzero_pd(); 469 470 /* last slice may have padding rows. Don't use vectorization. */ 471 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 472 rows_left = A->rmap->n - 8 * i; 473 for (r = 0; r < rows_left; ++r) { 474 yval = (MatScalar)0; 475 row = 8 * i + r; 476 nnz_in_row = a->rlen[row]; 477 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]]; 478 y[row] = yval; 479 } 480 break; 481 } 482 483 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 484 #pragma novector 485 #pragma unroll(2) 486 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 487 vec_vals = _mm256_loadu_pd(aval); 488 vec_x_tmp = _mm_setzero_pd(); 489 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 490 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 491 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 492 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 493 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 494 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 495 vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y); 496 aval += 4; 497 498 vec_vals = _mm256_loadu_pd(aval); 499 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 500 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 501 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 502 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 503 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 504 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 505 vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2); 506 aval += 4; 507 } 508 509 _mm256_storeu_pd(y + i * 8, vec_y); 510 _mm256_storeu_pd(y + i * 8 + 4, vec_y2); 511 } 512 #else 513 PetscCall(PetscMalloc1(sliceheight, &sum)); 514 for (i = 0; i < totalslices; i++) { /* loop over slices */ 515 for (j = 0; j < sliceheight; j++) { 516 sum[j] = 0.0; 517 for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]]; 518 } 519 if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { /* if last slice has padding rows */ 520 for (j = 0; j < (A->rmap->n % sliceheight); j++) y[sliceheight * i + j] = sum[j]; 521 } else { 522 for (j = 0; j < sliceheight; j++) y[sliceheight * i + j] = sum[j]; 523 } 524 } 525 PetscCall(PetscFree(sum)); 526 #endif 527 528 PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); /* theoretical minimal FLOPs */ 529 PetscCall(VecRestoreArrayRead(xx, &x)); 530 PetscCall(VecRestoreArray(yy, &y)); 531 PetscFunctionReturn(PETSC_SUCCESS); 532 } 533 534 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h> 535 PetscErrorCode MatMultAdd_SeqSELL(Mat A, Vec xx, Vec yy, Vec zz) 536 { 537 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 538 PetscScalar *y, *z; 539 const PetscScalar *x; 540 const MatScalar *aval = a->val; 541 PetscInt totalslices = a->totalslices; 542 const PetscInt *acolidx = a->colidx; 543 PetscInt i, j; 544 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 545 __m512d vec_x, vec_y, vec_vals; 546 __m256i vec_idx; 547 __mmask8 mask = 0; 548 __m512d vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4; 549 __m256i vec_idx2, vec_idx3, vec_idx4; 550 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 551 __m128d vec_x_tmp; 552 __m256d vec_x, vec_y, vec_y2, vec_vals; 553 MatScalar yval; 554 PetscInt r, row, nnz_in_row; 555 #else 556 PetscInt k, sliceheight = a->sliceheight; 557 PetscScalar *sum; 558 #endif 559 560 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 561 #pragma disjoint(*x, *y, *aval) 562 #endif 563 564 PetscFunctionBegin; 565 if (!a->nz) { 566 PetscCall(VecCopy(yy, zz)); 567 PetscFunctionReturn(PETSC_SUCCESS); 568 } 569 PetscCall(VecGetArrayRead(xx, &x)); 570 PetscCall(VecGetArrayPair(yy, zz, &y, &z)); 571 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 572 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 573 for (i = 0; i < totalslices; i++) { /* loop over slices */ 574 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 575 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 576 577 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 578 mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07))); 579 vec_y = _mm512_mask_loadu_pd(vec_y, mask, &y[8 * i]); 580 } else { 581 vec_y = _mm512_loadu_pd(&y[8 * i]); 582 } 583 vec_y2 = _mm512_setzero_pd(); 584 vec_y3 = _mm512_setzero_pd(); 585 vec_y4 = _mm512_setzero_pd(); 586 587 j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */ 588 switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) { 589 case 3: 590 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 591 acolidx += 8; 592 aval += 8; 593 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 594 acolidx += 8; 595 aval += 8; 596 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 597 acolidx += 8; 598 aval += 8; 599 j += 3; 600 break; 601 case 2: 602 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 603 acolidx += 8; 604 aval += 8; 605 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 606 acolidx += 8; 607 aval += 8; 608 j += 2; 609 break; 610 case 1: 611 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 612 acolidx += 8; 613 aval += 8; 614 j += 1; 615 break; 616 } 617 #pragma novector 618 for (; j < (a->sliidx[i + 1] >> 3); j += 4) { 619 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 620 acolidx += 8; 621 aval += 8; 622 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 623 acolidx += 8; 624 aval += 8; 625 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 626 acolidx += 8; 627 aval += 8; 628 AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4); 629 acolidx += 8; 630 aval += 8; 631 } 632 633 vec_y = _mm512_add_pd(vec_y, vec_y2); 634 vec_y = _mm512_add_pd(vec_y, vec_y3); 635 vec_y = _mm512_add_pd(vec_y, vec_y4); 636 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 637 _mm512_mask_storeu_pd(&z[8 * i], mask, vec_y); 638 } else { 639 _mm512_storeu_pd(&z[8 * i], vec_y); 640 } 641 } 642 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 643 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 644 for (i = 0; i < totalslices; i++) { /* loop over full slices */ 645 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 646 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 647 648 /* last slice may have padding rows. Don't use vectorization. */ 649 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 650 for (r = 0; r < (A->rmap->n & 0x07); ++r) { 651 row = 8 * i + r; 652 yval = (MatScalar)0.0; 653 nnz_in_row = a->rlen[row]; 654 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]]; 655 z[row] = y[row] + yval; 656 } 657 break; 658 } 659 660 vec_y = _mm256_loadu_pd(y + 8 * i); 661 vec_y2 = _mm256_loadu_pd(y + 8 * i + 4); 662 663 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 664 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 665 vec_vals = _mm256_loadu_pd(aval); 666 vec_x_tmp = _mm_setzero_pd(); 667 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 668 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 669 vec_x = _mm256_setzero_pd(); 670 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 671 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 672 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 673 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 674 vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y); 675 aval += 4; 676 677 vec_vals = _mm256_loadu_pd(aval); 678 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 679 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 680 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 681 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 682 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 683 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 684 vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2); 685 aval += 4; 686 } 687 688 _mm256_storeu_pd(z + i * 8, vec_y); 689 _mm256_storeu_pd(z + i * 8 + 4, vec_y2); 690 } 691 #else 692 PetscCall(PetscMalloc1(sliceheight, &sum)); 693 for (i = 0; i < totalslices; i++) { /* loop over slices */ 694 for (j = 0; j < sliceheight; j++) { 695 sum[j] = 0.0; 696 for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]]; 697 } 698 if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { 699 for (j = 0; j < (A->rmap->n % sliceheight); j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j]; 700 } else { 701 for (j = 0; j < sliceheight; j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j]; 702 } 703 } 704 PetscCall(PetscFree(sum)); 705 #endif 706 707 PetscCall(PetscLogFlops(2.0 * a->nz)); 708 PetscCall(VecRestoreArrayRead(xx, &x)); 709 PetscCall(VecRestoreArrayPair(yy, zz, &y, &z)); 710 PetscFunctionReturn(PETSC_SUCCESS); 711 } 712 713 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A, Vec xx, Vec zz, Vec yy) 714 { 715 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 716 PetscScalar *y; 717 const PetscScalar *x; 718 const MatScalar *aval = a->val; 719 const PetscInt *acolidx = a->colidx; 720 PetscInt i, j, r, row, nnz_in_row, totalslices = a->totalslices, sliceheight = a->sliceheight; 721 722 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 723 #pragma disjoint(*x, *y, *aval) 724 #endif 725 726 PetscFunctionBegin; 727 if (A->symmetric == PETSC_BOOL3_TRUE) { 728 PetscCall(MatMultAdd_SeqSELL(A, xx, zz, yy)); 729 PetscFunctionReturn(PETSC_SUCCESS); 730 } 731 if (zz != yy) PetscCall(VecCopy(zz, yy)); 732 733 if (a->nz) { 734 PetscCall(VecGetArrayRead(xx, &x)); 735 PetscCall(VecGetArray(yy, &y)); 736 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 737 if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { 738 for (r = 0; r < (A->rmap->n % sliceheight); ++r) { 739 row = sliceheight * i + r; 740 nnz_in_row = a->rlen[row]; 741 for (j = 0; j < nnz_in_row; ++j) y[acolidx[sliceheight * j + r]] += aval[sliceheight * j + r] * x[row]; 742 } 743 break; 744 } 745 for (r = 0; r < sliceheight; ++r) 746 for (j = a->sliidx[i] + r; j < a->sliidx[i + 1]; j += sliceheight) y[acolidx[j]] += aval[j] * x[sliceheight * i + r]; 747 } 748 PetscCall(PetscLogFlops(2.0 * a->nz)); 749 PetscCall(VecRestoreArrayRead(xx, &x)); 750 PetscCall(VecRestoreArray(yy, &y)); 751 } 752 PetscFunctionReturn(PETSC_SUCCESS); 753 } 754 755 PetscErrorCode MatMultTranspose_SeqSELL(Mat A, Vec xx, Vec yy) 756 { 757 PetscFunctionBegin; 758 if (A->symmetric == PETSC_BOOL3_TRUE) { 759 PetscCall(MatMult_SeqSELL(A, xx, yy)); 760 } else { 761 PetscCall(VecSet(yy, 0.0)); 762 PetscCall(MatMultTransposeAdd_SeqSELL(A, xx, yy, yy)); 763 } 764 PetscFunctionReturn(PETSC_SUCCESS); 765 } 766 767 /* 768 Checks for missing diagonals 769 */ 770 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A, PetscBool *missing, PetscInt *d) 771 { 772 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 773 PetscInt *diag, i; 774 775 PetscFunctionBegin; 776 *missing = PETSC_FALSE; 777 if (A->rmap->n > 0 && !a->colidx) { 778 *missing = PETSC_TRUE; 779 if (d) *d = 0; 780 PetscCall(PetscInfo(A, "Matrix has no entries therefore is missing diagonal\n")); 781 } else { 782 diag = a->diag; 783 for (i = 0; i < A->rmap->n; i++) { 784 if (diag[i] == -1) { 785 *missing = PETSC_TRUE; 786 if (d) *d = i; 787 PetscCall(PetscInfo(A, "Matrix is missing diagonal number %" PetscInt_FMT "\n", i)); 788 break; 789 } 790 } 791 } 792 PetscFunctionReturn(PETSC_SUCCESS); 793 } 794 795 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A) 796 { 797 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 798 PetscInt i, j, m = A->rmap->n, shift; 799 800 PetscFunctionBegin; 801 if (!a->diag) { 802 PetscCall(PetscMalloc1(m, &a->diag)); 803 a->free_diag = PETSC_TRUE; 804 } 805 for (i = 0; i < m; i++) { /* loop over rows */ 806 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 807 a->diag[i] = -1; 808 for (j = 0; j < a->rlen[i]; j++) { 809 if (a->colidx[shift + a->sliceheight * j] == i) { 810 a->diag[i] = shift + a->sliceheight * j; 811 break; 812 } 813 } 814 } 815 PetscFunctionReturn(PETSC_SUCCESS); 816 } 817 818 /* 819 Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways 820 */ 821 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A, PetscScalar omega, PetscScalar fshift) 822 { 823 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 824 PetscInt i, *diag, m = A->rmap->n; 825 MatScalar *val = a->val; 826 PetscScalar *idiag, *mdiag; 827 828 PetscFunctionBegin; 829 if (a->idiagvalid) PetscFunctionReturn(PETSC_SUCCESS); 830 PetscCall(MatMarkDiagonal_SeqSELL(A)); 831 diag = a->diag; 832 if (!a->idiag) { 833 PetscCall(PetscMalloc3(m, &a->idiag, m, &a->mdiag, m, &a->ssor_work)); 834 val = a->val; 835 } 836 mdiag = a->mdiag; 837 idiag = a->idiag; 838 839 if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) { 840 for (i = 0; i < m; i++) { 841 mdiag[i] = val[diag[i]]; 842 if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */ 843 PetscCheck(PetscRealPart(fshift), PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Zero diagonal on row %" PetscInt_FMT, i); 844 PetscCall(PetscInfo(A, "Zero diagonal on row %" PetscInt_FMT "\n", i)); 845 A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 846 A->factorerror_zeropivot_value = 0.0; 847 A->factorerror_zeropivot_row = i; 848 } 849 idiag[i] = 1.0 / val[diag[i]]; 850 } 851 PetscCall(PetscLogFlops(m)); 852 } else { 853 for (i = 0; i < m; i++) { 854 mdiag[i] = val[diag[i]]; 855 idiag[i] = omega / (fshift + val[diag[i]]); 856 } 857 PetscCall(PetscLogFlops(2.0 * m)); 858 } 859 a->idiagvalid = PETSC_TRUE; 860 PetscFunctionReturn(PETSC_SUCCESS); 861 } 862 863 PetscErrorCode MatZeroEntries_SeqSELL(Mat A) 864 { 865 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 866 867 PetscFunctionBegin; 868 PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices])); 869 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 870 PetscFunctionReturn(PETSC_SUCCESS); 871 } 872 873 PetscErrorCode MatDestroy_SeqSELL(Mat A) 874 { 875 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 876 877 PetscFunctionBegin; 878 PetscCall(PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->n, A->cmap->n, a->nz)); 879 PetscCall(MatSeqXSELLFreeSELL(A, &a->val, &a->colidx)); 880 PetscCall(ISDestroy(&a->row)); 881 PetscCall(ISDestroy(&a->col)); 882 PetscCall(PetscFree(a->diag)); 883 PetscCall(PetscFree(a->rlen)); 884 PetscCall(PetscFree(a->sliidx)); 885 PetscCall(PetscFree3(a->idiag, a->mdiag, a->ssor_work)); 886 PetscCall(PetscFree(a->solve_work)); 887 PetscCall(ISDestroy(&a->icol)); 888 PetscCall(PetscFree(a->saved_values)); 889 PetscCall(PetscFree2(a->getrowcols, a->getrowvals)); 890 PetscCall(PetscFree(A->data)); 891 #if defined(PETSC_HAVE_CUPM) 892 PetscCall(PetscFree(a->chunk_slice_map)); 893 #endif 894 895 PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL)); 896 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL)); 897 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL)); 898 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetPreallocation_C", NULL)); 899 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetArray_C", NULL)); 900 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLRestoreArray_C", NULL)); 901 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqaij_C", NULL)); 902 #if defined(PETSC_HAVE_CUDA) 903 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqsellcuda_C", NULL)); 904 #endif 905 #if defined(PETSC_HAVE_HIP) 906 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqsellhip_C", NULL)); 907 #endif 908 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetFillRatio_C", NULL)); 909 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetMaxSliceWidth_C", NULL)); 910 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetAvgSliceWidth_C", NULL)); 911 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetVarSliceSize_C", NULL)); 912 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetSliceHeight_C", NULL)); 913 PetscFunctionReturn(PETSC_SUCCESS); 914 } 915 916 PetscErrorCode MatSetOption_SeqSELL(Mat A, MatOption op, PetscBool flg) 917 { 918 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 919 920 PetscFunctionBegin; 921 switch (op) { 922 case MAT_ROW_ORIENTED: 923 a->roworiented = flg; 924 break; 925 case MAT_KEEP_NONZERO_PATTERN: 926 a->keepnonzeropattern = flg; 927 break; 928 case MAT_NEW_NONZERO_LOCATIONS: 929 a->nonew = (flg ? 0 : 1); 930 break; 931 case MAT_NEW_NONZERO_LOCATION_ERR: 932 a->nonew = (flg ? -1 : 0); 933 break; 934 case MAT_NEW_NONZERO_ALLOCATION_ERR: 935 a->nonew = (flg ? -2 : 0); 936 break; 937 case MAT_UNUSED_NONZERO_LOCATION_ERR: 938 a->nounused = (flg ? -1 : 0); 939 break; 940 case MAT_FORCE_DIAGONAL_ENTRIES: 941 case MAT_IGNORE_OFF_PROC_ENTRIES: 942 case MAT_USE_HASH_TABLE: 943 case MAT_SORTED_FULL: 944 PetscCall(PetscInfo(A, "Option %s ignored\n", MatOptions[op])); 945 break; 946 case MAT_SPD: 947 case MAT_SYMMETRIC: 948 case MAT_STRUCTURALLY_SYMMETRIC: 949 case MAT_HERMITIAN: 950 case MAT_SYMMETRY_ETERNAL: 951 case MAT_STRUCTURAL_SYMMETRY_ETERNAL: 952 case MAT_SPD_ETERNAL: 953 /* These options are handled directly by MatSetOption() */ 954 break; 955 default: 956 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unknown option %d", op); 957 } 958 PetscFunctionReturn(PETSC_SUCCESS); 959 } 960 961 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A, Vec v) 962 { 963 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 964 PetscInt i, j, n, shift; 965 PetscScalar *x, zero = 0.0; 966 967 PetscFunctionBegin; 968 PetscCall(VecGetLocalSize(v, &n)); 969 PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 970 971 if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) { 972 PetscInt *diag = a->diag; 973 PetscCall(VecGetArray(v, &x)); 974 for (i = 0; i < n; i++) x[i] = 1.0 / a->val[diag[i]]; 975 PetscCall(VecRestoreArray(v, &x)); 976 PetscFunctionReturn(PETSC_SUCCESS); 977 } 978 979 PetscCall(VecSet(v, zero)); 980 PetscCall(VecGetArray(v, &x)); 981 for (i = 0; i < n; i++) { /* loop over rows */ 982 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 983 x[i] = 0; 984 for (j = 0; j < a->rlen[i]; j++) { 985 if (a->colidx[shift + a->sliceheight * j] == i) { 986 x[i] = a->val[shift + a->sliceheight * j]; 987 break; 988 } 989 } 990 } 991 PetscCall(VecRestoreArray(v, &x)); 992 PetscFunctionReturn(PETSC_SUCCESS); 993 } 994 995 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A, Vec ll, Vec rr) 996 { 997 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 998 const PetscScalar *l, *r; 999 PetscInt i, j, m, n, row; 1000 1001 PetscFunctionBegin; 1002 if (ll) { 1003 /* The local size is used so that VecMPI can be passed to this routine 1004 by MatDiagonalScale_MPISELL */ 1005 PetscCall(VecGetLocalSize(ll, &m)); 1006 PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 1007 PetscCall(VecGetArrayRead(ll, &l)); 1008 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 1009 if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */ 1010 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) { 1011 if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= l[a->sliceheight * i + row]; 1012 } 1013 } else { 1014 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) { a->val[j] *= l[a->sliceheight * i + row]; } 1015 } 1016 } 1017 PetscCall(VecRestoreArrayRead(ll, &l)); 1018 PetscCall(PetscLogFlops(a->nz)); 1019 } 1020 if (rr) { 1021 PetscCall(VecGetLocalSize(rr, &n)); 1022 PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 1023 PetscCall(VecGetArrayRead(rr, &r)); 1024 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 1025 if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */ 1026 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) % a->sliceheight)) { 1027 if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= r[a->colidx[j]]; 1028 } 1029 } else { 1030 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j++) a->val[j] *= r[a->colidx[j]]; 1031 } 1032 } 1033 PetscCall(VecRestoreArrayRead(rr, &r)); 1034 PetscCall(PetscLogFlops(a->nz)); 1035 } 1036 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 1037 #if defined(PETSC_HAVE_CUPM) 1038 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU; 1039 #endif 1040 PetscFunctionReturn(PETSC_SUCCESS); 1041 } 1042 1043 PetscErrorCode MatGetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[]) 1044 { 1045 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1046 PetscInt *cp, i, k, low, high, t, row, col, l; 1047 PetscInt shift; 1048 MatScalar *vp; 1049 1050 PetscFunctionBegin; 1051 for (k = 0; k < m; k++) { /* loop over requested rows */ 1052 row = im[k]; 1053 if (row < 0) continue; 1054 PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1); 1055 shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight); /* starting index of the row */ 1056 cp = a->colidx + shift; /* pointer to the row */ 1057 vp = a->val + shift; /* pointer to the row */ 1058 for (l = 0; l < n; l++) { /* loop over requested columns */ 1059 col = in[l]; 1060 if (col < 0) continue; 1061 PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1); 1062 high = a->rlen[row]; 1063 low = 0; /* assume unsorted */ 1064 while (high - low > 5) { 1065 t = (low + high) / 2; 1066 if (*(cp + a->sliceheight * t) > col) high = t; 1067 else low = t; 1068 } 1069 for (i = low; i < high; i++) { 1070 if (*(cp + a->sliceheight * i) > col) break; 1071 if (*(cp + a->sliceheight * i) == col) { 1072 *v++ = *(vp + a->sliceheight * i); 1073 goto finished; 1074 } 1075 } 1076 *v++ = 0.0; 1077 finished:; 1078 } 1079 } 1080 PetscFunctionReturn(PETSC_SUCCESS); 1081 } 1082 1083 static PetscErrorCode MatView_SeqSELL_ASCII(Mat A, PetscViewer viewer) 1084 { 1085 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1086 PetscInt i, j, m = A->rmap->n, shift; 1087 const char *name; 1088 PetscViewerFormat format; 1089 1090 PetscFunctionBegin; 1091 PetscCall(PetscViewerGetFormat(viewer, &format)); 1092 if (format == PETSC_VIEWER_ASCII_MATLAB) { 1093 PetscInt nofinalvalue = 0; 1094 /* 1095 if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) { 1096 nofinalvalue = 1; 1097 } 1098 */ 1099 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1100 PetscCall(PetscViewerASCIIPrintf(viewer, "%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n", m, A->cmap->n)); 1101 PetscCall(PetscViewerASCIIPrintf(viewer, "%% Nonzeros = %" PetscInt_FMT " \n", a->nz)); 1102 #if defined(PETSC_USE_COMPLEX) 1103 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",4);\n", a->nz + nofinalvalue)); 1104 #else 1105 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",3);\n", a->nz + nofinalvalue)); 1106 #endif 1107 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = [\n")); 1108 1109 for (i = 0; i < m; i++) { 1110 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1111 for (j = 0; j < a->rlen[i]; j++) { 1112 #if defined(PETSC_USE_COMPLEX) 1113 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1114 #else 1115 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)a->val[shift + a->sliceheight * j])); 1116 #endif 1117 } 1118 } 1119 /* 1120 if (nofinalvalue) { 1121 #if defined(PETSC_USE_COMPLEX) 1122 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n",m,A->cmap->n,0.,0.)); 1123 #else 1124 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n",m,A->cmap->n,0.0)); 1125 #endif 1126 } 1127 */ 1128 PetscCall(PetscObjectGetName((PetscObject)A, &name)); 1129 PetscCall(PetscViewerASCIIPrintf(viewer, "];\n %s = spconvert(zzz);\n", name)); 1130 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1131 } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) { 1132 PetscFunctionReturn(PETSC_SUCCESS); 1133 } else if (format == PETSC_VIEWER_ASCII_COMMON) { 1134 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1135 for (i = 0; i < m; i++) { 1136 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1137 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1138 for (j = 0; j < a->rlen[i]; j++) { 1139 #if defined(PETSC_USE_COMPLEX) 1140 if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) { 1141 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1142 } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) { 1143 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1144 } else if (PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) { 1145 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]))); 1146 } 1147 #else 1148 if (a->val[shift + a->sliceheight * j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j])); 1149 #endif 1150 } 1151 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1152 } 1153 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1154 } else if (format == PETSC_VIEWER_ASCII_DENSE) { 1155 PetscInt cnt = 0, jcnt; 1156 PetscScalar value; 1157 #if defined(PETSC_USE_COMPLEX) 1158 PetscBool realonly = PETSC_TRUE; 1159 for (i = 0; i < a->sliidx[a->totalslices]; i++) { 1160 if (PetscImaginaryPart(a->val[i]) != 0.0) { 1161 realonly = PETSC_FALSE; 1162 break; 1163 } 1164 } 1165 #endif 1166 1167 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1168 for (i = 0; i < m; i++) { 1169 jcnt = 0; 1170 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1171 for (j = 0; j < A->cmap->n; j++) { 1172 if (jcnt < a->rlen[i] && j == a->colidx[shift + a->sliceheight * j]) { 1173 value = a->val[cnt++]; 1174 jcnt++; 1175 } else { 1176 value = 0.0; 1177 } 1178 #if defined(PETSC_USE_COMPLEX) 1179 if (realonly) { 1180 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)PetscRealPart(value))); 1181 } else { 1182 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e+%7.5e i ", (double)PetscRealPart(value), (double)PetscImaginaryPart(value))); 1183 } 1184 #else 1185 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)value)); 1186 #endif 1187 } 1188 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1189 } 1190 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1191 } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) { 1192 PetscInt fshift = 1; 1193 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1194 #if defined(PETSC_USE_COMPLEX) 1195 PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate complex general\n")); 1196 #else 1197 PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate real general\n")); 1198 #endif 1199 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz)); 1200 for (i = 0; i < m; i++) { 1201 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1202 for (j = 0; j < a->rlen[i]; j++) { 1203 #if defined(PETSC_USE_COMPLEX) 1204 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1205 #else 1206 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)a->val[shift + a->sliceheight * j])); 1207 #endif 1208 } 1209 } 1210 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1211 } else if (format == PETSC_VIEWER_NATIVE) { 1212 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 1213 PetscInt row; 1214 PetscCall(PetscViewerASCIIPrintf(viewer, "slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n", i, a->sliidx[i], a->sliidx[i + 1])); 1215 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) { 1216 #if defined(PETSC_USE_COMPLEX) 1217 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1218 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1219 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1220 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), -(double)PetscImaginaryPart(a->val[j]))); 1221 } else { 1222 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]))); 1223 } 1224 #else 1225 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)a->val[j])); 1226 #endif 1227 } 1228 } 1229 } else { 1230 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1231 if (A->factortype) { 1232 for (i = 0; i < m; i++) { 1233 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1234 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1235 /* L part */ 1236 for (j = shift; j < a->diag[i]; j += a->sliceheight) { 1237 #if defined(PETSC_USE_COMPLEX) 1238 if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0) { 1239 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1240 } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0) { 1241 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j])))); 1242 } else { 1243 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j]))); 1244 } 1245 #else 1246 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j])); 1247 #endif 1248 } 1249 /* diagonal */ 1250 j = a->diag[i]; 1251 #if defined(PETSC_USE_COMPLEX) 1252 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1253 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)PetscImaginaryPart(1.0 / a->val[j]))); 1254 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1255 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)(-PetscImaginaryPart(1.0 / a->val[j])))); 1256 } else { 1257 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]))); 1258 } 1259 #else 1260 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)(1 / a->val[j]))); 1261 #endif 1262 1263 /* U part */ 1264 for (j = a->diag[i] + 1; j < shift + a->sliceheight * a->rlen[i]; j += a->sliceheight) { 1265 #if defined(PETSC_USE_COMPLEX) 1266 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1267 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1268 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1269 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j])))); 1270 } else { 1271 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j]))); 1272 } 1273 #else 1274 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j])); 1275 #endif 1276 } 1277 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1278 } 1279 } else { 1280 for (i = 0; i < m; i++) { 1281 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1282 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1283 for (j = 0; j < a->rlen[i]; j++) { 1284 #if defined(PETSC_USE_COMPLEX) 1285 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1286 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1287 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1288 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1289 } else { 1290 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]))); 1291 } 1292 #else 1293 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j])); 1294 #endif 1295 } 1296 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1297 } 1298 } 1299 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1300 } 1301 PetscCall(PetscViewerFlush(viewer)); 1302 PetscFunctionReturn(PETSC_SUCCESS); 1303 } 1304 1305 #include <petscdraw.h> 1306 static PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw, void *Aa) 1307 { 1308 Mat A = (Mat)Aa; 1309 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1310 PetscInt i, j, m = A->rmap->n, shift; 1311 int color; 1312 PetscReal xl, yl, xr, yr, x_l, x_r, y_l, y_r; 1313 PetscViewer viewer; 1314 PetscViewerFormat format; 1315 1316 PetscFunctionBegin; 1317 PetscCall(PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer)); 1318 PetscCall(PetscViewerGetFormat(viewer, &format)); 1319 PetscCall(PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr)); 1320 1321 /* loop over matrix elements drawing boxes */ 1322 1323 if (format != PETSC_VIEWER_DRAW_CONTOUR) { 1324 PetscDrawCollectiveBegin(draw); 1325 /* Blue for negative, Cyan for zero and Red for positive */ 1326 color = PETSC_DRAW_BLUE; 1327 for (i = 0; i < m; i++) { 1328 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1329 y_l = m - i - 1.0; 1330 y_r = y_l + 1.0; 1331 for (j = 0; j < a->rlen[i]; j++) { 1332 x_l = a->colidx[shift + a->sliceheight * j]; 1333 x_r = x_l + 1.0; 1334 if (PetscRealPart(a->val[shift + a->sliceheight * j]) >= 0.) continue; 1335 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1336 } 1337 } 1338 color = PETSC_DRAW_CYAN; 1339 for (i = 0; i < m; i++) { 1340 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1341 y_l = m - i - 1.0; 1342 y_r = y_l + 1.0; 1343 for (j = 0; j < a->rlen[i]; j++) { 1344 x_l = a->colidx[shift + a->sliceheight * j]; 1345 x_r = x_l + 1.0; 1346 if (a->val[shift + a->sliceheight * j] != 0.) continue; 1347 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1348 } 1349 } 1350 color = PETSC_DRAW_RED; 1351 for (i = 0; i < m; i++) { 1352 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1353 y_l = m - i - 1.0; 1354 y_r = y_l + 1.0; 1355 for (j = 0; j < a->rlen[i]; j++) { 1356 x_l = a->colidx[shift + a->sliceheight * j]; 1357 x_r = x_l + 1.0; 1358 if (PetscRealPart(a->val[shift + a->sliceheight * j]) <= 0.) continue; 1359 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1360 } 1361 } 1362 PetscDrawCollectiveEnd(draw); 1363 } else { 1364 /* use contour shading to indicate magnitude of values */ 1365 /* first determine max of all nonzero values */ 1366 PetscReal minv = 0.0, maxv = 0.0; 1367 PetscInt count = 0; 1368 PetscDraw popup; 1369 for (i = 0; i < a->sliidx[a->totalslices]; i++) { 1370 if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]); 1371 } 1372 if (minv >= maxv) maxv = minv + PETSC_SMALL; 1373 PetscCall(PetscDrawGetPopup(draw, &popup)); 1374 PetscCall(PetscDrawScalePopup(popup, minv, maxv)); 1375 1376 PetscDrawCollectiveBegin(draw); 1377 for (i = 0; i < m; i++) { 1378 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1379 y_l = m - i - 1.0; 1380 y_r = y_l + 1.0; 1381 for (j = 0; j < a->rlen[i]; j++) { 1382 x_l = a->colidx[shift + a->sliceheight * j]; 1383 x_r = x_l + 1.0; 1384 color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]), minv, maxv); 1385 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1386 count++; 1387 } 1388 } 1389 PetscDrawCollectiveEnd(draw); 1390 } 1391 PetscFunctionReturn(PETSC_SUCCESS); 1392 } 1393 1394 #include <petscdraw.h> 1395 static PetscErrorCode MatView_SeqSELL_Draw(Mat A, PetscViewer viewer) 1396 { 1397 PetscDraw draw; 1398 PetscReal xr, yr, xl, yl, h, w; 1399 PetscBool isnull; 1400 1401 PetscFunctionBegin; 1402 PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw)); 1403 PetscCall(PetscDrawIsNull(draw, &isnull)); 1404 if (isnull) PetscFunctionReturn(PETSC_SUCCESS); 1405 1406 xr = A->cmap->n; 1407 yr = A->rmap->n; 1408 h = yr / 10.0; 1409 w = xr / 10.0; 1410 xr += w; 1411 yr += h; 1412 xl = -w; 1413 yl = -h; 1414 PetscCall(PetscDrawSetCoordinates(draw, xl, yl, xr, yr)); 1415 PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer)); 1416 PetscCall(PetscDrawZoom(draw, MatView_SeqSELL_Draw_Zoom, A)); 1417 PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL)); 1418 PetscCall(PetscDrawSave(draw)); 1419 PetscFunctionReturn(PETSC_SUCCESS); 1420 } 1421 1422 PetscErrorCode MatView_SeqSELL(Mat A, PetscViewer viewer) 1423 { 1424 PetscBool iascii, isbinary, isdraw; 1425 1426 PetscFunctionBegin; 1427 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii)); 1428 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary)); 1429 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw)); 1430 if (iascii) { 1431 PetscCall(MatView_SeqSELL_ASCII(A, viewer)); 1432 } else if (isbinary) { 1433 /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */ 1434 } else if (isdraw) PetscCall(MatView_SeqSELL_Draw(A, viewer)); 1435 PetscFunctionReturn(PETSC_SUCCESS); 1436 } 1437 1438 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A, MatAssemblyType mode) 1439 { 1440 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1441 PetscInt i, shift, row_in_slice, row, nrow, *cp, lastcol, j, k; 1442 MatScalar *vp; 1443 #if defined(PETSC_HAVE_CUPM) 1444 PetscInt totalchunks = 0; 1445 #endif 1446 1447 PetscFunctionBegin; 1448 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 1449 /* To do: compress out the unused elements */ 1450 PetscCall(MatMarkDiagonal_SeqSELL(A)); 1451 PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n", A->rmap->n, A->cmap->n, a->maxallocmat, a->sliidx[a->totalslices], a->nz, a->sliidx[a->totalslices] - a->nz)); 1452 PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n", a->reallocs)); 1453 PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rlenmax)); 1454 a->nonzerorowcnt = 0; 1455 /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */ 1456 for (i = 0; i < a->totalslices; ++i) { 1457 shift = a->sliidx[i]; /* starting index of the slice */ 1458 cp = PetscSafePointerPlusOffset(a->colidx, shift); /* pointer to the column indices of the slice */ 1459 vp = PetscSafePointerPlusOffset(a->val, shift); /* pointer to the nonzero values of the slice */ 1460 for (row_in_slice = 0; row_in_slice < a->sliceheight; ++row_in_slice) { /* loop over rows in the slice */ 1461 row = a->sliceheight * i + row_in_slice; 1462 nrow = a->rlen[row]; /* number of nonzeros in row */ 1463 /* 1464 Search for the nearest nonzero. Normally setting the index to zero may cause extra communication. 1465 But if the entire slice are empty, it is fine to use 0 since the index will not be loaded. 1466 */ 1467 lastcol = 0; 1468 if (nrow > 0) { /* nonempty row */ 1469 a->nonzerorowcnt++; 1470 lastcol = cp[a->sliceheight * (nrow - 1) + row_in_slice]; /* use the index from the last nonzero at current row */ 1471 } else if (!row_in_slice) { /* first row of the correct slice is empty */ 1472 for (j = 1; j < a->sliceheight; j++) { 1473 if (a->rlen[a->sliceheight * i + j]) { 1474 lastcol = cp[j]; 1475 break; 1476 } 1477 } 1478 } else { 1479 if (a->sliidx[i + 1] != shift) lastcol = cp[row_in_slice - 1]; /* use the index from the previous row */ 1480 } 1481 1482 for (k = nrow; k < (a->sliidx[i + 1] - shift) / a->sliceheight; ++k) { 1483 cp[a->sliceheight * k + row_in_slice] = lastcol; 1484 vp[a->sliceheight * k + row_in_slice] = (MatScalar)0; 1485 } 1486 } 1487 } 1488 1489 A->info.mallocs += a->reallocs; 1490 a->reallocs = 0; 1491 1492 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 1493 #if defined(PETSC_HAVE_CUPM) 1494 if (!a->chunksize && a->totalslices) { 1495 a->chunksize = 64; 1496 while (a->chunksize < 1024 && 2 * a->chunksize <= a->sliidx[a->totalslices] / a->totalslices) a->chunksize *= 2; 1497 totalchunks = 1 + (a->sliidx[a->totalslices] - 1) / a->chunksize; 1498 } 1499 if (totalchunks != a->totalchunks) { 1500 PetscCall(PetscFree(a->chunk_slice_map)); 1501 PetscCall(PetscMalloc1(totalchunks, &a->chunk_slice_map)); 1502 a->totalchunks = totalchunks; 1503 } 1504 j = 0; 1505 for (i = 0; i < totalchunks; i++) { 1506 while (a->sliidx[j + 1] <= i * a->chunksize && j < a->totalslices) j++; 1507 a->chunk_slice_map[i] = j; 1508 } 1509 #endif 1510 PetscFunctionReturn(PETSC_SUCCESS); 1511 } 1512 1513 PetscErrorCode MatGetInfo_SeqSELL(Mat A, MatInfoType flag, MatInfo *info) 1514 { 1515 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1516 1517 PetscFunctionBegin; 1518 info->block_size = 1.0; 1519 info->nz_allocated = a->maxallocmat; 1520 info->nz_used = a->sliidx[a->totalslices]; /* include padding zeros */ 1521 info->nz_unneeded = (a->maxallocmat - a->sliidx[a->totalslices]); 1522 info->assemblies = A->num_ass; 1523 info->mallocs = A->info.mallocs; 1524 info->memory = 0; /* REVIEW ME */ 1525 if (A->factortype) { 1526 info->fill_ratio_given = A->info.fill_ratio_given; 1527 info->fill_ratio_needed = A->info.fill_ratio_needed; 1528 info->factor_mallocs = A->info.factor_mallocs; 1529 } else { 1530 info->fill_ratio_given = 0; 1531 info->fill_ratio_needed = 0; 1532 info->factor_mallocs = 0; 1533 } 1534 PetscFunctionReturn(PETSC_SUCCESS); 1535 } 1536 1537 PetscErrorCode MatSetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is) 1538 { 1539 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1540 PetscInt shift, i, k, l, low, high, t, ii, row, col, nrow; 1541 PetscInt *cp, nonew = a->nonew, lastcol = -1; 1542 MatScalar *vp, value; 1543 #if defined(PETSC_HAVE_CUPM) 1544 PetscBool inserted = PETSC_FALSE; 1545 PetscInt mul = DEVICE_MEM_ALIGN / a->sliceheight; 1546 #endif 1547 1548 PetscFunctionBegin; 1549 for (k = 0; k < m; k++) { /* loop over added rows */ 1550 row = im[k]; 1551 if (row < 0) continue; 1552 PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1); 1553 shift = a->sliidx[row / a->sliceheight] + row % a->sliceheight; /* starting index of the row */ 1554 cp = a->colidx + shift; /* pointer to the row */ 1555 vp = a->val + shift; /* pointer to the row */ 1556 nrow = a->rlen[row]; 1557 low = 0; 1558 high = nrow; 1559 1560 for (l = 0; l < n; l++) { /* loop over added columns */ 1561 col = in[l]; 1562 if (col < 0) continue; 1563 PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1); 1564 if (a->roworiented) { 1565 value = v[l + k * n]; 1566 } else { 1567 value = v[k + l * m]; 1568 } 1569 if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue; 1570 1571 /* search in this row for the specified column, i indicates the column to be set */ 1572 if (col <= lastcol) low = 0; 1573 else high = nrow; 1574 lastcol = col; 1575 while (high - low > 5) { 1576 t = (low + high) / 2; 1577 if (*(cp + a->sliceheight * t) > col) high = t; 1578 else low = t; 1579 } 1580 for (i = low; i < high; i++) { 1581 if (*(cp + a->sliceheight * i) > col) break; 1582 if (*(cp + a->sliceheight * i) == col) { 1583 if (is == ADD_VALUES) *(vp + a->sliceheight * i) += value; 1584 else *(vp + a->sliceheight * i) = value; 1585 #if defined(PETSC_HAVE_CUPM) 1586 inserted = PETSC_TRUE; 1587 #endif 1588 low = i + 1; 1589 goto noinsert; 1590 } 1591 } 1592 if (value == 0.0 && a->ignorezeroentries) goto noinsert; 1593 if (nonew == 1) goto noinsert; 1594 PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col); 1595 #if defined(PETSC_HAVE_CUPM) 1596 MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar, mul); 1597 #else 1598 /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */ 1599 MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar, 1); 1600 #endif 1601 /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */ 1602 for (ii = nrow - 1; ii >= i; ii--) { 1603 *(cp + a->sliceheight * (ii + 1)) = *(cp + a->sliceheight * ii); 1604 *(vp + a->sliceheight * (ii + 1)) = *(vp + a->sliceheight * ii); 1605 } 1606 a->rlen[row]++; 1607 *(cp + a->sliceheight * i) = col; 1608 *(vp + a->sliceheight * i) = value; 1609 a->nz++; 1610 #if defined(PETSC_HAVE_CUPM) 1611 inserted = PETSC_TRUE; 1612 #endif 1613 low = i + 1; 1614 high++; 1615 nrow++; 1616 noinsert:; 1617 } 1618 a->rlen[row] = nrow; 1619 } 1620 #if defined(PETSC_HAVE_CUPM) 1621 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && inserted) A->offloadmask = PETSC_OFFLOAD_CPU; 1622 #endif 1623 PetscFunctionReturn(PETSC_SUCCESS); 1624 } 1625 1626 PetscErrorCode MatCopy_SeqSELL(Mat A, Mat B, MatStructure str) 1627 { 1628 PetscFunctionBegin; 1629 /* If the two matrices have the same copy implementation, use fast copy. */ 1630 if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) { 1631 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1632 Mat_SeqSELL *b = (Mat_SeqSELL *)B->data; 1633 1634 PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices], PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Number of nonzeros in two matrices are different"); 1635 PetscCall(PetscArraycpy(b->val, a->val, a->sliidx[a->totalslices])); 1636 } else { 1637 PetscCall(MatCopy_Basic(A, B, str)); 1638 } 1639 PetscFunctionReturn(PETSC_SUCCESS); 1640 } 1641 1642 PetscErrorCode MatSetUp_SeqSELL(Mat A) 1643 { 1644 PetscFunctionBegin; 1645 PetscCall(MatSeqSELLSetPreallocation(A, PETSC_DEFAULT, NULL)); 1646 PetscFunctionReturn(PETSC_SUCCESS); 1647 } 1648 1649 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A, PetscScalar *array[]) 1650 { 1651 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1652 1653 PetscFunctionBegin; 1654 *array = a->val; 1655 PetscFunctionReturn(PETSC_SUCCESS); 1656 } 1657 1658 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A, PetscScalar *array[]) 1659 { 1660 PetscFunctionBegin; 1661 PetscFunctionReturn(PETSC_SUCCESS); 1662 } 1663 1664 PetscErrorCode MatScale_SeqSELL(Mat inA, PetscScalar alpha) 1665 { 1666 Mat_SeqSELL *a = (Mat_SeqSELL *)inA->data; 1667 MatScalar *aval = a->val; 1668 PetscScalar oalpha = alpha; 1669 PetscBLASInt one = 1, size; 1670 1671 PetscFunctionBegin; 1672 PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices], &size)); 1673 PetscCallBLAS("BLASscal", BLASscal_(&size, &oalpha, aval, &one)); 1674 PetscCall(PetscLogFlops(a->nz)); 1675 PetscCall(MatSeqSELLInvalidateDiagonal(inA)); 1676 #if defined(PETSC_HAVE_CUPM) 1677 if (inA->offloadmask != PETSC_OFFLOAD_UNALLOCATED) inA->offloadmask = PETSC_OFFLOAD_CPU; 1678 #endif 1679 PetscFunctionReturn(PETSC_SUCCESS); 1680 } 1681 1682 PetscErrorCode MatShift_SeqSELL(Mat Y, PetscScalar a) 1683 { 1684 Mat_SeqSELL *y = (Mat_SeqSELL *)Y->data; 1685 1686 PetscFunctionBegin; 1687 if (!Y->preallocated || !y->nz) PetscCall(MatSeqSELLSetPreallocation(Y, 1, NULL)); 1688 PetscCall(MatShift_Basic(Y, a)); 1689 PetscFunctionReturn(PETSC_SUCCESS); 1690 } 1691 1692 PetscErrorCode MatSOR_SeqSELL(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx) 1693 { 1694 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1695 PetscScalar *x, sum, *t; 1696 const MatScalar *idiag = NULL, *mdiag; 1697 const PetscScalar *b, *xb; 1698 PetscInt n, m = A->rmap->n, i, j, shift; 1699 const PetscInt *diag; 1700 1701 PetscFunctionBegin; 1702 its = its * lits; 1703 1704 if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */ 1705 if (!a->idiagvalid) PetscCall(MatInvertDiagonal_SeqSELL(A, omega, fshift)); 1706 a->fshift = fshift; 1707 a->omega = omega; 1708 1709 diag = a->diag; 1710 t = a->ssor_work; 1711 idiag = a->idiag; 1712 mdiag = a->mdiag; 1713 1714 PetscCall(VecGetArray(xx, &x)); 1715 PetscCall(VecGetArrayRead(bb, &b)); 1716 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 1717 PetscCheck(flag != SOR_APPLY_UPPER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_UPPER is not implemented"); 1718 PetscCheck(flag != SOR_APPLY_LOWER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_LOWER is not implemented"); 1719 PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat"); 1720 1721 if (flag & SOR_ZERO_INITIAL_GUESS) { 1722 if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) { 1723 for (i = 0; i < m; i++) { 1724 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1725 sum = b[i]; 1726 n = (diag[i] - shift) / a->sliceheight; 1727 for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]]; 1728 t[i] = sum; 1729 x[i] = sum * idiag[i]; 1730 } 1731 xb = t; 1732 PetscCall(PetscLogFlops(a->nz)); 1733 } else xb = b; 1734 if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1735 for (i = m - 1; i >= 0; i--) { 1736 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1737 sum = xb[i]; 1738 n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1; 1739 for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]]; 1740 if (xb == b) { 1741 x[i] = sum * idiag[i]; 1742 } else { 1743 x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */ 1744 } 1745 } 1746 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */ 1747 } 1748 its--; 1749 } 1750 while (its--) { 1751 if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) { 1752 for (i = 0; i < m; i++) { 1753 /* lower */ 1754 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1755 sum = b[i]; 1756 n = (diag[i] - shift) / a->sliceheight; 1757 for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]]; 1758 t[i] = sum; /* save application of the lower-triangular part */ 1759 /* upper */ 1760 n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1; 1761 for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]]; 1762 x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */ 1763 } 1764 xb = t; 1765 PetscCall(PetscLogFlops(2.0 * a->nz)); 1766 } else xb = b; 1767 if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1768 for (i = m - 1; i >= 0; i--) { 1769 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1770 sum = xb[i]; 1771 if (xb == b) { 1772 /* whole matrix (no checkpointing available) */ 1773 n = a->rlen[i]; 1774 for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]]; 1775 x[i] = (1. - omega) * x[i] + (sum + mdiag[i] * x[i]) * idiag[i]; 1776 } else { /* lower-triangular part has been saved, so only apply upper-triangular */ 1777 n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1; 1778 for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]]; 1779 x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */ 1780 } 1781 } 1782 if (xb == b) { 1783 PetscCall(PetscLogFlops(2.0 * a->nz)); 1784 } else { 1785 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */ 1786 } 1787 } 1788 } 1789 PetscCall(VecRestoreArray(xx, &x)); 1790 PetscCall(VecRestoreArrayRead(bb, &b)); 1791 PetscFunctionReturn(PETSC_SUCCESS); 1792 } 1793 1794 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL, 1795 MatGetRow_SeqSELL, 1796 MatRestoreRow_SeqSELL, 1797 MatMult_SeqSELL, 1798 /* 4*/ MatMultAdd_SeqSELL, 1799 MatMultTranspose_SeqSELL, 1800 MatMultTransposeAdd_SeqSELL, 1801 NULL, 1802 NULL, 1803 NULL, 1804 /* 10*/ NULL, 1805 NULL, 1806 NULL, 1807 MatSOR_SeqSELL, 1808 NULL, 1809 /* 15*/ MatGetInfo_SeqSELL, 1810 MatEqual_SeqSELL, 1811 MatGetDiagonal_SeqSELL, 1812 MatDiagonalScale_SeqSELL, 1813 NULL, 1814 /* 20*/ NULL, 1815 MatAssemblyEnd_SeqSELL, 1816 MatSetOption_SeqSELL, 1817 MatZeroEntries_SeqSELL, 1818 /* 24*/ NULL, 1819 NULL, 1820 NULL, 1821 NULL, 1822 NULL, 1823 /* 29*/ MatSetUp_SeqSELL, 1824 NULL, 1825 NULL, 1826 NULL, 1827 NULL, 1828 /* 34*/ MatDuplicate_SeqSELL, 1829 NULL, 1830 NULL, 1831 NULL, 1832 NULL, 1833 /* 39*/ NULL, 1834 NULL, 1835 NULL, 1836 MatGetValues_SeqSELL, 1837 MatCopy_SeqSELL, 1838 /* 44*/ NULL, 1839 MatScale_SeqSELL, 1840 MatShift_SeqSELL, 1841 NULL, 1842 NULL, 1843 /* 49*/ NULL, 1844 NULL, 1845 NULL, 1846 NULL, 1847 NULL, 1848 /* 54*/ MatFDColoringCreate_SeqXAIJ, 1849 NULL, 1850 NULL, 1851 NULL, 1852 NULL, 1853 /* 59*/ NULL, 1854 MatDestroy_SeqSELL, 1855 MatView_SeqSELL, 1856 NULL, 1857 NULL, 1858 /* 64*/ NULL, 1859 NULL, 1860 NULL, 1861 NULL, 1862 NULL, 1863 /* 69*/ NULL, 1864 NULL, 1865 NULL, 1866 NULL, 1867 NULL, 1868 /* 74*/ NULL, 1869 MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */ 1870 NULL, 1871 NULL, 1872 NULL, 1873 /* 79*/ NULL, 1874 NULL, 1875 NULL, 1876 NULL, 1877 NULL, 1878 /* 84*/ NULL, 1879 NULL, 1880 NULL, 1881 NULL, 1882 NULL, 1883 /* 89*/ NULL, 1884 NULL, 1885 NULL, 1886 NULL, 1887 NULL, 1888 /* 94*/ NULL, 1889 NULL, 1890 NULL, 1891 NULL, 1892 NULL, 1893 /* 99*/ NULL, 1894 NULL, 1895 NULL, 1896 MatConjugate_SeqSELL, 1897 NULL, 1898 /*104*/ NULL, 1899 NULL, 1900 NULL, 1901 NULL, 1902 NULL, 1903 /*109*/ NULL, 1904 NULL, 1905 NULL, 1906 NULL, 1907 MatMissingDiagonal_SeqSELL, 1908 /*114*/ NULL, 1909 NULL, 1910 NULL, 1911 NULL, 1912 NULL, 1913 /*119*/ NULL, 1914 NULL, 1915 NULL, 1916 NULL, 1917 NULL, 1918 /*124*/ NULL, 1919 NULL, 1920 NULL, 1921 NULL, 1922 NULL, 1923 /*129*/ NULL, 1924 NULL, 1925 NULL, 1926 NULL, 1927 NULL, 1928 /*134*/ NULL, 1929 NULL, 1930 NULL, 1931 NULL, 1932 NULL, 1933 /*139*/ NULL, 1934 NULL, 1935 NULL, 1936 MatFDColoringSetUp_SeqXAIJ, 1937 NULL, 1938 /*144*/ NULL, 1939 NULL, 1940 NULL, 1941 NULL, 1942 NULL, 1943 NULL, 1944 /*150*/ NULL, 1945 NULL, 1946 NULL, 1947 NULL, 1948 NULL, 1949 /*155*/ NULL, 1950 NULL}; 1951 1952 static PetscErrorCode MatStoreValues_SeqSELL(Mat mat) 1953 { 1954 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1955 1956 PetscFunctionBegin; 1957 PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first"); 1958 1959 /* allocate space for values if not already there */ 1960 if (!a->saved_values) PetscCall(PetscMalloc1(a->sliidx[a->totalslices] + 1, &a->saved_values)); 1961 1962 /* copy values over */ 1963 PetscCall(PetscArraycpy(a->saved_values, a->val, a->sliidx[a->totalslices])); 1964 PetscFunctionReturn(PETSC_SUCCESS); 1965 } 1966 1967 static PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat) 1968 { 1969 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1970 1971 PetscFunctionBegin; 1972 PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first"); 1973 PetscCheck(a->saved_values, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatStoreValues(A);first"); 1974 PetscCall(PetscArraycpy(a->val, a->saved_values, a->sliidx[a->totalslices])); 1975 PetscFunctionReturn(PETSC_SUCCESS); 1976 } 1977 1978 static PetscErrorCode MatSeqSELLGetFillRatio_SeqSELL(Mat mat, PetscReal *ratio) 1979 { 1980 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1981 1982 PetscFunctionBegin; 1983 if (a->totalslices && a->sliidx[a->totalslices]) { 1984 *ratio = (PetscReal)(a->sliidx[a->totalslices] - a->nz) / a->sliidx[a->totalslices]; 1985 } else { 1986 *ratio = 0.0; 1987 } 1988 PetscFunctionReturn(PETSC_SUCCESS); 1989 } 1990 1991 static PetscErrorCode MatSeqSELLGetMaxSliceWidth_SeqSELL(Mat mat, PetscInt *slicewidth) 1992 { 1993 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1994 PetscInt i, current_slicewidth; 1995 1996 PetscFunctionBegin; 1997 *slicewidth = 0; 1998 for (i = 0; i < a->totalslices; i++) { 1999 current_slicewidth = (a->sliidx[i + 1] - a->sliidx[i]) / a->sliceheight; 2000 if (current_slicewidth > *slicewidth) *slicewidth = current_slicewidth; 2001 } 2002 PetscFunctionReturn(PETSC_SUCCESS); 2003 } 2004 2005 static PetscErrorCode MatSeqSELLGetAvgSliceWidth_SeqSELL(Mat mat, PetscReal *slicewidth) 2006 { 2007 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 2008 2009 PetscFunctionBegin; 2010 *slicewidth = 0; 2011 if (a->totalslices) { *slicewidth = (PetscReal)a->sliidx[a->totalslices] / a->sliceheight / a->totalslices; } 2012 PetscFunctionReturn(PETSC_SUCCESS); 2013 } 2014 2015 static PetscErrorCode MatSeqSELLGetVarSliceSize_SeqSELL(Mat mat, PetscReal *variance) 2016 { 2017 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 2018 PetscReal mean; 2019 PetscInt i, totalslices = a->totalslices, *sliidx = a->sliidx; 2020 2021 PetscFunctionBegin; 2022 *variance = 0; 2023 if (totalslices) { 2024 mean = (PetscReal)sliidx[totalslices] / totalslices; 2025 for (i = 1; i <= totalslices; i++) { *variance += ((PetscReal)(sliidx[i] - sliidx[i - 1]) - mean) * ((PetscReal)(sliidx[i] - sliidx[i - 1]) - mean) / totalslices; } 2026 } 2027 PetscFunctionReturn(PETSC_SUCCESS); 2028 } 2029 2030 static PetscErrorCode MatSeqSELLSetSliceHeight_SeqSELL(Mat A, PetscInt sliceheight) 2031 { 2032 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 2033 2034 PetscFunctionBegin; 2035 if (A->preallocated) PetscFunctionReturn(PETSC_SUCCESS); 2036 PetscCheck(a->sliceheight <= 0 || a->sliceheight == sliceheight, PETSC_COMM_SELF, PETSC_ERR_SUP, "Cannot change slice height %" PetscInt_FMT " to %" PetscInt_FMT, a->sliceheight, sliceheight); 2037 a->sliceheight = sliceheight; 2038 #if defined(PETSC_HAVE_CUPM) 2039 PetscCheck(PetscMax(DEVICE_MEM_ALIGN, sliceheight) % PetscMin(DEVICE_MEM_ALIGN, sliceheight) == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The slice height is not compatible with DEVICE_MEM_ALIGN (one must be divisible by the other) %" PetscInt_FMT, sliceheight); 2040 #endif 2041 PetscFunctionReturn(PETSC_SUCCESS); 2042 } 2043 2044 /*@ 2045 MatSeqSELLGetFillRatio - returns a ratio that indicates the irregularity of the matrix. 2046 2047 Not Collective 2048 2049 Input Parameter: 2050 . A - a MATSEQSELL matrix 2051 2052 Output Parameter: 2053 . ratio - ratio of number of padded zeros to number of allocated elements 2054 2055 Level: intermediate 2056 2057 .seealso: `MATSEQSELL`, `MatSeqSELLGetAvgSliceWidth()` 2058 @*/ 2059 PetscErrorCode MatSeqSELLGetFillRatio(Mat A, PetscReal *ratio) 2060 { 2061 PetscFunctionBegin; 2062 PetscUseMethod(A, "MatSeqSELLGetFillRatio_C", (Mat, PetscReal *), (A, ratio)); 2063 PetscFunctionReturn(PETSC_SUCCESS); 2064 } 2065 2066 /*@ 2067 MatSeqSELLGetMaxSliceWidth - returns the maximum slice width. 2068 2069 Not Collective 2070 2071 Input Parameter: 2072 . A - a MATSEQSELL matrix 2073 2074 Output Parameter: 2075 . slicewidth - maximum slice width 2076 2077 Level: intermediate 2078 2079 .seealso: `MATSEQSELL`, `MatSeqSELLGetAvgSliceWidth()` 2080 @*/ 2081 PetscErrorCode MatSeqSELLGetMaxSliceWidth(Mat A, PetscInt *slicewidth) 2082 { 2083 PetscFunctionBegin; 2084 PetscUseMethod(A, "MatSeqSELLGetMaxSliceWidth_C", (Mat, PetscInt *), (A, slicewidth)); 2085 PetscFunctionReturn(PETSC_SUCCESS); 2086 } 2087 2088 /*@ 2089 MatSeqSELLGetAvgSliceWidth - returns the average slice width. 2090 2091 Not Collective 2092 2093 Input Parameter: 2094 . A - a MATSEQSELL matrix 2095 2096 Output Parameter: 2097 . slicewidth - average slice width 2098 2099 Level: intermediate 2100 2101 .seealso: `MATSEQSELL`, `MatSeqSELLGetMaxSliceWidth()` 2102 @*/ 2103 PetscErrorCode MatSeqSELLGetAvgSliceWidth(Mat A, PetscReal *slicewidth) 2104 { 2105 PetscFunctionBegin; 2106 PetscUseMethod(A, "MatSeqSELLGetAvgSliceWidth_C", (Mat, PetscReal *), (A, slicewidth)); 2107 PetscFunctionReturn(PETSC_SUCCESS); 2108 } 2109 2110 /*@ 2111 MatSeqSELLSetSliceHeight - sets the slice height. 2112 2113 Not Collective 2114 2115 Input Parameters: 2116 + A - a MATSEQSELL matrix 2117 - sliceheight - slice height 2118 2119 Notes: 2120 You cannot change the slice height once it have been set. 2121 2122 The slice height must be set before MatSetUp() or MatXXXSetPreallocation() is called. 2123 2124 Level: intermediate 2125 2126 .seealso: `MATSEQSELL`, `MatSeqSELLGetVarSliceSize()` 2127 @*/ 2128 PetscErrorCode MatSeqSELLSetSliceHeight(Mat A, PetscInt sliceheight) 2129 { 2130 PetscFunctionBegin; 2131 PetscUseMethod(A, "MatSeqSELLSetSliceHeight_C", (Mat, PetscInt), (A, sliceheight)); 2132 PetscFunctionReturn(PETSC_SUCCESS); 2133 } 2134 2135 /*@ 2136 MatSeqSELLGetVarSliceSize - returns the variance of the slice size. 2137 2138 Not Collective 2139 2140 Input Parameter: 2141 . A - a MATSEQSELL matrix 2142 2143 Output Parameter: 2144 . variance - variance of the slice size 2145 2146 Level: intermediate 2147 2148 .seealso: `MATSEQSELL`, `MatSeqSELLSetSliceHeight()` 2149 @*/ 2150 PetscErrorCode MatSeqSELLGetVarSliceSize(Mat A, PetscReal *variance) 2151 { 2152 PetscFunctionBegin; 2153 PetscUseMethod(A, "MatSeqSELLGetVarSliceSize_C", (Mat, PetscReal *), (A, variance)); 2154 PetscFunctionReturn(PETSC_SUCCESS); 2155 } 2156 2157 #if defined(PETSC_HAVE_CUDA) 2158 PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat); 2159 #endif 2160 #if defined(PETSC_HAVE_HIP) 2161 PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLHIP(Mat); 2162 #endif 2163 2164 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B) 2165 { 2166 Mat_SeqSELL *b; 2167 PetscMPIInt size; 2168 2169 PetscFunctionBegin; 2170 PetscCall(PetscCitationsRegister(citation, &cited)); 2171 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size)); 2172 PetscCheck(size <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Comm must be of size 1"); 2173 2174 PetscCall(PetscNew(&b)); 2175 2176 B->data = (void *)b; 2177 B->ops[0] = MatOps_Values; 2178 2179 b->row = NULL; 2180 b->col = NULL; 2181 b->icol = NULL; 2182 b->reallocs = 0; 2183 b->ignorezeroentries = PETSC_FALSE; 2184 b->roworiented = PETSC_TRUE; 2185 b->nonew = 0; 2186 b->diag = NULL; 2187 b->solve_work = NULL; 2188 B->spptr = NULL; 2189 b->saved_values = NULL; 2190 b->idiag = NULL; 2191 b->mdiag = NULL; 2192 b->ssor_work = NULL; 2193 b->omega = 1.0; 2194 b->fshift = 0.0; 2195 b->idiagvalid = PETSC_FALSE; 2196 b->keepnonzeropattern = PETSC_FALSE; 2197 b->sliceheight = 0; 2198 2199 PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELL)); 2200 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetArray_C", MatSeqSELLGetArray_SeqSELL)); 2201 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLRestoreArray_C", MatSeqSELLRestoreArray_SeqSELL)); 2202 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqSELL)); 2203 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqSELL)); 2204 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetPreallocation_C", MatSeqSELLSetPreallocation_SeqSELL)); 2205 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqaij_C", MatConvert_SeqSELL_SeqAIJ)); 2206 #if defined(PETSC_HAVE_CUDA) 2207 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqsellcuda_C", MatConvert_SeqSELL_SeqSELLCUDA)); 2208 #endif 2209 #if defined(PETSC_HAVE_HIP) 2210 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqsellhip_C", MatConvert_SeqSELL_SeqSELLHIP)); 2211 #endif 2212 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetFillRatio_C", MatSeqSELLGetFillRatio_SeqSELL)); 2213 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetMaxSliceWidth_C", MatSeqSELLGetMaxSliceWidth_SeqSELL)); 2214 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetAvgSliceWidth_C", MatSeqSELLGetAvgSliceWidth_SeqSELL)); 2215 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetVarSliceSize_C", MatSeqSELLGetVarSliceSize_SeqSELL)); 2216 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetSliceHeight_C", MatSeqSELLSetSliceHeight_SeqSELL)); 2217 2218 PetscObjectOptionsBegin((PetscObject)B); 2219 { 2220 PetscInt newsh = -1; 2221 PetscBool flg; 2222 #if defined(PETSC_HAVE_CUPM) 2223 PetscInt chunksize = 0; 2224 #endif 2225 2226 PetscCall(PetscOptionsInt("-mat_sell_slice_height", "Set the slice height used to store SELL matrix", "MatSELLSetSliceHeight", newsh, &newsh, &flg)); 2227 if (flg) { PetscCall(MatSeqSELLSetSliceHeight(B, newsh)); } 2228 #if defined(PETSC_HAVE_CUPM) 2229 PetscCall(PetscOptionsInt("-mat_sell_chunk_size", "Set the chunksize for load-balanced CUDA/HIP kernels. Choices include 64,128,256,512,1024", NULL, chunksize, &chunksize, &flg)); 2230 if (flg) { 2231 PetscCheck(chunksize >= 64 && chunksize <= 1024 && chunksize % 64 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "chunksize must be a number in {64,128,256,512,1024}: value %" PetscInt_FMT, chunksize); 2232 b->chunksize = chunksize; 2233 } 2234 #endif 2235 } 2236 PetscOptionsEnd(); 2237 PetscFunctionReturn(PETSC_SUCCESS); 2238 } 2239 2240 /* 2241 Given a matrix generated with MatGetFactor() duplicates all the information in A into B 2242 */ 2243 static PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace) 2244 { 2245 Mat_SeqSELL *c = (Mat_SeqSELL *)C->data, *a = (Mat_SeqSELL *)A->data; 2246 PetscInt i, m = A->rmap->n; 2247 PetscInt totalslices = a->totalslices; 2248 2249 PetscFunctionBegin; 2250 C->factortype = A->factortype; 2251 c->row = NULL; 2252 c->col = NULL; 2253 c->icol = NULL; 2254 c->reallocs = 0; 2255 C->assembled = PETSC_TRUE; 2256 2257 PetscCall(PetscLayoutReference(A->rmap, &C->rmap)); 2258 PetscCall(PetscLayoutReference(A->cmap, &C->cmap)); 2259 2260 c->sliceheight = a->sliceheight; 2261 PetscCall(PetscMalloc1(c->sliceheight * totalslices, &c->rlen)); 2262 PetscCall(PetscMalloc1(totalslices + 1, &c->sliidx)); 2263 2264 for (i = 0; i < m; i++) c->rlen[i] = a->rlen[i]; 2265 for (i = 0; i < totalslices + 1; i++) c->sliidx[i] = a->sliidx[i]; 2266 2267 /* allocate the matrix space */ 2268 if (mallocmatspace) { 2269 PetscCall(PetscMalloc2(a->maxallocmat, &c->val, a->maxallocmat, &c->colidx)); 2270 2271 c->singlemalloc = PETSC_TRUE; 2272 2273 if (m > 0) { 2274 PetscCall(PetscArraycpy(c->colidx, a->colidx, a->maxallocmat)); 2275 if (cpvalues == MAT_COPY_VALUES) { 2276 PetscCall(PetscArraycpy(c->val, a->val, a->maxallocmat)); 2277 } else { 2278 PetscCall(PetscArrayzero(c->val, a->maxallocmat)); 2279 } 2280 } 2281 } 2282 2283 c->ignorezeroentries = a->ignorezeroentries; 2284 c->roworiented = a->roworiented; 2285 c->nonew = a->nonew; 2286 if (a->diag) { 2287 PetscCall(PetscMalloc1(m, &c->diag)); 2288 for (i = 0; i < m; i++) c->diag[i] = a->diag[i]; 2289 } else c->diag = NULL; 2290 2291 c->solve_work = NULL; 2292 c->saved_values = NULL; 2293 c->idiag = NULL; 2294 c->ssor_work = NULL; 2295 c->keepnonzeropattern = a->keepnonzeropattern; 2296 c->free_val = PETSC_TRUE; 2297 c->free_colidx = PETSC_TRUE; 2298 2299 c->maxallocmat = a->maxallocmat; 2300 c->maxallocrow = a->maxallocrow; 2301 c->rlenmax = a->rlenmax; 2302 c->nz = a->nz; 2303 C->preallocated = PETSC_TRUE; 2304 2305 c->nonzerorowcnt = a->nonzerorowcnt; 2306 C->nonzerostate = A->nonzerostate; 2307 2308 PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist)); 2309 PetscFunctionReturn(PETSC_SUCCESS); 2310 } 2311 2312 PetscErrorCode MatDuplicate_SeqSELL(Mat A, MatDuplicateOption cpvalues, Mat *B) 2313 { 2314 PetscFunctionBegin; 2315 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 2316 PetscCall(MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n)); 2317 if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 2318 PetscCall(MatSetType(*B, ((PetscObject)A)->type_name)); 2319 PetscCall(MatDuplicateNoCreate_SeqSELL(*B, A, cpvalues, PETSC_TRUE)); 2320 PetscFunctionReturn(PETSC_SUCCESS); 2321 } 2322 2323 /*MC 2324 MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices, 2325 based on the sliced Ellpack format, {cite}`zhangellpack2018` 2326 2327 Options Database Key: 2328 . -mat_type seqsell - sets the matrix type to "`MATSEQELL` during a call to `MatSetFromOptions()` 2329 2330 Level: beginner 2331 2332 .seealso: `Mat`, `MatCreateSeqSELL()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ` 2333 M*/ 2334 2335 /*MC 2336 MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices, {cite}`zhangellpack2018` 2337 2338 This matrix type is identical to `MATSEQSELL` when constructed with a single process communicator, 2339 and `MATMPISELL` otherwise. As a result, for single process communicators, 2340 `MatSeqSELLSetPreallocation()` is supported, and similarly `MatMPISELLSetPreallocation()` is supported 2341 for communicators controlling multiple processes. It is recommended that you call both of 2342 the above preallocation routines for simplicity. 2343 2344 Options Database Key: 2345 . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions() 2346 2347 Level: beginner 2348 2349 Notes: 2350 This format is only supported for real scalars, double precision, and 32-bit indices (the defaults). 2351 2352 It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of 2353 non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement. 2354 2355 Developer Notes: 2356 On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance. 2357 2358 The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8 2359 .vb 2360 (2 0 3 4) 2361 Consider the matrix A = (5 0 6 0) 2362 (0 0 7 8) 2363 (0 0 9 9) 2364 2365 symbolically the Ellpack format can be written as 2366 2367 (2 3 4 |) (0 2 3 |) 2368 v = (5 6 0 |) colidx = (0 2 2 |) 2369 -------- --------- 2370 (7 8 |) (2 3 |) 2371 (9 9 |) (2 3 |) 2372 2373 The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case). 2374 Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and 2375 zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice. 2376 2377 The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9) and for colidx is (0 0 2 2 3 2 2 2 3 3) 2378 2379 .ve 2380 2381 See `MatMult_SeqSELL()` for how this format is used with the SIMD operations to achieve high performance. 2382 2383 .seealso: `Mat`, `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSELL()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ` 2384 M*/ 2385 2386 /*@ 2387 MatCreateSeqSELL - Creates a sparse matrix in `MATSEQSELL` format. 2388 2389 Collective 2390 2391 Input Parameters: 2392 + comm - MPI communicator, set to `PETSC_COMM_SELF` 2393 . m - number of rows 2394 . n - number of columns 2395 . rlenmax - maximum number of nonzeros in a row, ignored if `rlen` is provided 2396 - rlen - array containing the number of nonzeros in the various rows (possibly different for each row) or NULL 2397 2398 Output Parameter: 2399 . A - the matrix 2400 2401 Level: intermediate 2402 2403 Notes: 2404 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 2405 MatXXXXSetPreallocation() paradigm instead of this routine directly. 2406 [MatXXXXSetPreallocation() is, for example, `MatSeqSELLSetPreallocation()`] 2407 2408 Specify the preallocated storage with either `rlenmax` or `rlen` (not both). 2409 Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory 2410 allocation. 2411 2412 .seealso: `Mat`, `MATSEQSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATMPISELL` 2413 @*/ 2414 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt rlenmax, const PetscInt rlen[], Mat *A) 2415 { 2416 PetscFunctionBegin; 2417 PetscCall(MatCreate(comm, A)); 2418 PetscCall(MatSetSizes(*A, m, n, m, n)); 2419 PetscCall(MatSetType(*A, MATSEQSELL)); 2420 PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A, rlenmax, rlen)); 2421 PetscFunctionReturn(PETSC_SUCCESS); 2422 } 2423 2424 PetscErrorCode MatEqual_SeqSELL(Mat A, Mat B, PetscBool *flg) 2425 { 2426 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data, *b = (Mat_SeqSELL *)B->data; 2427 PetscInt totalslices = a->totalslices; 2428 2429 PetscFunctionBegin; 2430 /* If the matrix dimensions are not equal,or no of nonzeros */ 2431 if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) || (a->nz != b->nz) || (a->rlenmax != b->rlenmax)) { 2432 *flg = PETSC_FALSE; 2433 PetscFunctionReturn(PETSC_SUCCESS); 2434 } 2435 /* if the a->colidx are the same */ 2436 PetscCall(PetscArraycmp(a->colidx, b->colidx, a->sliidx[totalslices], flg)); 2437 if (!*flg) PetscFunctionReturn(PETSC_SUCCESS); 2438 /* if a->val are the same */ 2439 PetscCall(PetscArraycmp(a->val, b->val, a->sliidx[totalslices], flg)); 2440 PetscFunctionReturn(PETSC_SUCCESS); 2441 } 2442 2443 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A) 2444 { 2445 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 2446 2447 PetscFunctionBegin; 2448 a->idiagvalid = PETSC_FALSE; 2449 PetscFunctionReturn(PETSC_SUCCESS); 2450 } 2451 2452 PetscErrorCode MatConjugate_SeqSELL(Mat A) 2453 { 2454 #if defined(PETSC_USE_COMPLEX) 2455 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 2456 PetscInt i; 2457 PetscScalar *val = a->val; 2458 2459 PetscFunctionBegin; 2460 for (i = 0; i < a->sliidx[a->totalslices]; i++) { val[i] = PetscConj(val[i]); } 2461 #if defined(PETSC_HAVE_CUPM) 2462 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU; 2463 #endif 2464 #else 2465 PetscFunctionBegin; 2466 #endif 2467 PetscFunctionReturn(PETSC_SUCCESS); 2468 } 2469