1 /* 2 Defines the basic matrix operations for the SELL matrix storage format. 3 */ 4 #include <../src/mat/impls/sell/seq/sell.h> /*I "petscmat.h" I*/ 5 #include <petscblaslapack.h> 6 #include <petsc/private/kernels/blocktranspose.h> 7 8 static PetscBool cited = PETSC_FALSE; 9 static const char citation[] = "@inproceedings{ZhangELLPACK2018,\n" 10 " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n" 11 " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n" 12 " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n" 13 " year = 2018\n" 14 "}\n"; 15 16 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 17 18 #include <immintrin.h> 19 20 #if !defined(_MM_SCALE_8) 21 #define _MM_SCALE_8 8 22 #endif 23 24 #if defined(__AVX512F__) 25 /* these do not work 26 vec_idx = _mm512_loadunpackhi_epi32(vec_idx,acolidx); 27 vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval); 28 */ 29 #define AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \ 30 /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \ 31 vec_idx = _mm256_loadu_si256((__m256i const *)acolidx); \ 32 vec_vals = _mm512_loadu_pd(aval); \ 33 vec_x = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); \ 34 vec_y = _mm512_fmadd_pd(vec_x, vec_vals, vec_y) 35 #elif defined(__AVX2__) && defined(__FMA__) 36 #define AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \ 37 vec_vals = _mm256_loadu_pd(aval); \ 38 vec_idx = _mm_loadu_si128((__m128i const *)acolidx); /* SSE2 */ \ 39 vec_x = _mm256_i32gather_pd(x, vec_idx, _MM_SCALE_8); \ 40 vec_y = _mm256_fmadd_pd(vec_x, vec_vals, vec_y) 41 #endif 42 #endif /* PETSC_HAVE_IMMINTRIN_H */ 43 44 /*@ 45 MatSeqSELLSetPreallocation - For good matrix assembly performance 46 the user should preallocate the matrix storage by setting the parameter `nz` 47 (or the array `nnz`). 48 49 Collective 50 51 Input Parameters: 52 + B - The `MATSEQSELL` matrix 53 . rlenmax - number of nonzeros per row (same for all rows), ignored if `rlen` is provided 54 - rlen - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 55 56 Level: intermediate 57 58 Notes: 59 Specify the preallocated storage with either `rlenmax` or `rlen` (not both). 60 Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory 61 allocation. 62 63 You can call `MatGetInfo()` to get information on how effective the preallocation was; 64 for example the fields mallocs,nz_allocated,nz_used,nz_unneeded; 65 You can also run with the option `-info` and look for messages with the string 66 malloc in them to see if additional memory allocation was needed. 67 68 Developer Notes: 69 Use `rlenmax` of `MAT_SKIP_ALLOCATION` to not allocate any space for the matrix 70 entries or columns indices. 71 72 The maximum number of nonzeos in any row should be as accurate as possible. 73 If it is underestimated, you will get bad performance due to reallocation 74 (`MatSeqXSELLReallocateSELL()`). 75 76 .seealso: `Mat`, `MATSEQSELL`, `MATSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()` 77 @*/ 78 PetscErrorCode MatSeqSELLSetPreallocation(Mat B, PetscInt rlenmax, const PetscInt rlen[]) 79 { 80 PetscFunctionBegin; 81 PetscValidHeaderSpecific(B, MAT_CLASSID, 1); 82 PetscValidType(B, 1); 83 PetscTryMethod(B, "MatSeqSELLSetPreallocation_C", (Mat, PetscInt, const PetscInt[]), (B, rlenmax, rlen)); 84 PetscFunctionReturn(PETSC_SUCCESS); 85 } 86 87 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B, PetscInt maxallocrow, const PetscInt rlen[]) 88 { 89 Mat_SeqSELL *b; 90 PetscInt i, j, totalslices; 91 #if defined(PETSC_HAVE_CUPM) 92 PetscInt rlenmax = 0; 93 #endif 94 PetscBool skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE; 95 96 PetscFunctionBegin; 97 if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE; 98 if (maxallocrow == MAT_SKIP_ALLOCATION) { 99 skipallocation = PETSC_TRUE; 100 maxallocrow = 0; 101 } 102 103 PetscCall(PetscLayoutSetUp(B->rmap)); 104 PetscCall(PetscLayoutSetUp(B->cmap)); 105 106 /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */ 107 if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5; 108 PetscCheck(maxallocrow >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "maxallocrow cannot be less than 0: value %" PetscInt_FMT, maxallocrow); 109 if (rlen) { 110 for (i = 0; i < B->rmap->n; i++) { 111 PetscCheck(rlen[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, rlen[i]); 112 PetscCheck(rlen[i] <= B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT, i, rlen[i], B->cmap->n); 113 } 114 } 115 116 B->preallocated = PETSC_TRUE; 117 118 b = (Mat_SeqSELL *)B->data; 119 120 if (!b->sliceheight) { /* not set yet */ 121 #if defined(PETSC_HAVE_CUPM) 122 b->sliceheight = 16; 123 #else 124 b->sliceheight = 8; 125 #endif 126 } 127 totalslices = PetscCeilInt(B->rmap->n, b->sliceheight); 128 b->totalslices = totalslices; 129 if (!skipallocation) { 130 if (B->rmap->n % b->sliceheight) PetscCall(PetscInfo(B, "Padding rows to the SEQSELL matrix because the number of rows is not the multiple of the slice height (value %" PetscInt_FMT ")\n", B->rmap->n)); 131 132 if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */ 133 PetscCall(PetscMalloc1(totalslices + 1, &b->sliidx)); 134 } 135 if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */ 136 if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10; 137 else if (maxallocrow < 0) maxallocrow = 1; 138 #if defined(PETSC_HAVE_CUPM) 139 rlenmax = maxallocrow; 140 /* Pad the slice to DEVICE_MEM_ALIGN */ 141 while (b->sliceheight * maxallocrow % DEVICE_MEM_ALIGN) maxallocrow++; 142 #endif 143 for (i = 0; i <= totalslices; i++) b->sliidx[i] = b->sliceheight * i * maxallocrow; 144 } else { 145 #if defined(PETSC_HAVE_CUPM) 146 PetscInt mul = DEVICE_MEM_ALIGN / b->sliceheight; 147 #endif 148 maxallocrow = 0; 149 b->sliidx[0] = 0; 150 for (i = 1; i < totalslices; i++) { 151 b->sliidx[i] = 0; 152 for (j = 0; j < b->sliceheight; j++) { b->sliidx[i] = PetscMax(b->sliidx[i], rlen[b->sliceheight * (i - 1) + j]); } 153 #if defined(PETSC_HAVE_CUPM) 154 if (mul != 0) { /* Pad the slice to DEVICE_MEM_ALIGN if sliceheight < DEVICE_MEM_ALIGN */ 155 rlenmax = PetscMax(b->sliidx[i], rlenmax); 156 b->sliidx[i] = ((b->sliidx[i] - 1) / mul + 1) * mul; 157 } 158 #endif 159 maxallocrow = PetscMax(b->sliidx[i], maxallocrow); 160 PetscCall(PetscIntSumError(b->sliidx[i - 1], b->sliceheight * b->sliidx[i], &b->sliidx[i])); 161 } 162 /* last slice */ 163 b->sliidx[totalslices] = 0; 164 for (j = b->sliceheight * (totalslices - 1); j < B->rmap->n; j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices], rlen[j]); 165 #if defined(PETSC_HAVE_CUPM) 166 if (mul != 0) { 167 rlenmax = PetscMax(b->sliidx[i], rlenmax); 168 b->sliidx[totalslices] = ((b->sliidx[totalslices] - 1) / mul + 1) * mul; 169 } 170 #endif 171 maxallocrow = PetscMax(b->sliidx[totalslices], maxallocrow); 172 b->sliidx[totalslices] = b->sliidx[totalslices - 1] + b->sliceheight * b->sliidx[totalslices]; 173 } 174 175 /* allocate space for val, colidx, rlen */ 176 /* FIXME: should B's old memory be unlogged? */ 177 PetscCall(MatSeqXSELLFreeSELL(B, &b->val, &b->colidx)); 178 /* FIXME: assuming an element of the bit array takes 8 bits */ 179 PetscCall(PetscMalloc2(b->sliidx[totalslices], &b->val, b->sliidx[totalslices], &b->colidx)); 180 /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */ 181 PetscCall(PetscCalloc1(b->sliceheight * totalslices, &b->rlen)); 182 183 b->singlemalloc = PETSC_TRUE; 184 b->free_val = PETSC_TRUE; 185 b->free_colidx = PETSC_TRUE; 186 } else { 187 b->free_val = PETSC_FALSE; 188 b->free_colidx = PETSC_FALSE; 189 } 190 191 b->nz = 0; 192 b->maxallocrow = maxallocrow; 193 #if defined(PETSC_HAVE_CUPM) 194 b->rlenmax = rlenmax; 195 #else 196 b->rlenmax = maxallocrow; 197 #endif 198 b->maxallocmat = b->sliidx[totalslices]; 199 B->info.nz_unneeded = (double)b->maxallocmat; 200 if (realalloc) PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE)); 201 PetscFunctionReturn(PETSC_SUCCESS); 202 } 203 204 static PetscErrorCode MatGetRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v) 205 { 206 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 207 PetscInt shift; 208 209 PetscFunctionBegin; 210 PetscCheck(row >= 0 && row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range", row); 211 if (nz) *nz = a->rlen[row]; 212 shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight); 213 if (!a->getrowcols) { PetscCall(PetscMalloc2(a->rlenmax, &a->getrowcols, a->rlenmax, &a->getrowvals)); } 214 if (idx) { 215 PetscInt j; 216 for (j = 0; j < a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift + a->sliceheight * j]; 217 *idx = a->getrowcols; 218 } 219 if (v) { 220 PetscInt j; 221 for (j = 0; j < a->rlen[row]; j++) a->getrowvals[j] = a->val[shift + a->sliceheight * j]; 222 *v = a->getrowvals; 223 } 224 PetscFunctionReturn(PETSC_SUCCESS); 225 } 226 227 static PetscErrorCode MatRestoreRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v) 228 { 229 PetscFunctionBegin; 230 PetscFunctionReturn(PETSC_SUCCESS); 231 } 232 233 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat) 234 { 235 Mat B; 236 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 237 PetscInt i; 238 239 PetscFunctionBegin; 240 if (reuse == MAT_REUSE_MATRIX) { 241 B = *newmat; 242 PetscCall(MatZeroEntries(B)); 243 } else { 244 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B)); 245 PetscCall(MatSetSizes(B, A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N)); 246 PetscCall(MatSetType(B, MATSEQAIJ)); 247 PetscCall(MatSeqAIJSetPreallocation(B, 0, a->rlen)); 248 } 249 250 for (i = 0; i < A->rmap->n; i++) { 251 PetscInt nz = 0, *cols = NULL; 252 PetscScalar *vals = NULL; 253 254 PetscCall(MatGetRow_SeqSELL(A, i, &nz, &cols, &vals)); 255 PetscCall(MatSetValues(B, 1, &i, nz, cols, vals, INSERT_VALUES)); 256 PetscCall(MatRestoreRow_SeqSELL(A, i, &nz, &cols, &vals)); 257 } 258 259 PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY)); 260 PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY)); 261 B->rmap->bs = A->rmap->bs; 262 263 if (reuse == MAT_INPLACE_MATRIX) { 264 PetscCall(MatHeaderReplace(A, &B)); 265 } else { 266 *newmat = B; 267 } 268 PetscFunctionReturn(PETSC_SUCCESS); 269 } 270 271 #include <../src/mat/impls/aij/seq/aij.h> 272 273 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A, MatType newtype, MatReuse reuse, Mat *newmat) 274 { 275 Mat B; 276 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 277 PetscInt *ai = a->i, m = A->rmap->N, n = A->cmap->N, i, *rowlengths, row, ncols; 278 const PetscInt *cols; 279 const PetscScalar *vals; 280 281 PetscFunctionBegin; 282 if (reuse == MAT_REUSE_MATRIX) { 283 B = *newmat; 284 } else { 285 if (PetscDefined(USE_DEBUG) || !a->ilen) { 286 PetscCall(PetscMalloc1(m, &rowlengths)); 287 for (i = 0; i < m; i++) rowlengths[i] = ai[i + 1] - ai[i]; 288 } 289 if (PetscDefined(USE_DEBUG) && a->ilen) { 290 PetscBool eq; 291 PetscCall(PetscMemcmp(rowlengths, a->ilen, m * sizeof(PetscInt), &eq)); 292 PetscCheck(eq, PETSC_COMM_SELF, PETSC_ERR_PLIB, "SeqAIJ ilen array incorrect"); 293 PetscCall(PetscFree(rowlengths)); 294 rowlengths = a->ilen; 295 } else if (a->ilen) rowlengths = a->ilen; 296 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B)); 297 PetscCall(MatSetSizes(B, m, n, m, n)); 298 PetscCall(MatSetType(B, MATSEQSELL)); 299 PetscCall(MatSeqSELLSetPreallocation(B, 0, rowlengths)); 300 if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths)); 301 } 302 303 for (row = 0; row < m; row++) { 304 PetscCall(MatGetRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals)); 305 PetscCall(MatSetValues_SeqSELL(B, 1, &row, ncols, cols, vals, INSERT_VALUES)); 306 PetscCall(MatRestoreRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals)); 307 } 308 PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY)); 309 PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY)); 310 B->rmap->bs = A->rmap->bs; 311 312 if (reuse == MAT_INPLACE_MATRIX) { 313 PetscCall(MatHeaderReplace(A, &B)); 314 } else { 315 *newmat = B; 316 } 317 PetscFunctionReturn(PETSC_SUCCESS); 318 } 319 320 PetscErrorCode MatMult_SeqSELL(Mat A, Vec xx, Vec yy) 321 { 322 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 323 PetscScalar *y; 324 const PetscScalar *x; 325 const MatScalar *aval = a->val; 326 PetscInt totalslices = a->totalslices; 327 const PetscInt *acolidx = a->colidx; 328 PetscInt i, j; 329 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 330 __m512d vec_x, vec_y, vec_vals; 331 __m256i vec_idx; 332 __mmask8 mask; 333 __m512d vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4; 334 __m256i vec_idx2, vec_idx3, vec_idx4; 335 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 336 __m128i vec_idx; 337 __m256d vec_x, vec_y, vec_y2, vec_vals; 338 MatScalar yval; 339 PetscInt r, rows_left, row, nnz_in_row; 340 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 341 __m128d vec_x_tmp; 342 __m256d vec_x, vec_y, vec_y2, vec_vals; 343 MatScalar yval; 344 PetscInt r, rows_left, row, nnz_in_row; 345 #else 346 PetscInt k, sliceheight = a->sliceheight; 347 PetscScalar *sum; 348 #endif 349 350 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 351 #pragma disjoint(*x, *y, *aval) 352 #endif 353 354 PetscFunctionBegin; 355 PetscCall(VecGetArrayRead(xx, &x)); 356 PetscCall(VecGetArray(yy, &y)); 357 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 358 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 359 for (i = 0; i < totalslices; i++) { /* loop over slices */ 360 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 361 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 362 363 vec_y = _mm512_setzero_pd(); 364 vec_y2 = _mm512_setzero_pd(); 365 vec_y3 = _mm512_setzero_pd(); 366 vec_y4 = _mm512_setzero_pd(); 367 368 j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */ 369 switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) { 370 case 3: 371 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 372 acolidx += 8; 373 aval += 8; 374 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 375 acolidx += 8; 376 aval += 8; 377 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 378 acolidx += 8; 379 aval += 8; 380 j += 3; 381 break; 382 case 2: 383 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 384 acolidx += 8; 385 aval += 8; 386 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 387 acolidx += 8; 388 aval += 8; 389 j += 2; 390 break; 391 case 1: 392 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 393 acolidx += 8; 394 aval += 8; 395 j += 1; 396 break; 397 } 398 #pragma novector 399 for (; j < (a->sliidx[i + 1] >> 3); j += 4) { 400 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 401 acolidx += 8; 402 aval += 8; 403 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 404 acolidx += 8; 405 aval += 8; 406 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 407 acolidx += 8; 408 aval += 8; 409 AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4); 410 acolidx += 8; 411 aval += 8; 412 } 413 414 vec_y = _mm512_add_pd(vec_y, vec_y2); 415 vec_y = _mm512_add_pd(vec_y, vec_y3); 416 vec_y = _mm512_add_pd(vec_y, vec_y4); 417 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 418 mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07))); 419 _mm512_mask_storeu_pd(&y[8 * i], mask, vec_y); 420 } else { 421 _mm512_storeu_pd(&y[8 * i], vec_y); 422 } 423 } 424 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 425 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 426 for (i = 0; i < totalslices; i++) { /* loop over full slices */ 427 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 428 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 429 430 /* last slice may have padding rows. Don't use vectorization. */ 431 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 432 rows_left = A->rmap->n - 8 * i; 433 for (r = 0; r < rows_left; ++r) { 434 yval = (MatScalar)0; 435 row = 8 * i + r; 436 nnz_in_row = a->rlen[row]; 437 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]]; 438 y[row] = yval; 439 } 440 break; 441 } 442 443 vec_y = _mm256_setzero_pd(); 444 vec_y2 = _mm256_setzero_pd(); 445 446 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 447 #pragma novector 448 #pragma unroll(2) 449 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 450 AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 451 aval += 4; 452 acolidx += 4; 453 AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y2); 454 aval += 4; 455 acolidx += 4; 456 } 457 458 _mm256_storeu_pd(y + i * 8, vec_y); 459 _mm256_storeu_pd(y + i * 8 + 4, vec_y2); 460 } 461 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 462 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 463 for (i = 0; i < totalslices; i++) { /* loop over full slices */ 464 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 465 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 466 467 vec_y = _mm256_setzero_pd(); 468 vec_y2 = _mm256_setzero_pd(); 469 470 /* last slice may have padding rows. Don't use vectorization. */ 471 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 472 rows_left = A->rmap->n - 8 * i; 473 for (r = 0; r < rows_left; ++r) { 474 yval = (MatScalar)0; 475 row = 8 * i + r; 476 nnz_in_row = a->rlen[row]; 477 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]]; 478 y[row] = yval; 479 } 480 break; 481 } 482 483 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 484 #pragma novector 485 #pragma unroll(2) 486 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 487 vec_vals = _mm256_loadu_pd(aval); 488 vec_x_tmp = _mm_setzero_pd(); 489 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 490 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 491 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 492 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 493 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 494 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 495 vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y); 496 aval += 4; 497 498 vec_vals = _mm256_loadu_pd(aval); 499 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 500 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 501 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 502 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 503 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 504 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 505 vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2); 506 aval += 4; 507 } 508 509 _mm256_storeu_pd(y + i * 8, vec_y); 510 _mm256_storeu_pd(y + i * 8 + 4, vec_y2); 511 } 512 #else 513 PetscCall(PetscMalloc1(sliceheight, &sum)); 514 for (i = 0; i < totalslices; i++) { /* loop over slices */ 515 for (j = 0; j < sliceheight; j++) { 516 sum[j] = 0.0; 517 for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]]; 518 } 519 if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { /* if last slice has padding rows */ 520 for (j = 0; j < (A->rmap->n % sliceheight); j++) y[sliceheight * i + j] = sum[j]; 521 } else { 522 for (j = 0; j < sliceheight; j++) y[sliceheight * i + j] = sum[j]; 523 } 524 } 525 PetscCall(PetscFree(sum)); 526 #endif 527 528 PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); /* theoretical minimal FLOPs */ 529 PetscCall(VecRestoreArrayRead(xx, &x)); 530 PetscCall(VecRestoreArray(yy, &y)); 531 PetscFunctionReturn(PETSC_SUCCESS); 532 } 533 534 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h> 535 PetscErrorCode MatMultAdd_SeqSELL(Mat A, Vec xx, Vec yy, Vec zz) 536 { 537 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 538 PetscScalar *y, *z; 539 const PetscScalar *x; 540 const MatScalar *aval = a->val; 541 PetscInt totalslices = a->totalslices; 542 const PetscInt *acolidx = a->colidx; 543 PetscInt i, j; 544 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 545 __m512d vec_x, vec_y, vec_vals; 546 __m256i vec_idx; 547 __mmask8 mask = 0; 548 __m512d vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4; 549 __m256i vec_idx2, vec_idx3, vec_idx4; 550 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 551 __m128d vec_x_tmp; 552 __m256d vec_x, vec_y, vec_y2, vec_vals; 553 MatScalar yval; 554 PetscInt r, row, nnz_in_row; 555 #else 556 PetscInt k, sliceheight = a->sliceheight; 557 PetscScalar *sum; 558 #endif 559 560 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 561 #pragma disjoint(*x, *y, *aval) 562 #endif 563 564 PetscFunctionBegin; 565 if (!a->nz) { 566 PetscCall(VecCopy(yy, zz)); 567 PetscFunctionReturn(PETSC_SUCCESS); 568 } 569 PetscCall(VecGetArrayRead(xx, &x)); 570 PetscCall(VecGetArrayPair(yy, zz, &y, &z)); 571 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 572 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 573 for (i = 0; i < totalslices; i++) { /* loop over slices */ 574 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 575 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 576 577 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 578 mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07))); 579 vec_y = _mm512_mask_loadu_pd(vec_y, mask, &y[8 * i]); 580 } else { 581 vec_y = _mm512_loadu_pd(&y[8 * i]); 582 } 583 vec_y2 = _mm512_setzero_pd(); 584 vec_y3 = _mm512_setzero_pd(); 585 vec_y4 = _mm512_setzero_pd(); 586 587 j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */ 588 switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) { 589 case 3: 590 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 591 acolidx += 8; 592 aval += 8; 593 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 594 acolidx += 8; 595 aval += 8; 596 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 597 acolidx += 8; 598 aval += 8; 599 j += 3; 600 break; 601 case 2: 602 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 603 acolidx += 8; 604 aval += 8; 605 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 606 acolidx += 8; 607 aval += 8; 608 j += 2; 609 break; 610 case 1: 611 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 612 acolidx += 8; 613 aval += 8; 614 j += 1; 615 break; 616 } 617 #pragma novector 618 for (; j < (a->sliidx[i + 1] >> 3); j += 4) { 619 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 620 acolidx += 8; 621 aval += 8; 622 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 623 acolidx += 8; 624 aval += 8; 625 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 626 acolidx += 8; 627 aval += 8; 628 AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4); 629 acolidx += 8; 630 aval += 8; 631 } 632 633 vec_y = _mm512_add_pd(vec_y, vec_y2); 634 vec_y = _mm512_add_pd(vec_y, vec_y3); 635 vec_y = _mm512_add_pd(vec_y, vec_y4); 636 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 637 _mm512_mask_storeu_pd(&z[8 * i], mask, vec_y); 638 } else { 639 _mm512_storeu_pd(&z[8 * i], vec_y); 640 } 641 } 642 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 643 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 644 for (i = 0; i < totalslices; i++) { /* loop over full slices */ 645 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 646 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 647 648 /* last slice may have padding rows. Don't use vectorization. */ 649 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 650 for (r = 0; r < (A->rmap->n & 0x07); ++r) { 651 row = 8 * i + r; 652 yval = (MatScalar)0.0; 653 nnz_in_row = a->rlen[row]; 654 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]]; 655 z[row] = y[row] + yval; 656 } 657 break; 658 } 659 660 vec_y = _mm256_loadu_pd(y + 8 * i); 661 vec_y2 = _mm256_loadu_pd(y + 8 * i + 4); 662 663 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 664 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 665 vec_vals = _mm256_loadu_pd(aval); 666 vec_x_tmp = _mm_setzero_pd(); 667 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 668 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 669 vec_x = _mm256_setzero_pd(); 670 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 671 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 672 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 673 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 674 vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y); 675 aval += 4; 676 677 vec_vals = _mm256_loadu_pd(aval); 678 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 679 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 680 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 681 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 682 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 683 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 684 vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2); 685 aval += 4; 686 } 687 688 _mm256_storeu_pd(z + i * 8, vec_y); 689 _mm256_storeu_pd(z + i * 8 + 4, vec_y2); 690 } 691 #else 692 PetscCall(PetscMalloc1(sliceheight, &sum)); 693 for (i = 0; i < totalslices; i++) { /* loop over slices */ 694 for (j = 0; j < sliceheight; j++) { 695 sum[j] = 0.0; 696 for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]]; 697 } 698 if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { 699 for (j = 0; j < (A->rmap->n % sliceheight); j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j]; 700 } else { 701 for (j = 0; j < sliceheight; j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j]; 702 } 703 } 704 PetscCall(PetscFree(sum)); 705 #endif 706 707 PetscCall(PetscLogFlops(2.0 * a->nz)); 708 PetscCall(VecRestoreArrayRead(xx, &x)); 709 PetscCall(VecRestoreArrayPair(yy, zz, &y, &z)); 710 PetscFunctionReturn(PETSC_SUCCESS); 711 } 712 713 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A, Vec xx, Vec zz, Vec yy) 714 { 715 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 716 PetscScalar *y; 717 const PetscScalar *x; 718 const MatScalar *aval = a->val; 719 const PetscInt *acolidx = a->colidx; 720 PetscInt i, j, r, row, nnz_in_row, totalslices = a->totalslices, sliceheight = a->sliceheight; 721 722 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 723 #pragma disjoint(*x, *y, *aval) 724 #endif 725 726 PetscFunctionBegin; 727 if (A->symmetric == PETSC_BOOL3_TRUE) { 728 PetscCall(MatMultAdd_SeqSELL(A, xx, zz, yy)); 729 PetscFunctionReturn(PETSC_SUCCESS); 730 } 731 if (zz != yy) PetscCall(VecCopy(zz, yy)); 732 733 if (a->nz) { 734 PetscCall(VecGetArrayRead(xx, &x)); 735 PetscCall(VecGetArray(yy, &y)); 736 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 737 if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { 738 for (r = 0; r < (A->rmap->n % sliceheight); ++r) { 739 row = sliceheight * i + r; 740 nnz_in_row = a->rlen[row]; 741 for (j = 0; j < nnz_in_row; ++j) y[acolidx[sliceheight * j + r]] += aval[sliceheight * j + r] * x[row]; 742 } 743 break; 744 } 745 for (r = 0; r < sliceheight; ++r) 746 for (j = a->sliidx[i] + r; j < a->sliidx[i + 1]; j += sliceheight) y[acolidx[j]] += aval[j] * x[sliceheight * i + r]; 747 } 748 PetscCall(PetscLogFlops(2.0 * a->nz)); 749 PetscCall(VecRestoreArrayRead(xx, &x)); 750 PetscCall(VecRestoreArray(yy, &y)); 751 } 752 PetscFunctionReturn(PETSC_SUCCESS); 753 } 754 755 PetscErrorCode MatMultTranspose_SeqSELL(Mat A, Vec xx, Vec yy) 756 { 757 PetscFunctionBegin; 758 if (A->symmetric == PETSC_BOOL3_TRUE) { 759 PetscCall(MatMult_SeqSELL(A, xx, yy)); 760 } else { 761 PetscCall(VecSet(yy, 0.0)); 762 PetscCall(MatMultTransposeAdd_SeqSELL(A, xx, yy, yy)); 763 } 764 PetscFunctionReturn(PETSC_SUCCESS); 765 } 766 767 /* 768 Checks for missing diagonals 769 */ 770 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A, PetscBool *missing, PetscInt *d) 771 { 772 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 773 PetscInt *diag, i; 774 775 PetscFunctionBegin; 776 *missing = PETSC_FALSE; 777 if (A->rmap->n > 0 && !a->colidx) { 778 *missing = PETSC_TRUE; 779 if (d) *d = 0; 780 PetscCall(PetscInfo(A, "Matrix has no entries therefore is missing diagonal\n")); 781 } else { 782 diag = a->diag; 783 for (i = 0; i < A->rmap->n; i++) { 784 if (diag[i] == -1) { 785 *missing = PETSC_TRUE; 786 if (d) *d = i; 787 PetscCall(PetscInfo(A, "Matrix is missing diagonal number %" PetscInt_FMT "\n", i)); 788 break; 789 } 790 } 791 } 792 PetscFunctionReturn(PETSC_SUCCESS); 793 } 794 795 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A) 796 { 797 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 798 PetscInt i, j, m = A->rmap->n, shift; 799 800 PetscFunctionBegin; 801 if (!a->diag) { 802 PetscCall(PetscMalloc1(m, &a->diag)); 803 a->free_diag = PETSC_TRUE; 804 } 805 for (i = 0; i < m; i++) { /* loop over rows */ 806 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 807 a->diag[i] = -1; 808 for (j = 0; j < a->rlen[i]; j++) { 809 if (a->colidx[shift + a->sliceheight * j] == i) { 810 a->diag[i] = shift + a->sliceheight * j; 811 break; 812 } 813 } 814 } 815 PetscFunctionReturn(PETSC_SUCCESS); 816 } 817 818 /* 819 Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways 820 */ 821 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A, PetscScalar omega, PetscScalar fshift) 822 { 823 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 824 PetscInt i, *diag, m = A->rmap->n; 825 MatScalar *val = a->val; 826 PetscScalar *idiag, *mdiag; 827 828 PetscFunctionBegin; 829 if (a->idiagvalid) PetscFunctionReturn(PETSC_SUCCESS); 830 PetscCall(MatMarkDiagonal_SeqSELL(A)); 831 diag = a->diag; 832 if (!a->idiag) { 833 PetscCall(PetscMalloc3(m, &a->idiag, m, &a->mdiag, m, &a->ssor_work)); 834 val = a->val; 835 } 836 mdiag = a->mdiag; 837 idiag = a->idiag; 838 839 if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) { 840 for (i = 0; i < m; i++) { 841 mdiag[i] = val[diag[i]]; 842 if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */ 843 PetscCheck(PetscRealPart(fshift), PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Zero diagonal on row %" PetscInt_FMT, i); 844 PetscCall(PetscInfo(A, "Zero diagonal on row %" PetscInt_FMT "\n", i)); 845 A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 846 A->factorerror_zeropivot_value = 0.0; 847 A->factorerror_zeropivot_row = i; 848 } 849 idiag[i] = 1.0 / val[diag[i]]; 850 } 851 PetscCall(PetscLogFlops(m)); 852 } else { 853 for (i = 0; i < m; i++) { 854 mdiag[i] = val[diag[i]]; 855 idiag[i] = omega / (fshift + val[diag[i]]); 856 } 857 PetscCall(PetscLogFlops(2.0 * m)); 858 } 859 a->idiagvalid = PETSC_TRUE; 860 PetscFunctionReturn(PETSC_SUCCESS); 861 } 862 863 PetscErrorCode MatZeroEntries_SeqSELL(Mat A) 864 { 865 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 866 867 PetscFunctionBegin; 868 PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices])); 869 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 870 PetscFunctionReturn(PETSC_SUCCESS); 871 } 872 873 PetscErrorCode MatDestroy_SeqSELL(Mat A) 874 { 875 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 876 877 PetscFunctionBegin; 878 PetscCall(PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->n, A->cmap->n, a->nz)); 879 PetscCall(MatSeqXSELLFreeSELL(A, &a->val, &a->colidx)); 880 PetscCall(ISDestroy(&a->row)); 881 PetscCall(ISDestroy(&a->col)); 882 PetscCall(PetscFree(a->diag)); 883 PetscCall(PetscFree(a->rlen)); 884 PetscCall(PetscFree(a->sliidx)); 885 PetscCall(PetscFree3(a->idiag, a->mdiag, a->ssor_work)); 886 PetscCall(PetscFree(a->solve_work)); 887 PetscCall(ISDestroy(&a->icol)); 888 PetscCall(PetscFree(a->saved_values)); 889 PetscCall(PetscFree2(a->getrowcols, a->getrowvals)); 890 PetscCall(PetscFree(A->data)); 891 #if defined(PETSC_HAVE_CUPM) 892 PetscCall(PetscFree(a->chunk_slice_map)); 893 #endif 894 895 PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL)); 896 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL)); 897 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL)); 898 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetPreallocation_C", NULL)); 899 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetArray_C", NULL)); 900 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLRestoreArray_C", NULL)); 901 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqaij_C", NULL)); 902 #if defined(PETSC_HAVE_CUDA) 903 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqsellcuda_C", NULL)); 904 #endif 905 #if defined(PETSC_HAVE_HIP) 906 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqsellhip_C", NULL)); 907 #endif 908 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetFillRatio_C", NULL)); 909 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetMaxSliceWidth_C", NULL)); 910 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetAvgSliceWidth_C", NULL)); 911 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetVarSliceSize_C", NULL)); 912 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetSliceHeight_C", NULL)); 913 PetscFunctionReturn(PETSC_SUCCESS); 914 } 915 916 PetscErrorCode MatSetOption_SeqSELL(Mat A, MatOption op, PetscBool flg) 917 { 918 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 919 920 PetscFunctionBegin; 921 switch (op) { 922 case MAT_ROW_ORIENTED: 923 a->roworiented = flg; 924 break; 925 case MAT_KEEP_NONZERO_PATTERN: 926 a->keepnonzeropattern = flg; 927 break; 928 case MAT_NEW_NONZERO_LOCATIONS: 929 a->nonew = (flg ? 0 : 1); 930 break; 931 case MAT_NEW_NONZERO_LOCATION_ERR: 932 a->nonew = (flg ? -1 : 0); 933 break; 934 case MAT_NEW_NONZERO_ALLOCATION_ERR: 935 a->nonew = (flg ? -2 : 0); 936 break; 937 case MAT_UNUSED_NONZERO_LOCATION_ERR: 938 a->nounused = (flg ? -1 : 0); 939 break; 940 default: 941 break; 942 } 943 PetscFunctionReturn(PETSC_SUCCESS); 944 } 945 946 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A, Vec v) 947 { 948 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 949 PetscInt i, j, n, shift; 950 PetscScalar *x, zero = 0.0; 951 952 PetscFunctionBegin; 953 PetscCall(VecGetLocalSize(v, &n)); 954 PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 955 956 if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) { 957 PetscInt *diag = a->diag; 958 PetscCall(VecGetArray(v, &x)); 959 for (i = 0; i < n; i++) x[i] = 1.0 / a->val[diag[i]]; 960 PetscCall(VecRestoreArray(v, &x)); 961 PetscFunctionReturn(PETSC_SUCCESS); 962 } 963 964 PetscCall(VecSet(v, zero)); 965 PetscCall(VecGetArray(v, &x)); 966 for (i = 0; i < n; i++) { /* loop over rows */ 967 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 968 x[i] = 0; 969 for (j = 0; j < a->rlen[i]; j++) { 970 if (a->colidx[shift + a->sliceheight * j] == i) { 971 x[i] = a->val[shift + a->sliceheight * j]; 972 break; 973 } 974 } 975 } 976 PetscCall(VecRestoreArray(v, &x)); 977 PetscFunctionReturn(PETSC_SUCCESS); 978 } 979 980 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A, Vec ll, Vec rr) 981 { 982 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 983 const PetscScalar *l, *r; 984 PetscInt i, j, m, n, row; 985 986 PetscFunctionBegin; 987 if (ll) { 988 /* The local size is used so that VecMPI can be passed to this routine 989 by MatDiagonalScale_MPISELL */ 990 PetscCall(VecGetLocalSize(ll, &m)); 991 PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 992 PetscCall(VecGetArrayRead(ll, &l)); 993 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 994 if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */ 995 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) { 996 if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= l[a->sliceheight * i + row]; 997 } 998 } else { 999 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) { a->val[j] *= l[a->sliceheight * i + row]; } 1000 } 1001 } 1002 PetscCall(VecRestoreArrayRead(ll, &l)); 1003 PetscCall(PetscLogFlops(a->nz)); 1004 } 1005 if (rr) { 1006 PetscCall(VecGetLocalSize(rr, &n)); 1007 PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 1008 PetscCall(VecGetArrayRead(rr, &r)); 1009 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 1010 if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */ 1011 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) % a->sliceheight)) { 1012 if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= r[a->colidx[j]]; 1013 } 1014 } else { 1015 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j++) a->val[j] *= r[a->colidx[j]]; 1016 } 1017 } 1018 PetscCall(VecRestoreArrayRead(rr, &r)); 1019 PetscCall(PetscLogFlops(a->nz)); 1020 } 1021 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 1022 #if defined(PETSC_HAVE_CUPM) 1023 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU; 1024 #endif 1025 PetscFunctionReturn(PETSC_SUCCESS); 1026 } 1027 1028 PetscErrorCode MatGetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[]) 1029 { 1030 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1031 PetscInt *cp, i, k, low, high, t, row, col, l; 1032 PetscInt shift; 1033 MatScalar *vp; 1034 1035 PetscFunctionBegin; 1036 for (k = 0; k < m; k++) { /* loop over requested rows */ 1037 row = im[k]; 1038 if (row < 0) continue; 1039 PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1); 1040 shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight); /* starting index of the row */ 1041 cp = a->colidx + shift; /* pointer to the row */ 1042 vp = a->val + shift; /* pointer to the row */ 1043 for (l = 0; l < n; l++) { /* loop over requested columns */ 1044 col = in[l]; 1045 if (col < 0) continue; 1046 PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1); 1047 high = a->rlen[row]; 1048 low = 0; /* assume unsorted */ 1049 while (high - low > 5) { 1050 t = (low + high) / 2; 1051 if (*(cp + a->sliceheight * t) > col) high = t; 1052 else low = t; 1053 } 1054 for (i = low; i < high; i++) { 1055 if (*(cp + a->sliceheight * i) > col) break; 1056 if (*(cp + a->sliceheight * i) == col) { 1057 *v++ = *(vp + a->sliceheight * i); 1058 goto finished; 1059 } 1060 } 1061 *v++ = 0.0; 1062 finished:; 1063 } 1064 } 1065 PetscFunctionReturn(PETSC_SUCCESS); 1066 } 1067 1068 static PetscErrorCode MatView_SeqSELL_ASCII(Mat A, PetscViewer viewer) 1069 { 1070 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1071 PetscInt i, j, m = A->rmap->n, shift; 1072 const char *name; 1073 PetscViewerFormat format; 1074 1075 PetscFunctionBegin; 1076 PetscCall(PetscViewerGetFormat(viewer, &format)); 1077 if (format == PETSC_VIEWER_ASCII_MATLAB) { 1078 PetscInt nofinalvalue = 0; 1079 /* 1080 if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) { 1081 nofinalvalue = 1; 1082 } 1083 */ 1084 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1085 PetscCall(PetscViewerASCIIPrintf(viewer, "%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n", m, A->cmap->n)); 1086 PetscCall(PetscViewerASCIIPrintf(viewer, "%% Nonzeros = %" PetscInt_FMT " \n", a->nz)); 1087 #if defined(PETSC_USE_COMPLEX) 1088 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",4);\n", a->nz + nofinalvalue)); 1089 #else 1090 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",3);\n", a->nz + nofinalvalue)); 1091 #endif 1092 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = [\n")); 1093 1094 for (i = 0; i < m; i++) { 1095 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1096 for (j = 0; j < a->rlen[i]; j++) { 1097 #if defined(PETSC_USE_COMPLEX) 1098 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1099 #else 1100 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)a->val[shift + a->sliceheight * j])); 1101 #endif 1102 } 1103 } 1104 /* 1105 if (nofinalvalue) { 1106 #if defined(PETSC_USE_COMPLEX) 1107 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n",m,A->cmap->n,0.,0.)); 1108 #else 1109 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n",m,A->cmap->n,0.0)); 1110 #endif 1111 } 1112 */ 1113 PetscCall(PetscObjectGetName((PetscObject)A, &name)); 1114 PetscCall(PetscViewerASCIIPrintf(viewer, "];\n %s = spconvert(zzz);\n", name)); 1115 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1116 } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) { 1117 PetscFunctionReturn(PETSC_SUCCESS); 1118 } else if (format == PETSC_VIEWER_ASCII_COMMON) { 1119 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1120 for (i = 0; i < m; i++) { 1121 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1122 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1123 for (j = 0; j < a->rlen[i]; j++) { 1124 #if defined(PETSC_USE_COMPLEX) 1125 if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) { 1126 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1127 } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) { 1128 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1129 } else if (PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) { 1130 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]))); 1131 } 1132 #else 1133 if (a->val[shift + a->sliceheight * j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j])); 1134 #endif 1135 } 1136 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1137 } 1138 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1139 } else if (format == PETSC_VIEWER_ASCII_DENSE) { 1140 PetscInt cnt = 0, jcnt; 1141 PetscScalar value; 1142 #if defined(PETSC_USE_COMPLEX) 1143 PetscBool realonly = PETSC_TRUE; 1144 for (i = 0; i < a->sliidx[a->totalslices]; i++) { 1145 if (PetscImaginaryPart(a->val[i]) != 0.0) { 1146 realonly = PETSC_FALSE; 1147 break; 1148 } 1149 } 1150 #endif 1151 1152 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1153 for (i = 0; i < m; i++) { 1154 jcnt = 0; 1155 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1156 for (j = 0; j < A->cmap->n; j++) { 1157 if (jcnt < a->rlen[i] && j == a->colidx[shift + a->sliceheight * j]) { 1158 value = a->val[cnt++]; 1159 jcnt++; 1160 } else { 1161 value = 0.0; 1162 } 1163 #if defined(PETSC_USE_COMPLEX) 1164 if (realonly) { 1165 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)PetscRealPart(value))); 1166 } else { 1167 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e+%7.5e i ", (double)PetscRealPart(value), (double)PetscImaginaryPart(value))); 1168 } 1169 #else 1170 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)value)); 1171 #endif 1172 } 1173 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1174 } 1175 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1176 } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) { 1177 PetscInt fshift = 1; 1178 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1179 #if defined(PETSC_USE_COMPLEX) 1180 PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate complex general\n")); 1181 #else 1182 PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate real general\n")); 1183 #endif 1184 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz)); 1185 for (i = 0; i < m; i++) { 1186 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1187 for (j = 0; j < a->rlen[i]; j++) { 1188 #if defined(PETSC_USE_COMPLEX) 1189 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1190 #else 1191 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)a->val[shift + a->sliceheight * j])); 1192 #endif 1193 } 1194 } 1195 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1196 } else if (format == PETSC_VIEWER_NATIVE) { 1197 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 1198 PetscInt row; 1199 PetscCall(PetscViewerASCIIPrintf(viewer, "slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n", i, a->sliidx[i], a->sliidx[i + 1])); 1200 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) { 1201 #if defined(PETSC_USE_COMPLEX) 1202 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1203 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1204 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1205 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), -(double)PetscImaginaryPart(a->val[j]))); 1206 } else { 1207 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]))); 1208 } 1209 #else 1210 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)a->val[j])); 1211 #endif 1212 } 1213 } 1214 } else { 1215 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1216 if (A->factortype) { 1217 for (i = 0; i < m; i++) { 1218 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1219 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1220 /* L part */ 1221 for (j = shift; j < a->diag[i]; j += a->sliceheight) { 1222 #if defined(PETSC_USE_COMPLEX) 1223 if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0) { 1224 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1225 } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0) { 1226 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j])))); 1227 } else { 1228 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j]))); 1229 } 1230 #else 1231 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j])); 1232 #endif 1233 } 1234 /* diagonal */ 1235 j = a->diag[i]; 1236 #if defined(PETSC_USE_COMPLEX) 1237 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1238 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)PetscImaginaryPart(1.0 / a->val[j]))); 1239 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1240 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)(-PetscImaginaryPart(1.0 / a->val[j])))); 1241 } else { 1242 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]))); 1243 } 1244 #else 1245 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)(1 / a->val[j]))); 1246 #endif 1247 1248 /* U part */ 1249 for (j = a->diag[i] + 1; j < shift + a->sliceheight * a->rlen[i]; j += a->sliceheight) { 1250 #if defined(PETSC_USE_COMPLEX) 1251 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1252 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1253 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1254 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j])))); 1255 } else { 1256 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j]))); 1257 } 1258 #else 1259 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j])); 1260 #endif 1261 } 1262 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1263 } 1264 } else { 1265 for (i = 0; i < m; i++) { 1266 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1267 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1268 for (j = 0; j < a->rlen[i]; j++) { 1269 #if defined(PETSC_USE_COMPLEX) 1270 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1271 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1272 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1273 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1274 } else { 1275 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]))); 1276 } 1277 #else 1278 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j])); 1279 #endif 1280 } 1281 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1282 } 1283 } 1284 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1285 } 1286 PetscCall(PetscViewerFlush(viewer)); 1287 PetscFunctionReturn(PETSC_SUCCESS); 1288 } 1289 1290 #include <petscdraw.h> 1291 static PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw, void *Aa) 1292 { 1293 Mat A = (Mat)Aa; 1294 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1295 PetscInt i, j, m = A->rmap->n, shift; 1296 int color; 1297 PetscReal xl, yl, xr, yr, x_l, x_r, y_l, y_r; 1298 PetscViewer viewer; 1299 PetscViewerFormat format; 1300 1301 PetscFunctionBegin; 1302 PetscCall(PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer)); 1303 PetscCall(PetscViewerGetFormat(viewer, &format)); 1304 PetscCall(PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr)); 1305 1306 /* loop over matrix elements drawing boxes */ 1307 1308 if (format != PETSC_VIEWER_DRAW_CONTOUR) { 1309 PetscDrawCollectiveBegin(draw); 1310 /* Blue for negative, Cyan for zero and Red for positive */ 1311 color = PETSC_DRAW_BLUE; 1312 for (i = 0; i < m; i++) { 1313 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1314 y_l = m - i - 1.0; 1315 y_r = y_l + 1.0; 1316 for (j = 0; j < a->rlen[i]; j++) { 1317 x_l = a->colidx[shift + a->sliceheight * j]; 1318 x_r = x_l + 1.0; 1319 if (PetscRealPart(a->val[shift + a->sliceheight * j]) >= 0.) continue; 1320 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1321 } 1322 } 1323 color = PETSC_DRAW_CYAN; 1324 for (i = 0; i < m; i++) { 1325 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1326 y_l = m - i - 1.0; 1327 y_r = y_l + 1.0; 1328 for (j = 0; j < a->rlen[i]; j++) { 1329 x_l = a->colidx[shift + a->sliceheight * j]; 1330 x_r = x_l + 1.0; 1331 if (a->val[shift + a->sliceheight * j] != 0.) continue; 1332 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1333 } 1334 } 1335 color = PETSC_DRAW_RED; 1336 for (i = 0; i < m; i++) { 1337 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1338 y_l = m - i - 1.0; 1339 y_r = y_l + 1.0; 1340 for (j = 0; j < a->rlen[i]; j++) { 1341 x_l = a->colidx[shift + a->sliceheight * j]; 1342 x_r = x_l + 1.0; 1343 if (PetscRealPart(a->val[shift + a->sliceheight * j]) <= 0.) continue; 1344 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1345 } 1346 } 1347 PetscDrawCollectiveEnd(draw); 1348 } else { 1349 /* use contour shading to indicate magnitude of values */ 1350 /* first determine max of all nonzero values */ 1351 PetscReal minv = 0.0, maxv = 0.0; 1352 PetscInt count = 0; 1353 PetscDraw popup; 1354 for (i = 0; i < a->sliidx[a->totalslices]; i++) { 1355 if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]); 1356 } 1357 if (minv >= maxv) maxv = minv + PETSC_SMALL; 1358 PetscCall(PetscDrawGetPopup(draw, &popup)); 1359 PetscCall(PetscDrawScalePopup(popup, minv, maxv)); 1360 1361 PetscDrawCollectiveBegin(draw); 1362 for (i = 0; i < m; i++) { 1363 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1364 y_l = m - i - 1.0; 1365 y_r = y_l + 1.0; 1366 for (j = 0; j < a->rlen[i]; j++) { 1367 x_l = a->colidx[shift + a->sliceheight * j]; 1368 x_r = x_l + 1.0; 1369 color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]), minv, maxv); 1370 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1371 count++; 1372 } 1373 } 1374 PetscDrawCollectiveEnd(draw); 1375 } 1376 PetscFunctionReturn(PETSC_SUCCESS); 1377 } 1378 1379 #include <petscdraw.h> 1380 static PetscErrorCode MatView_SeqSELL_Draw(Mat A, PetscViewer viewer) 1381 { 1382 PetscDraw draw; 1383 PetscReal xr, yr, xl, yl, h, w; 1384 PetscBool isnull; 1385 1386 PetscFunctionBegin; 1387 PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw)); 1388 PetscCall(PetscDrawIsNull(draw, &isnull)); 1389 if (isnull) PetscFunctionReturn(PETSC_SUCCESS); 1390 1391 xr = A->cmap->n; 1392 yr = A->rmap->n; 1393 h = yr / 10.0; 1394 w = xr / 10.0; 1395 xr += w; 1396 yr += h; 1397 xl = -w; 1398 yl = -h; 1399 PetscCall(PetscDrawSetCoordinates(draw, xl, yl, xr, yr)); 1400 PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer)); 1401 PetscCall(PetscDrawZoom(draw, MatView_SeqSELL_Draw_Zoom, A)); 1402 PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL)); 1403 PetscCall(PetscDrawSave(draw)); 1404 PetscFunctionReturn(PETSC_SUCCESS); 1405 } 1406 1407 PetscErrorCode MatView_SeqSELL(Mat A, PetscViewer viewer) 1408 { 1409 PetscBool iascii, isbinary, isdraw; 1410 1411 PetscFunctionBegin; 1412 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii)); 1413 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary)); 1414 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw)); 1415 if (iascii) { 1416 PetscCall(MatView_SeqSELL_ASCII(A, viewer)); 1417 } else if (isbinary) { 1418 /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */ 1419 } else if (isdraw) PetscCall(MatView_SeqSELL_Draw(A, viewer)); 1420 PetscFunctionReturn(PETSC_SUCCESS); 1421 } 1422 1423 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A, MatAssemblyType mode) 1424 { 1425 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1426 PetscInt i, shift, row_in_slice, row, nrow, *cp, lastcol, j, k; 1427 MatScalar *vp; 1428 #if defined(PETSC_HAVE_CUPM) 1429 PetscInt totalchunks = 0; 1430 #endif 1431 1432 PetscFunctionBegin; 1433 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 1434 /* To do: compress out the unused elements */ 1435 PetscCall(MatMarkDiagonal_SeqSELL(A)); 1436 PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n", A->rmap->n, A->cmap->n, a->maxallocmat, a->sliidx[a->totalslices], a->nz, a->sliidx[a->totalslices] - a->nz)); 1437 PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n", a->reallocs)); 1438 PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rlenmax)); 1439 a->nonzerorowcnt = 0; 1440 /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */ 1441 for (i = 0; i < a->totalslices; ++i) { 1442 shift = a->sliidx[i]; /* starting index of the slice */ 1443 cp = PetscSafePointerPlusOffset(a->colidx, shift); /* pointer to the column indices of the slice */ 1444 vp = PetscSafePointerPlusOffset(a->val, shift); /* pointer to the nonzero values of the slice */ 1445 for (row_in_slice = 0; row_in_slice < a->sliceheight; ++row_in_slice) { /* loop over rows in the slice */ 1446 row = a->sliceheight * i + row_in_slice; 1447 nrow = a->rlen[row]; /* number of nonzeros in row */ 1448 /* 1449 Search for the nearest nonzero. Normally setting the index to zero may cause extra communication. 1450 But if the entire slice are empty, it is fine to use 0 since the index will not be loaded. 1451 */ 1452 lastcol = 0; 1453 if (nrow > 0) { /* nonempty row */ 1454 a->nonzerorowcnt++; 1455 lastcol = cp[a->sliceheight * (nrow - 1) + row_in_slice]; /* use the index from the last nonzero at current row */ 1456 } else if (!row_in_slice) { /* first row of the correct slice is empty */ 1457 for (j = 1; j < a->sliceheight; j++) { 1458 if (a->rlen[a->sliceheight * i + j]) { 1459 lastcol = cp[j]; 1460 break; 1461 } 1462 } 1463 } else { 1464 if (a->sliidx[i + 1] != shift) lastcol = cp[row_in_slice - 1]; /* use the index from the previous row */ 1465 } 1466 1467 for (k = nrow; k < (a->sliidx[i + 1] - shift) / a->sliceheight; ++k) { 1468 cp[a->sliceheight * k + row_in_slice] = lastcol; 1469 vp[a->sliceheight * k + row_in_slice] = (MatScalar)0; 1470 } 1471 } 1472 } 1473 1474 A->info.mallocs += a->reallocs; 1475 a->reallocs = 0; 1476 1477 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 1478 #if defined(PETSC_HAVE_CUPM) 1479 if (!a->chunksize && a->totalslices) { 1480 a->chunksize = 64; 1481 while (a->chunksize < 1024 && 2 * a->chunksize <= a->sliidx[a->totalslices] / a->totalslices) a->chunksize *= 2; 1482 totalchunks = 1 + (a->sliidx[a->totalslices] - 1) / a->chunksize; 1483 } 1484 if (totalchunks != a->totalchunks) { 1485 PetscCall(PetscFree(a->chunk_slice_map)); 1486 PetscCall(PetscMalloc1(totalchunks, &a->chunk_slice_map)); 1487 a->totalchunks = totalchunks; 1488 } 1489 j = 0; 1490 for (i = 0; i < totalchunks; i++) { 1491 while (a->sliidx[j + 1] <= i * a->chunksize && j < a->totalslices) j++; 1492 a->chunk_slice_map[i] = j; 1493 } 1494 #endif 1495 PetscFunctionReturn(PETSC_SUCCESS); 1496 } 1497 1498 PetscErrorCode MatGetInfo_SeqSELL(Mat A, MatInfoType flag, MatInfo *info) 1499 { 1500 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1501 1502 PetscFunctionBegin; 1503 info->block_size = 1.0; 1504 info->nz_allocated = a->maxallocmat; 1505 info->nz_used = a->sliidx[a->totalslices]; /* include padding zeros */ 1506 info->nz_unneeded = (a->maxallocmat - a->sliidx[a->totalslices]); 1507 info->assemblies = A->num_ass; 1508 info->mallocs = A->info.mallocs; 1509 info->memory = 0; /* REVIEW ME */ 1510 if (A->factortype) { 1511 info->fill_ratio_given = A->info.fill_ratio_given; 1512 info->fill_ratio_needed = A->info.fill_ratio_needed; 1513 info->factor_mallocs = A->info.factor_mallocs; 1514 } else { 1515 info->fill_ratio_given = 0; 1516 info->fill_ratio_needed = 0; 1517 info->factor_mallocs = 0; 1518 } 1519 PetscFunctionReturn(PETSC_SUCCESS); 1520 } 1521 1522 PetscErrorCode MatSetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is) 1523 { 1524 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1525 PetscInt shift, i, k, l, low, high, t, ii, row, col, nrow; 1526 PetscInt *cp, nonew = a->nonew, lastcol = -1; 1527 MatScalar *vp, value; 1528 #if defined(PETSC_HAVE_CUPM) 1529 PetscBool inserted = PETSC_FALSE; 1530 PetscInt mul = DEVICE_MEM_ALIGN / a->sliceheight; 1531 #endif 1532 1533 PetscFunctionBegin; 1534 for (k = 0; k < m; k++) { /* loop over added rows */ 1535 row = im[k]; 1536 if (row < 0) continue; 1537 PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1); 1538 shift = a->sliidx[row / a->sliceheight] + row % a->sliceheight; /* starting index of the row */ 1539 cp = a->colidx + shift; /* pointer to the row */ 1540 vp = a->val + shift; /* pointer to the row */ 1541 nrow = a->rlen[row]; 1542 low = 0; 1543 high = nrow; 1544 1545 for (l = 0; l < n; l++) { /* loop over added columns */ 1546 col = in[l]; 1547 if (col < 0) continue; 1548 PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1); 1549 if (a->roworiented) { 1550 value = v[l + k * n]; 1551 } else { 1552 value = v[k + l * m]; 1553 } 1554 if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue; 1555 1556 /* search in this row for the specified column, i indicates the column to be set */ 1557 if (col <= lastcol) low = 0; 1558 else high = nrow; 1559 lastcol = col; 1560 while (high - low > 5) { 1561 t = (low + high) / 2; 1562 if (*(cp + a->sliceheight * t) > col) high = t; 1563 else low = t; 1564 } 1565 for (i = low; i < high; i++) { 1566 if (*(cp + a->sliceheight * i) > col) break; 1567 if (*(cp + a->sliceheight * i) == col) { 1568 if (is == ADD_VALUES) *(vp + a->sliceheight * i) += value; 1569 else *(vp + a->sliceheight * i) = value; 1570 #if defined(PETSC_HAVE_CUPM) 1571 inserted = PETSC_TRUE; 1572 #endif 1573 low = i + 1; 1574 goto noinsert; 1575 } 1576 } 1577 if (value == 0.0 && a->ignorezeroentries) goto noinsert; 1578 if (nonew == 1) goto noinsert; 1579 PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col); 1580 #if defined(PETSC_HAVE_CUPM) 1581 MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar, mul); 1582 #else 1583 /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */ 1584 MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar, 1); 1585 #endif 1586 /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */ 1587 for (ii = nrow - 1; ii >= i; ii--) { 1588 *(cp + a->sliceheight * (ii + 1)) = *(cp + a->sliceheight * ii); 1589 *(vp + a->sliceheight * (ii + 1)) = *(vp + a->sliceheight * ii); 1590 } 1591 a->rlen[row]++; 1592 *(cp + a->sliceheight * i) = col; 1593 *(vp + a->sliceheight * i) = value; 1594 a->nz++; 1595 #if defined(PETSC_HAVE_CUPM) 1596 inserted = PETSC_TRUE; 1597 #endif 1598 low = i + 1; 1599 high++; 1600 nrow++; 1601 noinsert:; 1602 } 1603 a->rlen[row] = nrow; 1604 } 1605 #if defined(PETSC_HAVE_CUPM) 1606 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && inserted) A->offloadmask = PETSC_OFFLOAD_CPU; 1607 #endif 1608 PetscFunctionReturn(PETSC_SUCCESS); 1609 } 1610 1611 PetscErrorCode MatCopy_SeqSELL(Mat A, Mat B, MatStructure str) 1612 { 1613 PetscFunctionBegin; 1614 /* If the two matrices have the same copy implementation, use fast copy. */ 1615 if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) { 1616 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1617 Mat_SeqSELL *b = (Mat_SeqSELL *)B->data; 1618 1619 PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices], PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Number of nonzeros in two matrices are different"); 1620 PetscCall(PetscArraycpy(b->val, a->val, a->sliidx[a->totalslices])); 1621 } else { 1622 PetscCall(MatCopy_Basic(A, B, str)); 1623 } 1624 PetscFunctionReturn(PETSC_SUCCESS); 1625 } 1626 1627 PetscErrorCode MatSetUp_SeqSELL(Mat A) 1628 { 1629 PetscFunctionBegin; 1630 PetscCall(MatSeqSELLSetPreallocation(A, PETSC_DEFAULT, NULL)); 1631 PetscFunctionReturn(PETSC_SUCCESS); 1632 } 1633 1634 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A, PetscScalar *array[]) 1635 { 1636 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1637 1638 PetscFunctionBegin; 1639 *array = a->val; 1640 PetscFunctionReturn(PETSC_SUCCESS); 1641 } 1642 1643 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A, PetscScalar *array[]) 1644 { 1645 PetscFunctionBegin; 1646 PetscFunctionReturn(PETSC_SUCCESS); 1647 } 1648 1649 PetscErrorCode MatScale_SeqSELL(Mat inA, PetscScalar alpha) 1650 { 1651 Mat_SeqSELL *a = (Mat_SeqSELL *)inA->data; 1652 MatScalar *aval = a->val; 1653 PetscScalar oalpha = alpha; 1654 PetscBLASInt one = 1, size; 1655 1656 PetscFunctionBegin; 1657 PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices], &size)); 1658 PetscCallBLAS("BLASscal", BLASscal_(&size, &oalpha, aval, &one)); 1659 PetscCall(PetscLogFlops(a->nz)); 1660 PetscCall(MatSeqSELLInvalidateDiagonal(inA)); 1661 #if defined(PETSC_HAVE_CUPM) 1662 if (inA->offloadmask != PETSC_OFFLOAD_UNALLOCATED) inA->offloadmask = PETSC_OFFLOAD_CPU; 1663 #endif 1664 PetscFunctionReturn(PETSC_SUCCESS); 1665 } 1666 1667 PetscErrorCode MatShift_SeqSELL(Mat Y, PetscScalar a) 1668 { 1669 Mat_SeqSELL *y = (Mat_SeqSELL *)Y->data; 1670 1671 PetscFunctionBegin; 1672 if (!Y->preallocated || !y->nz) PetscCall(MatSeqSELLSetPreallocation(Y, 1, NULL)); 1673 PetscCall(MatShift_Basic(Y, a)); 1674 PetscFunctionReturn(PETSC_SUCCESS); 1675 } 1676 1677 PetscErrorCode MatSOR_SeqSELL(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx) 1678 { 1679 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1680 PetscScalar *x, sum, *t; 1681 const MatScalar *idiag = NULL, *mdiag; 1682 const PetscScalar *b, *xb; 1683 PetscInt n, m = A->rmap->n, i, j, shift; 1684 const PetscInt *diag; 1685 1686 PetscFunctionBegin; 1687 its = its * lits; 1688 1689 if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */ 1690 if (!a->idiagvalid) PetscCall(MatInvertDiagonal_SeqSELL(A, omega, fshift)); 1691 a->fshift = fshift; 1692 a->omega = omega; 1693 1694 diag = a->diag; 1695 t = a->ssor_work; 1696 idiag = a->idiag; 1697 mdiag = a->mdiag; 1698 1699 PetscCall(VecGetArray(xx, &x)); 1700 PetscCall(VecGetArrayRead(bb, &b)); 1701 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 1702 PetscCheck(flag != SOR_APPLY_UPPER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_UPPER is not implemented"); 1703 PetscCheck(flag != SOR_APPLY_LOWER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_LOWER is not implemented"); 1704 PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat"); 1705 1706 if (flag & SOR_ZERO_INITIAL_GUESS) { 1707 if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) { 1708 for (i = 0; i < m; i++) { 1709 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1710 sum = b[i]; 1711 n = (diag[i] - shift) / a->sliceheight; 1712 for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]]; 1713 t[i] = sum; 1714 x[i] = sum * idiag[i]; 1715 } 1716 xb = t; 1717 PetscCall(PetscLogFlops(a->nz)); 1718 } else xb = b; 1719 if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1720 for (i = m - 1; i >= 0; i--) { 1721 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1722 sum = xb[i]; 1723 n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1; 1724 for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]]; 1725 if (xb == b) { 1726 x[i] = sum * idiag[i]; 1727 } else { 1728 x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */ 1729 } 1730 } 1731 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */ 1732 } 1733 its--; 1734 } 1735 while (its--) { 1736 if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) { 1737 for (i = 0; i < m; i++) { 1738 /* lower */ 1739 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1740 sum = b[i]; 1741 n = (diag[i] - shift) / a->sliceheight; 1742 for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]]; 1743 t[i] = sum; /* save application of the lower-triangular part */ 1744 /* upper */ 1745 n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1; 1746 for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]]; 1747 x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */ 1748 } 1749 xb = t; 1750 PetscCall(PetscLogFlops(2.0 * a->nz)); 1751 } else xb = b; 1752 if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1753 for (i = m - 1; i >= 0; i--) { 1754 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1755 sum = xb[i]; 1756 if (xb == b) { 1757 /* whole matrix (no checkpointing available) */ 1758 n = a->rlen[i]; 1759 for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]]; 1760 x[i] = (1. - omega) * x[i] + (sum + mdiag[i] * x[i]) * idiag[i]; 1761 } else { /* lower-triangular part has been saved, so only apply upper-triangular */ 1762 n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1; 1763 for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]]; 1764 x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */ 1765 } 1766 } 1767 if (xb == b) { 1768 PetscCall(PetscLogFlops(2.0 * a->nz)); 1769 } else { 1770 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */ 1771 } 1772 } 1773 } 1774 PetscCall(VecRestoreArray(xx, &x)); 1775 PetscCall(VecRestoreArrayRead(bb, &b)); 1776 PetscFunctionReturn(PETSC_SUCCESS); 1777 } 1778 1779 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL, 1780 MatGetRow_SeqSELL, 1781 MatRestoreRow_SeqSELL, 1782 MatMult_SeqSELL, 1783 /* 4*/ MatMultAdd_SeqSELL, 1784 MatMultTranspose_SeqSELL, 1785 MatMultTransposeAdd_SeqSELL, 1786 NULL, 1787 NULL, 1788 NULL, 1789 /* 10*/ NULL, 1790 NULL, 1791 NULL, 1792 MatSOR_SeqSELL, 1793 NULL, 1794 /* 15*/ MatGetInfo_SeqSELL, 1795 MatEqual_SeqSELL, 1796 MatGetDiagonal_SeqSELL, 1797 MatDiagonalScale_SeqSELL, 1798 NULL, 1799 /* 20*/ NULL, 1800 MatAssemblyEnd_SeqSELL, 1801 MatSetOption_SeqSELL, 1802 MatZeroEntries_SeqSELL, 1803 /* 24*/ NULL, 1804 NULL, 1805 NULL, 1806 NULL, 1807 NULL, 1808 /* 29*/ MatSetUp_SeqSELL, 1809 NULL, 1810 NULL, 1811 NULL, 1812 NULL, 1813 /* 34*/ MatDuplicate_SeqSELL, 1814 NULL, 1815 NULL, 1816 NULL, 1817 NULL, 1818 /* 39*/ NULL, 1819 NULL, 1820 NULL, 1821 MatGetValues_SeqSELL, 1822 MatCopy_SeqSELL, 1823 /* 44*/ NULL, 1824 MatScale_SeqSELL, 1825 MatShift_SeqSELL, 1826 NULL, 1827 NULL, 1828 /* 49*/ NULL, 1829 NULL, 1830 NULL, 1831 NULL, 1832 NULL, 1833 /* 54*/ MatFDColoringCreate_SeqXAIJ, 1834 NULL, 1835 NULL, 1836 NULL, 1837 NULL, 1838 /* 59*/ NULL, 1839 MatDestroy_SeqSELL, 1840 MatView_SeqSELL, 1841 NULL, 1842 NULL, 1843 /* 64*/ NULL, 1844 NULL, 1845 NULL, 1846 NULL, 1847 NULL, 1848 /* 69*/ NULL, 1849 NULL, 1850 NULL, 1851 NULL, 1852 NULL, 1853 /* 74*/ NULL, 1854 MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */ 1855 NULL, 1856 NULL, 1857 NULL, 1858 /* 79*/ NULL, 1859 NULL, 1860 NULL, 1861 NULL, 1862 NULL, 1863 /* 84*/ NULL, 1864 NULL, 1865 NULL, 1866 NULL, 1867 NULL, 1868 /* 89*/ NULL, 1869 NULL, 1870 NULL, 1871 NULL, 1872 NULL, 1873 /* 94*/ NULL, 1874 NULL, 1875 NULL, 1876 NULL, 1877 NULL, 1878 /* 99*/ NULL, 1879 NULL, 1880 NULL, 1881 MatConjugate_SeqSELL, 1882 NULL, 1883 /*104*/ NULL, 1884 NULL, 1885 NULL, 1886 NULL, 1887 NULL, 1888 /*109*/ NULL, 1889 NULL, 1890 NULL, 1891 NULL, 1892 MatMissingDiagonal_SeqSELL, 1893 /*114*/ NULL, 1894 NULL, 1895 NULL, 1896 NULL, 1897 NULL, 1898 /*119*/ NULL, 1899 NULL, 1900 NULL, 1901 NULL, 1902 NULL, 1903 /*124*/ NULL, 1904 NULL, 1905 NULL, 1906 NULL, 1907 NULL, 1908 /*129*/ NULL, 1909 NULL, 1910 NULL, 1911 NULL, 1912 NULL, 1913 /*134*/ NULL, 1914 NULL, 1915 NULL, 1916 NULL, 1917 NULL, 1918 /*139*/ NULL, 1919 NULL, 1920 NULL, 1921 MatFDColoringSetUp_SeqXAIJ, 1922 NULL, 1923 /*144*/ NULL, 1924 NULL, 1925 NULL, 1926 NULL, 1927 NULL, 1928 NULL, 1929 /*150*/ NULL, 1930 NULL, 1931 NULL, 1932 NULL, 1933 NULL, 1934 /*155*/ NULL, 1935 NULL}; 1936 1937 static PetscErrorCode MatStoreValues_SeqSELL(Mat mat) 1938 { 1939 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1940 1941 PetscFunctionBegin; 1942 PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first"); 1943 1944 /* allocate space for values if not already there */ 1945 if (!a->saved_values) PetscCall(PetscMalloc1(a->sliidx[a->totalslices] + 1, &a->saved_values)); 1946 1947 /* copy values over */ 1948 PetscCall(PetscArraycpy(a->saved_values, a->val, a->sliidx[a->totalslices])); 1949 PetscFunctionReturn(PETSC_SUCCESS); 1950 } 1951 1952 static PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat) 1953 { 1954 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1955 1956 PetscFunctionBegin; 1957 PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first"); 1958 PetscCheck(a->saved_values, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatStoreValues(A);first"); 1959 PetscCall(PetscArraycpy(a->val, a->saved_values, a->sliidx[a->totalslices])); 1960 PetscFunctionReturn(PETSC_SUCCESS); 1961 } 1962 1963 static PetscErrorCode MatSeqSELLGetFillRatio_SeqSELL(Mat mat, PetscReal *ratio) 1964 { 1965 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1966 1967 PetscFunctionBegin; 1968 if (a->totalslices && a->sliidx[a->totalslices]) { 1969 *ratio = (PetscReal)(a->sliidx[a->totalslices] - a->nz) / a->sliidx[a->totalslices]; 1970 } else { 1971 *ratio = 0.0; 1972 } 1973 PetscFunctionReturn(PETSC_SUCCESS); 1974 } 1975 1976 static PetscErrorCode MatSeqSELLGetMaxSliceWidth_SeqSELL(Mat mat, PetscInt *slicewidth) 1977 { 1978 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1979 PetscInt i, current_slicewidth; 1980 1981 PetscFunctionBegin; 1982 *slicewidth = 0; 1983 for (i = 0; i < a->totalslices; i++) { 1984 current_slicewidth = (a->sliidx[i + 1] - a->sliidx[i]) / a->sliceheight; 1985 if (current_slicewidth > *slicewidth) *slicewidth = current_slicewidth; 1986 } 1987 PetscFunctionReturn(PETSC_SUCCESS); 1988 } 1989 1990 static PetscErrorCode MatSeqSELLGetAvgSliceWidth_SeqSELL(Mat mat, PetscReal *slicewidth) 1991 { 1992 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1993 1994 PetscFunctionBegin; 1995 *slicewidth = 0; 1996 if (a->totalslices) { *slicewidth = (PetscReal)a->sliidx[a->totalslices] / a->sliceheight / a->totalslices; } 1997 PetscFunctionReturn(PETSC_SUCCESS); 1998 } 1999 2000 static PetscErrorCode MatSeqSELLGetVarSliceSize_SeqSELL(Mat mat, PetscReal *variance) 2001 { 2002 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 2003 PetscReal mean; 2004 PetscInt i, totalslices = a->totalslices, *sliidx = a->sliidx; 2005 2006 PetscFunctionBegin; 2007 *variance = 0; 2008 if (totalslices) { 2009 mean = (PetscReal)sliidx[totalslices] / totalslices; 2010 for (i = 1; i <= totalslices; i++) { *variance += ((PetscReal)(sliidx[i] - sliidx[i - 1]) - mean) * ((PetscReal)(sliidx[i] - sliidx[i - 1]) - mean) / totalslices; } 2011 } 2012 PetscFunctionReturn(PETSC_SUCCESS); 2013 } 2014 2015 static PetscErrorCode MatSeqSELLSetSliceHeight_SeqSELL(Mat A, PetscInt sliceheight) 2016 { 2017 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 2018 2019 PetscFunctionBegin; 2020 if (A->preallocated) PetscFunctionReturn(PETSC_SUCCESS); 2021 PetscCheck(a->sliceheight <= 0 || a->sliceheight == sliceheight, PETSC_COMM_SELF, PETSC_ERR_SUP, "Cannot change slice height %" PetscInt_FMT " to %" PetscInt_FMT, a->sliceheight, sliceheight); 2022 a->sliceheight = sliceheight; 2023 #if defined(PETSC_HAVE_CUPM) 2024 PetscCheck(PetscMax(DEVICE_MEM_ALIGN, sliceheight) % PetscMin(DEVICE_MEM_ALIGN, sliceheight) == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The slice height is not compatible with DEVICE_MEM_ALIGN (one must be divisible by the other) %" PetscInt_FMT, sliceheight); 2025 #endif 2026 PetscFunctionReturn(PETSC_SUCCESS); 2027 } 2028 2029 /*@ 2030 MatSeqSELLGetFillRatio - returns a ratio that indicates the irregularity of the matrix. 2031 2032 Not Collective 2033 2034 Input Parameter: 2035 . A - a MATSEQSELL matrix 2036 2037 Output Parameter: 2038 . ratio - ratio of number of padded zeros to number of allocated elements 2039 2040 Level: intermediate 2041 2042 .seealso: `MATSEQSELL`, `MatSeqSELLGetAvgSliceWidth()` 2043 @*/ 2044 PetscErrorCode MatSeqSELLGetFillRatio(Mat A, PetscReal *ratio) 2045 { 2046 PetscFunctionBegin; 2047 PetscUseMethod(A, "MatSeqSELLGetFillRatio_C", (Mat, PetscReal *), (A, ratio)); 2048 PetscFunctionReturn(PETSC_SUCCESS); 2049 } 2050 2051 /*@ 2052 MatSeqSELLGetMaxSliceWidth - returns the maximum slice width. 2053 2054 Not Collective 2055 2056 Input Parameter: 2057 . A - a MATSEQSELL matrix 2058 2059 Output Parameter: 2060 . slicewidth - maximum slice width 2061 2062 Level: intermediate 2063 2064 .seealso: `MATSEQSELL`, `MatSeqSELLGetAvgSliceWidth()` 2065 @*/ 2066 PetscErrorCode MatSeqSELLGetMaxSliceWidth(Mat A, PetscInt *slicewidth) 2067 { 2068 PetscFunctionBegin; 2069 PetscUseMethod(A, "MatSeqSELLGetMaxSliceWidth_C", (Mat, PetscInt *), (A, slicewidth)); 2070 PetscFunctionReturn(PETSC_SUCCESS); 2071 } 2072 2073 /*@ 2074 MatSeqSELLGetAvgSliceWidth - returns the average slice width. 2075 2076 Not Collective 2077 2078 Input Parameter: 2079 . A - a MATSEQSELL matrix 2080 2081 Output Parameter: 2082 . slicewidth - average slice width 2083 2084 Level: intermediate 2085 2086 .seealso: `MATSEQSELL`, `MatSeqSELLGetMaxSliceWidth()` 2087 @*/ 2088 PetscErrorCode MatSeqSELLGetAvgSliceWidth(Mat A, PetscReal *slicewidth) 2089 { 2090 PetscFunctionBegin; 2091 PetscUseMethod(A, "MatSeqSELLGetAvgSliceWidth_C", (Mat, PetscReal *), (A, slicewidth)); 2092 PetscFunctionReturn(PETSC_SUCCESS); 2093 } 2094 2095 /*@ 2096 MatSeqSELLSetSliceHeight - sets the slice height. 2097 2098 Not Collective 2099 2100 Input Parameters: 2101 + A - a MATSEQSELL matrix 2102 - sliceheight - slice height 2103 2104 Notes: 2105 You cannot change the slice height once it have been set. 2106 2107 The slice height must be set before MatSetUp() or MatXXXSetPreallocation() is called. 2108 2109 Level: intermediate 2110 2111 .seealso: `MATSEQSELL`, `MatSeqSELLGetVarSliceSize()` 2112 @*/ 2113 PetscErrorCode MatSeqSELLSetSliceHeight(Mat A, PetscInt sliceheight) 2114 { 2115 PetscFunctionBegin; 2116 PetscUseMethod(A, "MatSeqSELLSetSliceHeight_C", (Mat, PetscInt), (A, sliceheight)); 2117 PetscFunctionReturn(PETSC_SUCCESS); 2118 } 2119 2120 /*@ 2121 MatSeqSELLGetVarSliceSize - returns the variance of the slice size. 2122 2123 Not Collective 2124 2125 Input Parameter: 2126 . A - a MATSEQSELL matrix 2127 2128 Output Parameter: 2129 . variance - variance of the slice size 2130 2131 Level: intermediate 2132 2133 .seealso: `MATSEQSELL`, `MatSeqSELLSetSliceHeight()` 2134 @*/ 2135 PetscErrorCode MatSeqSELLGetVarSliceSize(Mat A, PetscReal *variance) 2136 { 2137 PetscFunctionBegin; 2138 PetscUseMethod(A, "MatSeqSELLGetVarSliceSize_C", (Mat, PetscReal *), (A, variance)); 2139 PetscFunctionReturn(PETSC_SUCCESS); 2140 } 2141 2142 #if defined(PETSC_HAVE_CUDA) 2143 PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat); 2144 #endif 2145 #if defined(PETSC_HAVE_HIP) 2146 PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLHIP(Mat); 2147 #endif 2148 2149 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B) 2150 { 2151 Mat_SeqSELL *b; 2152 PetscMPIInt size; 2153 2154 PetscFunctionBegin; 2155 PetscCall(PetscCitationsRegister(citation, &cited)); 2156 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size)); 2157 PetscCheck(size <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Comm must be of size 1"); 2158 2159 PetscCall(PetscNew(&b)); 2160 2161 B->data = (void *)b; 2162 B->ops[0] = MatOps_Values; 2163 2164 b->row = NULL; 2165 b->col = NULL; 2166 b->icol = NULL; 2167 b->reallocs = 0; 2168 b->ignorezeroentries = PETSC_FALSE; 2169 b->roworiented = PETSC_TRUE; 2170 b->nonew = 0; 2171 b->diag = NULL; 2172 b->solve_work = NULL; 2173 B->spptr = NULL; 2174 b->saved_values = NULL; 2175 b->idiag = NULL; 2176 b->mdiag = NULL; 2177 b->ssor_work = NULL; 2178 b->omega = 1.0; 2179 b->fshift = 0.0; 2180 b->idiagvalid = PETSC_FALSE; 2181 b->keepnonzeropattern = PETSC_FALSE; 2182 b->sliceheight = 0; 2183 2184 PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELL)); 2185 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetArray_C", MatSeqSELLGetArray_SeqSELL)); 2186 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLRestoreArray_C", MatSeqSELLRestoreArray_SeqSELL)); 2187 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqSELL)); 2188 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqSELL)); 2189 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetPreallocation_C", MatSeqSELLSetPreallocation_SeqSELL)); 2190 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqaij_C", MatConvert_SeqSELL_SeqAIJ)); 2191 #if defined(PETSC_HAVE_CUDA) 2192 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqsellcuda_C", MatConvert_SeqSELL_SeqSELLCUDA)); 2193 #endif 2194 #if defined(PETSC_HAVE_HIP) 2195 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqsellhip_C", MatConvert_SeqSELL_SeqSELLHIP)); 2196 #endif 2197 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetFillRatio_C", MatSeqSELLGetFillRatio_SeqSELL)); 2198 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetMaxSliceWidth_C", MatSeqSELLGetMaxSliceWidth_SeqSELL)); 2199 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetAvgSliceWidth_C", MatSeqSELLGetAvgSliceWidth_SeqSELL)); 2200 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetVarSliceSize_C", MatSeqSELLGetVarSliceSize_SeqSELL)); 2201 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetSliceHeight_C", MatSeqSELLSetSliceHeight_SeqSELL)); 2202 2203 PetscObjectOptionsBegin((PetscObject)B); 2204 { 2205 PetscInt newsh = -1; 2206 PetscBool flg; 2207 #if defined(PETSC_HAVE_CUPM) 2208 PetscInt chunksize = 0; 2209 #endif 2210 2211 PetscCall(PetscOptionsInt("-mat_sell_slice_height", "Set the slice height used to store SELL matrix", "MatSELLSetSliceHeight", newsh, &newsh, &flg)); 2212 if (flg) { PetscCall(MatSeqSELLSetSliceHeight(B, newsh)); } 2213 #if defined(PETSC_HAVE_CUPM) 2214 PetscCall(PetscOptionsInt("-mat_sell_chunk_size", "Set the chunksize for load-balanced CUDA/HIP kernels. Choices include 64,128,256,512,1024", NULL, chunksize, &chunksize, &flg)); 2215 if (flg) { 2216 PetscCheck(chunksize >= 64 && chunksize <= 1024 && chunksize % 64 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "chunksize must be a number in {64,128,256,512,1024}: value %" PetscInt_FMT, chunksize); 2217 b->chunksize = chunksize; 2218 } 2219 #endif 2220 } 2221 PetscOptionsEnd(); 2222 PetscFunctionReturn(PETSC_SUCCESS); 2223 } 2224 2225 /* 2226 Given a matrix generated with MatGetFactor() duplicates all the information in A into B 2227 */ 2228 static PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace) 2229 { 2230 Mat_SeqSELL *c = (Mat_SeqSELL *)C->data, *a = (Mat_SeqSELL *)A->data; 2231 PetscInt i, m = A->rmap->n; 2232 PetscInt totalslices = a->totalslices; 2233 2234 PetscFunctionBegin; 2235 C->factortype = A->factortype; 2236 c->row = NULL; 2237 c->col = NULL; 2238 c->icol = NULL; 2239 c->reallocs = 0; 2240 C->assembled = PETSC_TRUE; 2241 2242 PetscCall(PetscLayoutReference(A->rmap, &C->rmap)); 2243 PetscCall(PetscLayoutReference(A->cmap, &C->cmap)); 2244 2245 c->sliceheight = a->sliceheight; 2246 PetscCall(PetscMalloc1(c->sliceheight * totalslices, &c->rlen)); 2247 PetscCall(PetscMalloc1(totalslices + 1, &c->sliidx)); 2248 2249 for (i = 0; i < m; i++) c->rlen[i] = a->rlen[i]; 2250 for (i = 0; i < totalslices + 1; i++) c->sliidx[i] = a->sliidx[i]; 2251 2252 /* allocate the matrix space */ 2253 if (mallocmatspace) { 2254 PetscCall(PetscMalloc2(a->maxallocmat, &c->val, a->maxallocmat, &c->colidx)); 2255 2256 c->singlemalloc = PETSC_TRUE; 2257 2258 if (m > 0) { 2259 PetscCall(PetscArraycpy(c->colidx, a->colidx, a->maxallocmat)); 2260 if (cpvalues == MAT_COPY_VALUES) { 2261 PetscCall(PetscArraycpy(c->val, a->val, a->maxallocmat)); 2262 } else { 2263 PetscCall(PetscArrayzero(c->val, a->maxallocmat)); 2264 } 2265 } 2266 } 2267 2268 c->ignorezeroentries = a->ignorezeroentries; 2269 c->roworiented = a->roworiented; 2270 c->nonew = a->nonew; 2271 if (a->diag) { 2272 PetscCall(PetscMalloc1(m, &c->diag)); 2273 for (i = 0; i < m; i++) c->diag[i] = a->diag[i]; 2274 } else c->diag = NULL; 2275 2276 c->solve_work = NULL; 2277 c->saved_values = NULL; 2278 c->idiag = NULL; 2279 c->ssor_work = NULL; 2280 c->keepnonzeropattern = a->keepnonzeropattern; 2281 c->free_val = PETSC_TRUE; 2282 c->free_colidx = PETSC_TRUE; 2283 2284 c->maxallocmat = a->maxallocmat; 2285 c->maxallocrow = a->maxallocrow; 2286 c->rlenmax = a->rlenmax; 2287 c->nz = a->nz; 2288 C->preallocated = PETSC_TRUE; 2289 2290 c->nonzerorowcnt = a->nonzerorowcnt; 2291 C->nonzerostate = A->nonzerostate; 2292 2293 PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist)); 2294 PetscFunctionReturn(PETSC_SUCCESS); 2295 } 2296 2297 PetscErrorCode MatDuplicate_SeqSELL(Mat A, MatDuplicateOption cpvalues, Mat *B) 2298 { 2299 PetscFunctionBegin; 2300 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 2301 PetscCall(MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n)); 2302 if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 2303 PetscCall(MatSetType(*B, ((PetscObject)A)->type_name)); 2304 PetscCall(MatDuplicateNoCreate_SeqSELL(*B, A, cpvalues, PETSC_TRUE)); 2305 PetscFunctionReturn(PETSC_SUCCESS); 2306 } 2307 2308 /*MC 2309 MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices, 2310 based on the sliced Ellpack format, {cite}`zhangellpack2018` 2311 2312 Options Database Key: 2313 . -mat_type seqsell - sets the matrix type to "`MATSEQELL` during a call to `MatSetFromOptions()` 2314 2315 Level: beginner 2316 2317 .seealso: `Mat`, `MatCreateSeqSELL()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ` 2318 M*/ 2319 2320 /*MC 2321 MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices, {cite}`zhangellpack2018` 2322 2323 This matrix type is identical to `MATSEQSELL` when constructed with a single process communicator, 2324 and `MATMPISELL` otherwise. As a result, for single process communicators, 2325 `MatSeqSELLSetPreallocation()` is supported, and similarly `MatMPISELLSetPreallocation()` is supported 2326 for communicators controlling multiple processes. It is recommended that you call both of 2327 the above preallocation routines for simplicity. 2328 2329 Options Database Key: 2330 . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions() 2331 2332 Level: beginner 2333 2334 Notes: 2335 This format is only supported for real scalars, double precision, and 32-bit indices (the defaults). 2336 2337 It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of 2338 non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement. 2339 2340 Developer Notes: 2341 On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance. 2342 2343 The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8 2344 .vb 2345 (2 0 3 4) 2346 Consider the matrix A = (5 0 6 0) 2347 (0 0 7 8) 2348 (0 0 9 9) 2349 2350 symbolically the Ellpack format can be written as 2351 2352 (2 3 4 |) (0 2 3 |) 2353 v = (5 6 0 |) colidx = (0 2 2 |) 2354 -------- --------- 2355 (7 8 |) (2 3 |) 2356 (9 9 |) (2 3 |) 2357 2358 The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case). 2359 Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and 2360 zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice. 2361 2362 The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9) and for colidx is (0 0 2 2 3 2 2 2 3 3) 2363 2364 .ve 2365 2366 See `MatMult_SeqSELL()` for how this format is used with the SIMD operations to achieve high performance. 2367 2368 .seealso: `Mat`, `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSELL()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ` 2369 M*/ 2370 2371 /*@ 2372 MatCreateSeqSELL - Creates a sparse matrix in `MATSEQSELL` format. 2373 2374 Collective 2375 2376 Input Parameters: 2377 + comm - MPI communicator, set to `PETSC_COMM_SELF` 2378 . m - number of rows 2379 . n - number of columns 2380 . rlenmax - maximum number of nonzeros in a row, ignored if `rlen` is provided 2381 - rlen - array containing the number of nonzeros in the various rows (possibly different for each row) or NULL 2382 2383 Output Parameter: 2384 . A - the matrix 2385 2386 Level: intermediate 2387 2388 Notes: 2389 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 2390 MatXXXXSetPreallocation() paradigm instead of this routine directly. 2391 [MatXXXXSetPreallocation() is, for example, `MatSeqSELLSetPreallocation()`] 2392 2393 Specify the preallocated storage with either `rlenmax` or `rlen` (not both). 2394 Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory 2395 allocation. 2396 2397 .seealso: `Mat`, `MATSEQSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATMPISELL` 2398 @*/ 2399 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt rlenmax, const PetscInt rlen[], Mat *A) 2400 { 2401 PetscFunctionBegin; 2402 PetscCall(MatCreate(comm, A)); 2403 PetscCall(MatSetSizes(*A, m, n, m, n)); 2404 PetscCall(MatSetType(*A, MATSEQSELL)); 2405 PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A, rlenmax, rlen)); 2406 PetscFunctionReturn(PETSC_SUCCESS); 2407 } 2408 2409 PetscErrorCode MatEqual_SeqSELL(Mat A, Mat B, PetscBool *flg) 2410 { 2411 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data, *b = (Mat_SeqSELL *)B->data; 2412 PetscInt totalslices = a->totalslices; 2413 2414 PetscFunctionBegin; 2415 /* If the matrix dimensions are not equal,or no of nonzeros */ 2416 if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) || (a->nz != b->nz) || (a->rlenmax != b->rlenmax)) { 2417 *flg = PETSC_FALSE; 2418 PetscFunctionReturn(PETSC_SUCCESS); 2419 } 2420 /* if the a->colidx are the same */ 2421 PetscCall(PetscArraycmp(a->colidx, b->colidx, a->sliidx[totalslices], flg)); 2422 if (!*flg) PetscFunctionReturn(PETSC_SUCCESS); 2423 /* if a->val are the same */ 2424 PetscCall(PetscArraycmp(a->val, b->val, a->sliidx[totalslices], flg)); 2425 PetscFunctionReturn(PETSC_SUCCESS); 2426 } 2427 2428 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A) 2429 { 2430 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 2431 2432 PetscFunctionBegin; 2433 a->idiagvalid = PETSC_FALSE; 2434 PetscFunctionReturn(PETSC_SUCCESS); 2435 } 2436 2437 PetscErrorCode MatConjugate_SeqSELL(Mat A) 2438 { 2439 #if defined(PETSC_USE_COMPLEX) 2440 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 2441 PetscInt i; 2442 PetscScalar *val = a->val; 2443 2444 PetscFunctionBegin; 2445 for (i = 0; i < a->sliidx[a->totalslices]; i++) { val[i] = PetscConj(val[i]); } 2446 #if defined(PETSC_HAVE_CUPM) 2447 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU; 2448 #endif 2449 #else 2450 PetscFunctionBegin; 2451 #endif 2452 PetscFunctionReturn(PETSC_SUCCESS); 2453 } 2454