1 /* 2 Defines the basic matrix operations for the SELL matrix storage format. 3 */ 4 #include <../src/mat/impls/sell/seq/sell.h> /*I "petscmat.h" I*/ 5 #include <petscblaslapack.h> 6 #include <petsc/private/kernels/blocktranspose.h> 7 8 static PetscBool cited = PETSC_FALSE; 9 static const char citation[] = "@inproceedings{ZhangELLPACK2018,\n" 10 " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n" 11 " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n" 12 " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n" 13 " year = 2018\n" 14 "}\n"; 15 16 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 17 18 #include <immintrin.h> 19 20 #if !defined(_MM_SCALE_8) 21 #define _MM_SCALE_8 8 22 #endif 23 24 #if defined(__AVX512F__) 25 /* these do not work 26 vec_idx = _mm512_loadunpackhi_epi32(vec_idx,acolidx); 27 vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval); 28 */ 29 #define AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \ 30 /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \ 31 vec_idx = _mm256_loadu_si256((__m256i const *)acolidx); \ 32 vec_vals = _mm512_loadu_pd(aval); \ 33 vec_x = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); \ 34 vec_y = _mm512_fmadd_pd(vec_x, vec_vals, vec_y) 35 #elif defined(__AVX2__) && defined(__FMA__) 36 #define AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \ 37 vec_vals = _mm256_loadu_pd(aval); \ 38 vec_idx = _mm_loadu_si128((__m128i const *)acolidx); /* SSE2 */ \ 39 vec_x = _mm256_i32gather_pd(x, vec_idx, _MM_SCALE_8); \ 40 vec_y = _mm256_fmadd_pd(vec_x, vec_vals, vec_y) 41 #endif 42 #endif /* PETSC_HAVE_IMMINTRIN_H */ 43 44 /*@C 45 MatSeqSELLSetPreallocation - For good matrix assembly performance 46 the user should preallocate the matrix storage by setting the parameter `nz` 47 (or the array `nnz`). 48 49 Collective 50 51 Input Parameters: 52 + B - The `MATSEQSELL` matrix 53 . rlenmax - number of nonzeros per row (same for all rows), ignored if `rlen` is provided 54 - rlen - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 55 56 Level: intermediate 57 58 Notes: 59 Specify the preallocated storage with either `rlenmax` or `rlen` (not both). 60 Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory 61 allocation. 62 63 You can call `MatGetInfo()` to get information on how effective the preallocation was; 64 for example the fields mallocs,nz_allocated,nz_used,nz_unneeded; 65 You can also run with the option `-info` and look for messages with the string 66 malloc in them to see if additional memory allocation was needed. 67 68 Developer Notes: 69 Use `rlenmax` of `MAT_SKIP_ALLOCATION` to not allocate any space for the matrix 70 entries or columns indices. 71 72 The maximum number of nonzeos in any row should be as accurate as possible. 73 If it is underestimated, you will get bad performance due to reallocation 74 (`MatSeqXSELLReallocateSELL()`). 75 76 .seealso: `Mat`, `MATSEQSELL`, `MATSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()` 77 @*/ 78 PetscErrorCode MatSeqSELLSetPreallocation(Mat B, PetscInt rlenmax, const PetscInt rlen[]) 79 { 80 PetscFunctionBegin; 81 PetscValidHeaderSpecific(B, MAT_CLASSID, 1); 82 PetscValidType(B, 1); 83 PetscTryMethod(B, "MatSeqSELLSetPreallocation_C", (Mat, PetscInt, const PetscInt[]), (B, rlenmax, rlen)); 84 PetscFunctionReturn(PETSC_SUCCESS); 85 } 86 87 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B, PetscInt maxallocrow, const PetscInt rlen[]) 88 { 89 Mat_SeqSELL *b; 90 PetscInt i, j, totalslices; 91 #if defined(PETSC_HAVE_CUDA) 92 PetscInt rlenmax = 0; 93 #endif 94 PetscBool skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE; 95 96 PetscFunctionBegin; 97 if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE; 98 if (maxallocrow == MAT_SKIP_ALLOCATION) { 99 skipallocation = PETSC_TRUE; 100 maxallocrow = 0; 101 } 102 103 PetscCall(PetscLayoutSetUp(B->rmap)); 104 PetscCall(PetscLayoutSetUp(B->cmap)); 105 106 /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */ 107 if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5; 108 PetscCheck(maxallocrow >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "maxallocrow cannot be less than 0: value %" PetscInt_FMT, maxallocrow); 109 if (rlen) { 110 for (i = 0; i < B->rmap->n; i++) { 111 PetscCheck(rlen[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, rlen[i]); 112 PetscCheck(rlen[i] <= B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT, i, rlen[i], B->cmap->n); 113 } 114 } 115 116 B->preallocated = PETSC_TRUE; 117 118 b = (Mat_SeqSELL *)B->data; 119 120 if (!b->sliceheight) { /* not set yet */ 121 #if defined(PETSC_HAVE_CUDA) 122 b->sliceheight = 16; 123 #else 124 b->sliceheight = 8; 125 #endif 126 } 127 totalslices = PetscCeilInt(B->rmap->n, b->sliceheight); 128 b->totalslices = totalslices; 129 if (!skipallocation) { 130 if (B->rmap->n % b->sliceheight) PetscCall(PetscInfo(B, "Padding rows to the SEQSELL matrix because the number of rows is not the multiple of the slice height (value %" PetscInt_FMT ")\n", B->rmap->n)); 131 132 if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */ 133 PetscCall(PetscMalloc1(totalslices + 1, &b->sliidx)); 134 } 135 if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */ 136 if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10; 137 else if (maxallocrow < 0) maxallocrow = 1; 138 #if defined(PETSC_HAVE_CUDA) 139 rlenmax = maxallocrow; 140 /* Pad the slice to DEVICE_MEM_ALIGN */ 141 while (b->sliceheight * maxallocrow % DEVICE_MEM_ALIGN) maxallocrow++; 142 #endif 143 for (i = 0; i <= totalslices; i++) b->sliidx[i] = b->sliceheight * i * maxallocrow; 144 } else { 145 #if defined(PETSC_HAVE_CUDA) 146 PetscInt mul = DEVICE_MEM_ALIGN / b->sliceheight; 147 #endif 148 maxallocrow = 0; 149 b->sliidx[0] = 0; 150 for (i = 1; i < totalslices; i++) { 151 b->sliidx[i] = 0; 152 for (j = 0; j < b->sliceheight; j++) { b->sliidx[i] = PetscMax(b->sliidx[i], rlen[b->sliceheight * (i - 1) + j]); } 153 #if defined(PETSC_HAVE_CUDA) 154 rlenmax = PetscMax(b->sliidx[i], rlenmax); 155 /* Pad the slice to DEVICE_MEM_ALIGN */ 156 b->sliidx[i] = ((b->sliidx[i] - 1) / mul + 1) * mul; 157 #endif 158 maxallocrow = PetscMax(b->sliidx[i], maxallocrow); 159 PetscCall(PetscIntSumError(b->sliidx[i - 1], b->sliceheight * b->sliidx[i], &b->sliidx[i])); 160 } 161 /* last slice */ 162 b->sliidx[totalslices] = 0; 163 for (j = b->sliceheight * (totalslices - 1); j < B->rmap->n; j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices], rlen[j]); 164 #if defined(PETSC_HAVE_CUDA) 165 rlenmax = PetscMax(b->sliidx[i], rlenmax); 166 b->sliidx[totalslices] = ((b->sliidx[totalslices] - 1) / mul + 1) * mul; 167 #endif 168 maxallocrow = PetscMax(b->sliidx[totalslices], maxallocrow); 169 b->sliidx[totalslices] = b->sliidx[totalslices - 1] + b->sliceheight * b->sliidx[totalslices]; 170 } 171 172 /* allocate space for val, colidx, rlen */ 173 /* FIXME: should B's old memory be unlogged? */ 174 PetscCall(MatSeqXSELLFreeSELL(B, &b->val, &b->colidx)); 175 /* FIXME: assuming an element of the bit array takes 8 bits */ 176 PetscCall(PetscMalloc2(b->sliidx[totalslices], &b->val, b->sliidx[totalslices], &b->colidx)); 177 /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */ 178 PetscCall(PetscCalloc1(b->sliceheight * totalslices, &b->rlen)); 179 180 b->singlemalloc = PETSC_TRUE; 181 b->free_val = PETSC_TRUE; 182 b->free_colidx = PETSC_TRUE; 183 } else { 184 b->free_val = PETSC_FALSE; 185 b->free_colidx = PETSC_FALSE; 186 } 187 188 b->nz = 0; 189 b->maxallocrow = maxallocrow; 190 #if defined(PETSC_HAVE_CUDA) 191 b->rlenmax = rlenmax; 192 #else 193 b->rlenmax = maxallocrow; 194 #endif 195 b->maxallocmat = b->sliidx[totalslices]; 196 B->info.nz_unneeded = (double)b->maxallocmat; 197 if (realalloc) PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE)); 198 PetscFunctionReturn(PETSC_SUCCESS); 199 } 200 201 static PetscErrorCode MatGetRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v) 202 { 203 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 204 PetscInt shift; 205 206 PetscFunctionBegin; 207 PetscCheck(row >= 0 && row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range", row); 208 if (nz) *nz = a->rlen[row]; 209 shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight); 210 if (!a->getrowcols) { PetscCall(PetscMalloc2(a->rlenmax, &a->getrowcols, a->rlenmax, &a->getrowvals)); } 211 if (idx) { 212 PetscInt j; 213 for (j = 0; j < a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift + a->sliceheight * j]; 214 *idx = a->getrowcols; 215 } 216 if (v) { 217 PetscInt j; 218 for (j = 0; j < a->rlen[row]; j++) a->getrowvals[j] = a->val[shift + a->sliceheight * j]; 219 *v = a->getrowvals; 220 } 221 PetscFunctionReturn(PETSC_SUCCESS); 222 } 223 224 static PetscErrorCode MatRestoreRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v) 225 { 226 PetscFunctionBegin; 227 PetscFunctionReturn(PETSC_SUCCESS); 228 } 229 230 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat) 231 { 232 Mat B; 233 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 234 PetscInt i; 235 236 PetscFunctionBegin; 237 if (reuse == MAT_REUSE_MATRIX) { 238 B = *newmat; 239 PetscCall(MatZeroEntries(B)); 240 } else { 241 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B)); 242 PetscCall(MatSetSizes(B, A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N)); 243 PetscCall(MatSetType(B, MATSEQAIJ)); 244 PetscCall(MatSeqAIJSetPreallocation(B, 0, a->rlen)); 245 } 246 247 for (i = 0; i < A->rmap->n; i++) { 248 PetscInt nz = 0, *cols = NULL; 249 PetscScalar *vals = NULL; 250 251 PetscCall(MatGetRow_SeqSELL(A, i, &nz, &cols, &vals)); 252 PetscCall(MatSetValues(B, 1, &i, nz, cols, vals, INSERT_VALUES)); 253 PetscCall(MatRestoreRow_SeqSELL(A, i, &nz, &cols, &vals)); 254 } 255 256 PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY)); 257 PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY)); 258 B->rmap->bs = A->rmap->bs; 259 260 if (reuse == MAT_INPLACE_MATRIX) { 261 PetscCall(MatHeaderReplace(A, &B)); 262 } else { 263 *newmat = B; 264 } 265 PetscFunctionReturn(PETSC_SUCCESS); 266 } 267 268 #include <../src/mat/impls/aij/seq/aij.h> 269 270 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A, MatType newtype, MatReuse reuse, Mat *newmat) 271 { 272 Mat B; 273 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 274 PetscInt *ai = a->i, m = A->rmap->N, n = A->cmap->N, i, *rowlengths, row, ncols; 275 const PetscInt *cols; 276 const PetscScalar *vals; 277 278 PetscFunctionBegin; 279 280 if (reuse == MAT_REUSE_MATRIX) { 281 B = *newmat; 282 } else { 283 if (PetscDefined(USE_DEBUG) || !a->ilen) { 284 PetscCall(PetscMalloc1(m, &rowlengths)); 285 for (i = 0; i < m; i++) rowlengths[i] = ai[i + 1] - ai[i]; 286 } 287 if (PetscDefined(USE_DEBUG) && a->ilen) { 288 PetscBool eq; 289 PetscCall(PetscMemcmp(rowlengths, a->ilen, m * sizeof(PetscInt), &eq)); 290 PetscCheck(eq, PETSC_COMM_SELF, PETSC_ERR_PLIB, "SeqAIJ ilen array incorrect"); 291 PetscCall(PetscFree(rowlengths)); 292 rowlengths = a->ilen; 293 } else if (a->ilen) rowlengths = a->ilen; 294 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B)); 295 PetscCall(MatSetSizes(B, m, n, m, n)); 296 PetscCall(MatSetType(B, MATSEQSELL)); 297 PetscCall(MatSeqSELLSetPreallocation(B, 0, rowlengths)); 298 if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths)); 299 } 300 301 for (row = 0; row < m; row++) { 302 PetscCall(MatGetRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals)); 303 PetscCall(MatSetValues_SeqSELL(B, 1, &row, ncols, cols, vals, INSERT_VALUES)); 304 PetscCall(MatRestoreRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals)); 305 } 306 PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY)); 307 PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY)); 308 B->rmap->bs = A->rmap->bs; 309 310 if (reuse == MAT_INPLACE_MATRIX) { 311 PetscCall(MatHeaderReplace(A, &B)); 312 } else { 313 *newmat = B; 314 } 315 PetscFunctionReturn(PETSC_SUCCESS); 316 } 317 318 PetscErrorCode MatMult_SeqSELL(Mat A, Vec xx, Vec yy) 319 { 320 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 321 PetscScalar *y; 322 const PetscScalar *x; 323 const MatScalar *aval = a->val; 324 PetscInt totalslices = a->totalslices; 325 const PetscInt *acolidx = a->colidx; 326 PetscInt i, j; 327 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 328 __m512d vec_x, vec_y, vec_vals; 329 __m256i vec_idx; 330 __mmask8 mask; 331 __m512d vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4; 332 __m256i vec_idx2, vec_idx3, vec_idx4; 333 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 334 __m128i vec_idx; 335 __m256d vec_x, vec_y, vec_y2, vec_vals; 336 MatScalar yval; 337 PetscInt r, rows_left, row, nnz_in_row; 338 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 339 __m128d vec_x_tmp; 340 __m256d vec_x, vec_y, vec_y2, vec_vals; 341 MatScalar yval; 342 PetscInt r, rows_left, row, nnz_in_row; 343 #else 344 PetscInt k, sliceheight = a->sliceheight; 345 PetscScalar *sum; 346 #endif 347 348 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 349 #pragma disjoint(*x, *y, *aval) 350 #endif 351 352 PetscFunctionBegin; 353 PetscCall(VecGetArrayRead(xx, &x)); 354 PetscCall(VecGetArray(yy, &y)); 355 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 356 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 357 for (i = 0; i < totalslices; i++) { /* loop over slices */ 358 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 359 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 360 361 vec_y = _mm512_setzero_pd(); 362 vec_y2 = _mm512_setzero_pd(); 363 vec_y3 = _mm512_setzero_pd(); 364 vec_y4 = _mm512_setzero_pd(); 365 366 j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */ 367 switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) { 368 case 3: 369 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 370 acolidx += 8; 371 aval += 8; 372 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 373 acolidx += 8; 374 aval += 8; 375 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 376 acolidx += 8; 377 aval += 8; 378 j += 3; 379 break; 380 case 2: 381 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 382 acolidx += 8; 383 aval += 8; 384 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 385 acolidx += 8; 386 aval += 8; 387 j += 2; 388 break; 389 case 1: 390 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 391 acolidx += 8; 392 aval += 8; 393 j += 1; 394 break; 395 } 396 #pragma novector 397 for (; j < (a->sliidx[i + 1] >> 3); j += 4) { 398 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 399 acolidx += 8; 400 aval += 8; 401 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 402 acolidx += 8; 403 aval += 8; 404 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 405 acolidx += 8; 406 aval += 8; 407 AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4); 408 acolidx += 8; 409 aval += 8; 410 } 411 412 vec_y = _mm512_add_pd(vec_y, vec_y2); 413 vec_y = _mm512_add_pd(vec_y, vec_y3); 414 vec_y = _mm512_add_pd(vec_y, vec_y4); 415 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 416 mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07))); 417 _mm512_mask_storeu_pd(&y[8 * i], mask, vec_y); 418 } else { 419 _mm512_storeu_pd(&y[8 * i], vec_y); 420 } 421 } 422 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 423 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 424 for (i = 0; i < totalslices; i++) { /* loop over full slices */ 425 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 426 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 427 428 /* last slice may have padding rows. Don't use vectorization. */ 429 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 430 rows_left = A->rmap->n - 8 * i; 431 for (r = 0; r < rows_left; ++r) { 432 yval = (MatScalar)0; 433 row = 8 * i + r; 434 nnz_in_row = a->rlen[row]; 435 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]]; 436 y[row] = yval; 437 } 438 break; 439 } 440 441 vec_y = _mm256_setzero_pd(); 442 vec_y2 = _mm256_setzero_pd(); 443 444 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 445 #pragma novector 446 #pragma unroll(2) 447 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 448 AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 449 aval += 4; 450 acolidx += 4; 451 AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y2); 452 aval += 4; 453 acolidx += 4; 454 } 455 456 _mm256_storeu_pd(y + i * 8, vec_y); 457 _mm256_storeu_pd(y + i * 8 + 4, vec_y2); 458 } 459 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 460 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 461 for (i = 0; i < totalslices; i++) { /* loop over full slices */ 462 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 463 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 464 465 vec_y = _mm256_setzero_pd(); 466 vec_y2 = _mm256_setzero_pd(); 467 468 /* last slice may have padding rows. Don't use vectorization. */ 469 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 470 rows_left = A->rmap->n - 8 * i; 471 for (r = 0; r < rows_left; ++r) { 472 yval = (MatScalar)0; 473 row = 8 * i + r; 474 nnz_in_row = a->rlen[row]; 475 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]]; 476 y[row] = yval; 477 } 478 break; 479 } 480 481 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 482 #pragma novector 483 #pragma unroll(2) 484 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 485 vec_vals = _mm256_loadu_pd(aval); 486 vec_x_tmp = _mm_setzero_pd(); 487 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 488 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 489 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 490 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 491 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 492 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 493 vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y); 494 aval += 4; 495 496 vec_vals = _mm256_loadu_pd(aval); 497 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 498 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 499 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 500 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 501 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 502 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 503 vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2); 504 aval += 4; 505 } 506 507 _mm256_storeu_pd(y + i * 8, vec_y); 508 _mm256_storeu_pd(y + i * 8 + 4, vec_y2); 509 } 510 #else 511 PetscCall(PetscMalloc1(sliceheight, &sum)); 512 for (i = 0; i < totalslices; i++) { /* loop over slices */ 513 for (j = 0; j < sliceheight; j++) { 514 sum[j] = 0.0; 515 for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]]; 516 } 517 if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { /* if last slice has padding rows */ 518 for (j = 0; j < (A->rmap->n % sliceheight); j++) y[sliceheight * i + j] = sum[j]; 519 } else { 520 for (j = 0; j < sliceheight; j++) y[sliceheight * i + j] = sum[j]; 521 } 522 } 523 PetscCall(PetscFree(sum)); 524 #endif 525 526 PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); /* theoretical minimal FLOPs */ 527 PetscCall(VecRestoreArrayRead(xx, &x)); 528 PetscCall(VecRestoreArray(yy, &y)); 529 PetscFunctionReturn(PETSC_SUCCESS); 530 } 531 532 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h> 533 PetscErrorCode MatMultAdd_SeqSELL(Mat A, Vec xx, Vec yy, Vec zz) 534 { 535 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 536 PetscScalar *y, *z; 537 const PetscScalar *x; 538 const MatScalar *aval = a->val; 539 PetscInt totalslices = a->totalslices; 540 const PetscInt *acolidx = a->colidx; 541 PetscInt i, j; 542 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 543 __m512d vec_x, vec_y, vec_vals; 544 __m256i vec_idx; 545 __mmask8 mask = 0; 546 __m512d vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4; 547 __m256i vec_idx2, vec_idx3, vec_idx4; 548 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 549 __m128d vec_x_tmp; 550 __m256d vec_x, vec_y, vec_y2, vec_vals; 551 MatScalar yval; 552 PetscInt r, row, nnz_in_row; 553 #else 554 PetscInt k, sliceheight = a->sliceheight; 555 PetscScalar *sum; 556 #endif 557 558 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 559 #pragma disjoint(*x, *y, *aval) 560 #endif 561 562 PetscFunctionBegin; 563 if (!a->nz) { 564 PetscCall(VecCopy(yy, zz)); 565 PetscFunctionReturn(PETSC_SUCCESS); 566 } 567 PetscCall(VecGetArrayRead(xx, &x)); 568 PetscCall(VecGetArrayPair(yy, zz, &y, &z)); 569 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 570 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 571 for (i = 0; i < totalslices; i++) { /* loop over slices */ 572 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 573 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 574 575 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 576 mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07))); 577 vec_y = _mm512_mask_loadu_pd(vec_y, mask, &y[8 * i]); 578 } else { 579 vec_y = _mm512_loadu_pd(&y[8 * i]); 580 } 581 vec_y2 = _mm512_setzero_pd(); 582 vec_y3 = _mm512_setzero_pd(); 583 vec_y4 = _mm512_setzero_pd(); 584 585 j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */ 586 switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) { 587 case 3: 588 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 589 acolidx += 8; 590 aval += 8; 591 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 592 acolidx += 8; 593 aval += 8; 594 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 595 acolidx += 8; 596 aval += 8; 597 j += 3; 598 break; 599 case 2: 600 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 601 acolidx += 8; 602 aval += 8; 603 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 604 acolidx += 8; 605 aval += 8; 606 j += 2; 607 break; 608 case 1: 609 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 610 acolidx += 8; 611 aval += 8; 612 j += 1; 613 break; 614 } 615 #pragma novector 616 for (; j < (a->sliidx[i + 1] >> 3); j += 4) { 617 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 618 acolidx += 8; 619 aval += 8; 620 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 621 acolidx += 8; 622 aval += 8; 623 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 624 acolidx += 8; 625 aval += 8; 626 AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4); 627 acolidx += 8; 628 aval += 8; 629 } 630 631 vec_y = _mm512_add_pd(vec_y, vec_y2); 632 vec_y = _mm512_add_pd(vec_y, vec_y3); 633 vec_y = _mm512_add_pd(vec_y, vec_y4); 634 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 635 _mm512_mask_storeu_pd(&z[8 * i], mask, vec_y); 636 } else { 637 _mm512_storeu_pd(&z[8 * i], vec_y); 638 } 639 } 640 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 641 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 642 for (i = 0; i < totalslices; i++) { /* loop over full slices */ 643 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 644 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 645 646 /* last slice may have padding rows. Don't use vectorization. */ 647 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 648 for (r = 0; r < (A->rmap->n & 0x07); ++r) { 649 row = 8 * i + r; 650 yval = (MatScalar)0.0; 651 nnz_in_row = a->rlen[row]; 652 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]]; 653 z[row] = y[row] + yval; 654 } 655 break; 656 } 657 658 vec_y = _mm256_loadu_pd(y + 8 * i); 659 vec_y2 = _mm256_loadu_pd(y + 8 * i + 4); 660 661 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 662 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 663 vec_vals = _mm256_loadu_pd(aval); 664 vec_x_tmp = _mm_setzero_pd(); 665 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 666 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 667 vec_x = _mm256_setzero_pd(); 668 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 669 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 670 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 671 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 672 vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y); 673 aval += 4; 674 675 vec_vals = _mm256_loadu_pd(aval); 676 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 677 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 678 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 679 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 680 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 681 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 682 vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2); 683 aval += 4; 684 } 685 686 _mm256_storeu_pd(z + i * 8, vec_y); 687 _mm256_storeu_pd(z + i * 8 + 4, vec_y2); 688 } 689 #else 690 PetscCall(PetscMalloc1(sliceheight, &sum)); 691 for (i = 0; i < totalslices; i++) { /* loop over slices */ 692 for (j = 0; j < sliceheight; j++) { 693 sum[j] = 0.0; 694 for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]]; 695 } 696 if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { 697 for (j = 0; j < (A->rmap->n % sliceheight); j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j]; 698 } else { 699 for (j = 0; j < sliceheight; j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j]; 700 } 701 } 702 PetscCall(PetscFree(sum)); 703 #endif 704 705 PetscCall(PetscLogFlops(2.0 * a->nz)); 706 PetscCall(VecRestoreArrayRead(xx, &x)); 707 PetscCall(VecRestoreArrayPair(yy, zz, &y, &z)); 708 PetscFunctionReturn(PETSC_SUCCESS); 709 } 710 711 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A, Vec xx, Vec zz, Vec yy) 712 { 713 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 714 PetscScalar *y; 715 const PetscScalar *x; 716 const MatScalar *aval = a->val; 717 const PetscInt *acolidx = a->colidx; 718 PetscInt i, j, r, row, nnz_in_row, totalslices = a->totalslices, sliceheight = a->sliceheight; 719 720 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 721 #pragma disjoint(*x, *y, *aval) 722 #endif 723 724 PetscFunctionBegin; 725 if (A->symmetric == PETSC_BOOL3_TRUE) { 726 PetscCall(MatMultAdd_SeqSELL(A, xx, zz, yy)); 727 PetscFunctionReturn(PETSC_SUCCESS); 728 } 729 if (zz != yy) PetscCall(VecCopy(zz, yy)); 730 731 if (a->nz) { 732 PetscCall(VecGetArrayRead(xx, &x)); 733 PetscCall(VecGetArray(yy, &y)); 734 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 735 if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { 736 for (r = 0; r < (A->rmap->n % sliceheight); ++r) { 737 row = sliceheight * i + r; 738 nnz_in_row = a->rlen[row]; 739 for (j = 0; j < nnz_in_row; ++j) y[acolidx[sliceheight * j + r]] += aval[sliceheight * j + r] * x[row]; 740 } 741 break; 742 } 743 for (r = 0; r < sliceheight; ++r) 744 for (j = a->sliidx[i] + r; j < a->sliidx[i + 1]; j += sliceheight) y[acolidx[j]] += aval[j] * x[sliceheight * i + r]; 745 } 746 PetscCall(PetscLogFlops(2.0 * a->nz)); 747 PetscCall(VecRestoreArrayRead(xx, &x)); 748 PetscCall(VecRestoreArray(yy, &y)); 749 } 750 PetscFunctionReturn(PETSC_SUCCESS); 751 } 752 753 PetscErrorCode MatMultTranspose_SeqSELL(Mat A, Vec xx, Vec yy) 754 { 755 PetscFunctionBegin; 756 if (A->symmetric == PETSC_BOOL3_TRUE) { 757 PetscCall(MatMult_SeqSELL(A, xx, yy)); 758 } else { 759 PetscCall(VecSet(yy, 0.0)); 760 PetscCall(MatMultTransposeAdd_SeqSELL(A, xx, yy, yy)); 761 } 762 PetscFunctionReturn(PETSC_SUCCESS); 763 } 764 765 /* 766 Checks for missing diagonals 767 */ 768 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A, PetscBool *missing, PetscInt *d) 769 { 770 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 771 PetscInt *diag, i; 772 773 PetscFunctionBegin; 774 *missing = PETSC_FALSE; 775 if (A->rmap->n > 0 && !(a->colidx)) { 776 *missing = PETSC_TRUE; 777 if (d) *d = 0; 778 PetscCall(PetscInfo(A, "Matrix has no entries therefore is missing diagonal\n")); 779 } else { 780 diag = a->diag; 781 for (i = 0; i < A->rmap->n; i++) { 782 if (diag[i] == -1) { 783 *missing = PETSC_TRUE; 784 if (d) *d = i; 785 PetscCall(PetscInfo(A, "Matrix is missing diagonal number %" PetscInt_FMT "\n", i)); 786 break; 787 } 788 } 789 } 790 PetscFunctionReturn(PETSC_SUCCESS); 791 } 792 793 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A) 794 { 795 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 796 PetscInt i, j, m = A->rmap->n, shift; 797 798 PetscFunctionBegin; 799 if (!a->diag) { 800 PetscCall(PetscMalloc1(m, &a->diag)); 801 a->free_diag = PETSC_TRUE; 802 } 803 for (i = 0; i < m; i++) { /* loop over rows */ 804 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 805 a->diag[i] = -1; 806 for (j = 0; j < a->rlen[i]; j++) { 807 if (a->colidx[shift + a->sliceheight * j] == i) { 808 a->diag[i] = shift + a->sliceheight * j; 809 break; 810 } 811 } 812 } 813 PetscFunctionReturn(PETSC_SUCCESS); 814 } 815 816 /* 817 Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways 818 */ 819 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A, PetscScalar omega, PetscScalar fshift) 820 { 821 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 822 PetscInt i, *diag, m = A->rmap->n; 823 MatScalar *val = a->val; 824 PetscScalar *idiag, *mdiag; 825 826 PetscFunctionBegin; 827 if (a->idiagvalid) PetscFunctionReturn(PETSC_SUCCESS); 828 PetscCall(MatMarkDiagonal_SeqSELL(A)); 829 diag = a->diag; 830 if (!a->idiag) { 831 PetscCall(PetscMalloc3(m, &a->idiag, m, &a->mdiag, m, &a->ssor_work)); 832 val = a->val; 833 } 834 mdiag = a->mdiag; 835 idiag = a->idiag; 836 837 if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) { 838 for (i = 0; i < m; i++) { 839 mdiag[i] = val[diag[i]]; 840 if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */ 841 PetscCheck(PetscRealPart(fshift), PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Zero diagonal on row %" PetscInt_FMT, i); 842 PetscCall(PetscInfo(A, "Zero diagonal on row %" PetscInt_FMT "\n", i)); 843 A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 844 A->factorerror_zeropivot_value = 0.0; 845 A->factorerror_zeropivot_row = i; 846 } 847 idiag[i] = 1.0 / val[diag[i]]; 848 } 849 PetscCall(PetscLogFlops(m)); 850 } else { 851 for (i = 0; i < m; i++) { 852 mdiag[i] = val[diag[i]]; 853 idiag[i] = omega / (fshift + val[diag[i]]); 854 } 855 PetscCall(PetscLogFlops(2.0 * m)); 856 } 857 a->idiagvalid = PETSC_TRUE; 858 PetscFunctionReturn(PETSC_SUCCESS); 859 } 860 861 PetscErrorCode MatZeroEntries_SeqSELL(Mat A) 862 { 863 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 864 865 PetscFunctionBegin; 866 PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices])); 867 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 868 PetscFunctionReturn(PETSC_SUCCESS); 869 } 870 871 PetscErrorCode MatDestroy_SeqSELL(Mat A) 872 { 873 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 874 875 PetscFunctionBegin; 876 PetscCall(PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->n, A->cmap->n, a->nz)); 877 PetscCall(MatSeqXSELLFreeSELL(A, &a->val, &a->colidx)); 878 PetscCall(ISDestroy(&a->row)); 879 PetscCall(ISDestroy(&a->col)); 880 PetscCall(PetscFree(a->diag)); 881 PetscCall(PetscFree(a->rlen)); 882 PetscCall(PetscFree(a->sliidx)); 883 PetscCall(PetscFree3(a->idiag, a->mdiag, a->ssor_work)); 884 PetscCall(PetscFree(a->solve_work)); 885 PetscCall(ISDestroy(&a->icol)); 886 PetscCall(PetscFree(a->saved_values)); 887 PetscCall(PetscFree2(a->getrowcols, a->getrowvals)); 888 PetscCall(PetscFree(A->data)); 889 #if defined(PETSC_HAVE_CUDA) 890 PetscCall(PetscFree(a->chunk_slice_map)); 891 #endif 892 893 PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL)); 894 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL)); 895 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL)); 896 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetPreallocation_C", NULL)); 897 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetArray_C", NULL)); 898 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLRestoreArray_C", NULL)); 899 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqaij_C", NULL)); 900 #if defined(PETSC_HAVE_CUDA) 901 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqsellcuda_C", NULL)); 902 #endif 903 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetFillRatio_C", NULL)); 904 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetMaxSliceWidth_C", NULL)); 905 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetAvgSliceWidth_C", NULL)); 906 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetVarSliceSize_C", NULL)); 907 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetSliceHeight_C", NULL)); 908 PetscFunctionReturn(PETSC_SUCCESS); 909 } 910 911 PetscErrorCode MatSetOption_SeqSELL(Mat A, MatOption op, PetscBool flg) 912 { 913 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 914 915 PetscFunctionBegin; 916 switch (op) { 917 case MAT_ROW_ORIENTED: 918 a->roworiented = flg; 919 break; 920 case MAT_KEEP_NONZERO_PATTERN: 921 a->keepnonzeropattern = flg; 922 break; 923 case MAT_NEW_NONZERO_LOCATIONS: 924 a->nonew = (flg ? 0 : 1); 925 break; 926 case MAT_NEW_NONZERO_LOCATION_ERR: 927 a->nonew = (flg ? -1 : 0); 928 break; 929 case MAT_NEW_NONZERO_ALLOCATION_ERR: 930 a->nonew = (flg ? -2 : 0); 931 break; 932 case MAT_UNUSED_NONZERO_LOCATION_ERR: 933 a->nounused = (flg ? -1 : 0); 934 break; 935 case MAT_FORCE_DIAGONAL_ENTRIES: 936 case MAT_IGNORE_OFF_PROC_ENTRIES: 937 case MAT_USE_HASH_TABLE: 938 case MAT_SORTED_FULL: 939 PetscCall(PetscInfo(A, "Option %s ignored\n", MatOptions[op])); 940 break; 941 case MAT_SPD: 942 case MAT_SYMMETRIC: 943 case MAT_STRUCTURALLY_SYMMETRIC: 944 case MAT_HERMITIAN: 945 case MAT_SYMMETRY_ETERNAL: 946 case MAT_STRUCTURAL_SYMMETRY_ETERNAL: 947 case MAT_SPD_ETERNAL: 948 /* These options are handled directly by MatSetOption() */ 949 break; 950 default: 951 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unknown option %d", op); 952 } 953 PetscFunctionReturn(PETSC_SUCCESS); 954 } 955 956 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A, Vec v) 957 { 958 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 959 PetscInt i, j, n, shift; 960 PetscScalar *x, zero = 0.0; 961 962 PetscFunctionBegin; 963 PetscCall(VecGetLocalSize(v, &n)); 964 PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 965 966 if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) { 967 PetscInt *diag = a->diag; 968 PetscCall(VecGetArray(v, &x)); 969 for (i = 0; i < n; i++) x[i] = 1.0 / a->val[diag[i]]; 970 PetscCall(VecRestoreArray(v, &x)); 971 PetscFunctionReturn(PETSC_SUCCESS); 972 } 973 974 PetscCall(VecSet(v, zero)); 975 PetscCall(VecGetArray(v, &x)); 976 for (i = 0; i < n; i++) { /* loop over rows */ 977 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 978 x[i] = 0; 979 for (j = 0; j < a->rlen[i]; j++) { 980 if (a->colidx[shift + a->sliceheight * j] == i) { 981 x[i] = a->val[shift + a->sliceheight * j]; 982 break; 983 } 984 } 985 } 986 PetscCall(VecRestoreArray(v, &x)); 987 PetscFunctionReturn(PETSC_SUCCESS); 988 } 989 990 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A, Vec ll, Vec rr) 991 { 992 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 993 const PetscScalar *l, *r; 994 PetscInt i, j, m, n, row; 995 996 PetscFunctionBegin; 997 if (ll) { 998 /* The local size is used so that VecMPI can be passed to this routine 999 by MatDiagonalScale_MPISELL */ 1000 PetscCall(VecGetLocalSize(ll, &m)); 1001 PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 1002 PetscCall(VecGetArrayRead(ll, &l)); 1003 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 1004 if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */ 1005 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) { 1006 if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= l[a->sliceheight * i + row]; 1007 } 1008 } else { 1009 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) { a->val[j] *= l[a->sliceheight * i + row]; } 1010 } 1011 } 1012 PetscCall(VecRestoreArrayRead(ll, &l)); 1013 PetscCall(PetscLogFlops(a->nz)); 1014 } 1015 if (rr) { 1016 PetscCall(VecGetLocalSize(rr, &n)); 1017 PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 1018 PetscCall(VecGetArrayRead(rr, &r)); 1019 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 1020 if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */ 1021 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) % a->sliceheight)) { 1022 if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= r[a->colidx[j]]; 1023 } 1024 } else { 1025 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j++) a->val[j] *= r[a->colidx[j]]; 1026 } 1027 } 1028 PetscCall(VecRestoreArrayRead(rr, &r)); 1029 PetscCall(PetscLogFlops(a->nz)); 1030 } 1031 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 1032 #if defined(PETSC_HAVE_CUDA) 1033 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU; 1034 #endif 1035 PetscFunctionReturn(PETSC_SUCCESS); 1036 } 1037 1038 PetscErrorCode MatGetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[]) 1039 { 1040 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1041 PetscInt *cp, i, k, low, high, t, row, col, l; 1042 PetscInt shift; 1043 MatScalar *vp; 1044 1045 PetscFunctionBegin; 1046 for (k = 0; k < m; k++) { /* loop over requested rows */ 1047 row = im[k]; 1048 if (row < 0) continue; 1049 PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1); 1050 shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight); /* starting index of the row */ 1051 cp = a->colidx + shift; /* pointer to the row */ 1052 vp = a->val + shift; /* pointer to the row */ 1053 for (l = 0; l < n; l++) { /* loop over requested columns */ 1054 col = in[l]; 1055 if (col < 0) continue; 1056 PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1); 1057 high = a->rlen[row]; 1058 low = 0; /* assume unsorted */ 1059 while (high - low > 5) { 1060 t = (low + high) / 2; 1061 if (*(cp + a->sliceheight * t) > col) high = t; 1062 else low = t; 1063 } 1064 for (i = low; i < high; i++) { 1065 if (*(cp + a->sliceheight * i) > col) break; 1066 if (*(cp + a->sliceheight * i) == col) { 1067 *v++ = *(vp + a->sliceheight * i); 1068 goto finished; 1069 } 1070 } 1071 *v++ = 0.0; 1072 finished:; 1073 } 1074 } 1075 PetscFunctionReturn(PETSC_SUCCESS); 1076 } 1077 1078 static PetscErrorCode MatView_SeqSELL_ASCII(Mat A, PetscViewer viewer) 1079 { 1080 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1081 PetscInt i, j, m = A->rmap->n, shift; 1082 const char *name; 1083 PetscViewerFormat format; 1084 1085 PetscFunctionBegin; 1086 PetscCall(PetscViewerGetFormat(viewer, &format)); 1087 if (format == PETSC_VIEWER_ASCII_MATLAB) { 1088 PetscInt nofinalvalue = 0; 1089 /* 1090 if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) { 1091 nofinalvalue = 1; 1092 } 1093 */ 1094 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1095 PetscCall(PetscViewerASCIIPrintf(viewer, "%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n", m, A->cmap->n)); 1096 PetscCall(PetscViewerASCIIPrintf(viewer, "%% Nonzeros = %" PetscInt_FMT " \n", a->nz)); 1097 #if defined(PETSC_USE_COMPLEX) 1098 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",4);\n", a->nz + nofinalvalue)); 1099 #else 1100 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",3);\n", a->nz + nofinalvalue)); 1101 #endif 1102 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = [\n")); 1103 1104 for (i = 0; i < m; i++) { 1105 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1106 for (j = 0; j < a->rlen[i]; j++) { 1107 #if defined(PETSC_USE_COMPLEX) 1108 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1109 #else 1110 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)a->val[shift + a->sliceheight * j])); 1111 #endif 1112 } 1113 } 1114 /* 1115 if (nofinalvalue) { 1116 #if defined(PETSC_USE_COMPLEX) 1117 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n",m,A->cmap->n,0.,0.)); 1118 #else 1119 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n",m,A->cmap->n,0.0)); 1120 #endif 1121 } 1122 */ 1123 PetscCall(PetscObjectGetName((PetscObject)A, &name)); 1124 PetscCall(PetscViewerASCIIPrintf(viewer, "];\n %s = spconvert(zzz);\n", name)); 1125 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1126 } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) { 1127 PetscFunctionReturn(PETSC_SUCCESS); 1128 } else if (format == PETSC_VIEWER_ASCII_COMMON) { 1129 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1130 for (i = 0; i < m; i++) { 1131 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1132 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1133 for (j = 0; j < a->rlen[i]; j++) { 1134 #if defined(PETSC_USE_COMPLEX) 1135 if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) { 1136 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1137 } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) { 1138 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1139 } else if (PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) { 1140 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]))); 1141 } 1142 #else 1143 if (a->val[shift + a->sliceheight * j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j])); 1144 #endif 1145 } 1146 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1147 } 1148 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1149 } else if (format == PETSC_VIEWER_ASCII_DENSE) { 1150 PetscInt cnt = 0, jcnt; 1151 PetscScalar value; 1152 #if defined(PETSC_USE_COMPLEX) 1153 PetscBool realonly = PETSC_TRUE; 1154 for (i = 0; i < a->sliidx[a->totalslices]; i++) { 1155 if (PetscImaginaryPart(a->val[i]) != 0.0) { 1156 realonly = PETSC_FALSE; 1157 break; 1158 } 1159 } 1160 #endif 1161 1162 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1163 for (i = 0; i < m; i++) { 1164 jcnt = 0; 1165 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1166 for (j = 0; j < A->cmap->n; j++) { 1167 if (jcnt < a->rlen[i] && j == a->colidx[shift + a->sliceheight * j]) { 1168 value = a->val[cnt++]; 1169 jcnt++; 1170 } else { 1171 value = 0.0; 1172 } 1173 #if defined(PETSC_USE_COMPLEX) 1174 if (realonly) { 1175 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)PetscRealPart(value))); 1176 } else { 1177 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e+%7.5e i ", (double)PetscRealPart(value), (double)PetscImaginaryPart(value))); 1178 } 1179 #else 1180 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)value)); 1181 #endif 1182 } 1183 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1184 } 1185 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1186 } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) { 1187 PetscInt fshift = 1; 1188 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1189 #if defined(PETSC_USE_COMPLEX) 1190 PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate complex general\n")); 1191 #else 1192 PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate real general\n")); 1193 #endif 1194 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz)); 1195 for (i = 0; i < m; i++) { 1196 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1197 for (j = 0; j < a->rlen[i]; j++) { 1198 #if defined(PETSC_USE_COMPLEX) 1199 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1200 #else 1201 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)a->val[shift + a->sliceheight * j])); 1202 #endif 1203 } 1204 } 1205 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1206 } else if (format == PETSC_VIEWER_NATIVE) { 1207 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 1208 PetscInt row; 1209 PetscCall(PetscViewerASCIIPrintf(viewer, "slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n", i, a->sliidx[i], a->sliidx[i + 1])); 1210 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) { 1211 #if defined(PETSC_USE_COMPLEX) 1212 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1213 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1214 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1215 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), -(double)PetscImaginaryPart(a->val[j]))); 1216 } else { 1217 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]))); 1218 } 1219 #else 1220 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)a->val[j])); 1221 #endif 1222 } 1223 } 1224 } else { 1225 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1226 if (A->factortype) { 1227 for (i = 0; i < m; i++) { 1228 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1229 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1230 /* L part */ 1231 for (j = shift; j < a->diag[i]; j += a->sliceheight) { 1232 #if defined(PETSC_USE_COMPLEX) 1233 if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0) { 1234 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1235 } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0) { 1236 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j])))); 1237 } else { 1238 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j]))); 1239 } 1240 #else 1241 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j])); 1242 #endif 1243 } 1244 /* diagonal */ 1245 j = a->diag[i]; 1246 #if defined(PETSC_USE_COMPLEX) 1247 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1248 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)PetscImaginaryPart(1.0 / a->val[j]))); 1249 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1250 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)(-PetscImaginaryPart(1.0 / a->val[j])))); 1251 } else { 1252 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]))); 1253 } 1254 #else 1255 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)(1.0 / a->val[j]))); 1256 #endif 1257 1258 /* U part */ 1259 for (j = a->diag[i] + 1; j < shift + a->sliceheight * a->rlen[i]; j += a->sliceheight) { 1260 #if defined(PETSC_USE_COMPLEX) 1261 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1262 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1263 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1264 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j])))); 1265 } else { 1266 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j]))); 1267 } 1268 #else 1269 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j])); 1270 #endif 1271 } 1272 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1273 } 1274 } else { 1275 for (i = 0; i < m; i++) { 1276 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1277 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1278 for (j = 0; j < a->rlen[i]; j++) { 1279 #if defined(PETSC_USE_COMPLEX) 1280 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1281 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1282 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1283 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1284 } else { 1285 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]))); 1286 } 1287 #else 1288 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j])); 1289 #endif 1290 } 1291 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1292 } 1293 } 1294 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1295 } 1296 PetscCall(PetscViewerFlush(viewer)); 1297 PetscFunctionReturn(PETSC_SUCCESS); 1298 } 1299 1300 #include <petscdraw.h> 1301 static PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw, void *Aa) 1302 { 1303 Mat A = (Mat)Aa; 1304 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1305 PetscInt i, j, m = A->rmap->n, shift; 1306 int color; 1307 PetscReal xl, yl, xr, yr, x_l, x_r, y_l, y_r; 1308 PetscViewer viewer; 1309 PetscViewerFormat format; 1310 1311 PetscFunctionBegin; 1312 PetscCall(PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer)); 1313 PetscCall(PetscViewerGetFormat(viewer, &format)); 1314 PetscCall(PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr)); 1315 1316 /* loop over matrix elements drawing boxes */ 1317 1318 if (format != PETSC_VIEWER_DRAW_CONTOUR) { 1319 PetscDrawCollectiveBegin(draw); 1320 /* Blue for negative, Cyan for zero and Red for positive */ 1321 color = PETSC_DRAW_BLUE; 1322 for (i = 0; i < m; i++) { 1323 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1324 y_l = m - i - 1.0; 1325 y_r = y_l + 1.0; 1326 for (j = 0; j < a->rlen[i]; j++) { 1327 x_l = a->colidx[shift + a->sliceheight * j]; 1328 x_r = x_l + 1.0; 1329 if (PetscRealPart(a->val[shift + a->sliceheight * j]) >= 0.) continue; 1330 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1331 } 1332 } 1333 color = PETSC_DRAW_CYAN; 1334 for (i = 0; i < m; i++) { 1335 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1336 y_l = m - i - 1.0; 1337 y_r = y_l + 1.0; 1338 for (j = 0; j < a->rlen[i]; j++) { 1339 x_l = a->colidx[shift + a->sliceheight * j]; 1340 x_r = x_l + 1.0; 1341 if (a->val[shift + a->sliceheight * j] != 0.) continue; 1342 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1343 } 1344 } 1345 color = PETSC_DRAW_RED; 1346 for (i = 0; i < m; i++) { 1347 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1348 y_l = m - i - 1.0; 1349 y_r = y_l + 1.0; 1350 for (j = 0; j < a->rlen[i]; j++) { 1351 x_l = a->colidx[shift + a->sliceheight * j]; 1352 x_r = x_l + 1.0; 1353 if (PetscRealPart(a->val[shift + a->sliceheight * j]) <= 0.) continue; 1354 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1355 } 1356 } 1357 PetscDrawCollectiveEnd(draw); 1358 } else { 1359 /* use contour shading to indicate magnitude of values */ 1360 /* first determine max of all nonzero values */ 1361 PetscReal minv = 0.0, maxv = 0.0; 1362 PetscInt count = 0; 1363 PetscDraw popup; 1364 for (i = 0; i < a->sliidx[a->totalslices]; i++) { 1365 if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]); 1366 } 1367 if (minv >= maxv) maxv = minv + PETSC_SMALL; 1368 PetscCall(PetscDrawGetPopup(draw, &popup)); 1369 PetscCall(PetscDrawScalePopup(popup, minv, maxv)); 1370 1371 PetscDrawCollectiveBegin(draw); 1372 for (i = 0; i < m; i++) { 1373 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1374 y_l = m - i - 1.0; 1375 y_r = y_l + 1.0; 1376 for (j = 0; j < a->rlen[i]; j++) { 1377 x_l = a->colidx[shift + a->sliceheight * j]; 1378 x_r = x_l + 1.0; 1379 color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]), minv, maxv); 1380 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1381 count++; 1382 } 1383 } 1384 PetscDrawCollectiveEnd(draw); 1385 } 1386 PetscFunctionReturn(PETSC_SUCCESS); 1387 } 1388 1389 #include <petscdraw.h> 1390 static PetscErrorCode MatView_SeqSELL_Draw(Mat A, PetscViewer viewer) 1391 { 1392 PetscDraw draw; 1393 PetscReal xr, yr, xl, yl, h, w; 1394 PetscBool isnull; 1395 1396 PetscFunctionBegin; 1397 PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw)); 1398 PetscCall(PetscDrawIsNull(draw, &isnull)); 1399 if (isnull) PetscFunctionReturn(PETSC_SUCCESS); 1400 1401 xr = A->cmap->n; 1402 yr = A->rmap->n; 1403 h = yr / 10.0; 1404 w = xr / 10.0; 1405 xr += w; 1406 yr += h; 1407 xl = -w; 1408 yl = -h; 1409 PetscCall(PetscDrawSetCoordinates(draw, xl, yl, xr, yr)); 1410 PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer)); 1411 PetscCall(PetscDrawZoom(draw, MatView_SeqSELL_Draw_Zoom, A)); 1412 PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL)); 1413 PetscCall(PetscDrawSave(draw)); 1414 PetscFunctionReturn(PETSC_SUCCESS); 1415 } 1416 1417 PetscErrorCode MatView_SeqSELL(Mat A, PetscViewer viewer) 1418 { 1419 PetscBool iascii, isbinary, isdraw; 1420 1421 PetscFunctionBegin; 1422 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii)); 1423 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary)); 1424 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw)); 1425 if (iascii) { 1426 PetscCall(MatView_SeqSELL_ASCII(A, viewer)); 1427 } else if (isbinary) { 1428 /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */ 1429 } else if (isdraw) PetscCall(MatView_SeqSELL_Draw(A, viewer)); 1430 PetscFunctionReturn(PETSC_SUCCESS); 1431 } 1432 1433 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A, MatAssemblyType mode) 1434 { 1435 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1436 PetscInt i, shift, row_in_slice, row, nrow, *cp, lastcol, j, k; 1437 MatScalar *vp; 1438 #if defined(PETSC_HAVE_CUDA) 1439 PetscInt totalchunks = 0; 1440 #endif 1441 1442 PetscFunctionBegin; 1443 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 1444 /* To do: compress out the unused elements */ 1445 PetscCall(MatMarkDiagonal_SeqSELL(A)); 1446 PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n", A->rmap->n, A->cmap->n, a->maxallocmat, a->sliidx[a->totalslices], a->nz, a->sliidx[a->totalslices] - a->nz)); 1447 PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n", a->reallocs)); 1448 PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rlenmax)); 1449 a->nonzerorowcnt = 0; 1450 /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */ 1451 for (i = 0; i < a->totalslices; ++i) { 1452 shift = a->sliidx[i]; /* starting index of the slice */ 1453 cp = PetscSafePointerPlusOffset(a->colidx, shift); /* pointer to the column indices of the slice */ 1454 vp = PetscSafePointerPlusOffset(a->val, shift); /* pointer to the nonzero values of the slice */ 1455 for (row_in_slice = 0; row_in_slice < a->sliceheight; ++row_in_slice) { /* loop over rows in the slice */ 1456 row = a->sliceheight * i + row_in_slice; 1457 nrow = a->rlen[row]; /* number of nonzeros in row */ 1458 /* 1459 Search for the nearest nonzero. Normally setting the index to zero may cause extra communication. 1460 But if the entire slice are empty, it is fine to use 0 since the index will not be loaded. 1461 */ 1462 lastcol = 0; 1463 if (nrow > 0) { /* nonempty row */ 1464 a->nonzerorowcnt++; 1465 lastcol = cp[a->sliceheight * (nrow - 1) + row_in_slice]; /* use the index from the last nonzero at current row */ 1466 } else if (!row_in_slice) { /* first row of the correct slice is empty */ 1467 for (j = 1; j < a->sliceheight; j++) { 1468 if (a->rlen[a->sliceheight * i + j]) { 1469 lastcol = cp[j]; 1470 break; 1471 } 1472 } 1473 } else { 1474 if (a->sliidx[i + 1] != shift) lastcol = cp[row_in_slice - 1]; /* use the index from the previous row */ 1475 } 1476 1477 for (k = nrow; k < (a->sliidx[i + 1] - shift) / a->sliceheight; ++k) { 1478 cp[a->sliceheight * k + row_in_slice] = lastcol; 1479 vp[a->sliceheight * k + row_in_slice] = (MatScalar)0; 1480 } 1481 } 1482 } 1483 1484 A->info.mallocs += a->reallocs; 1485 a->reallocs = 0; 1486 1487 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 1488 #if defined(PETSC_HAVE_CUDA) 1489 if (!a->chunksize && a->totalslices) { 1490 a->chunksize = 64; 1491 while (a->chunksize < 1024 && 2 * a->chunksize <= a->sliidx[a->totalslices] / a->totalslices) a->chunksize *= 2; 1492 totalchunks = 1 + (a->sliidx[a->totalslices] - 1) / a->chunksize; 1493 } 1494 if (totalchunks != a->totalchunks) { 1495 PetscCall(PetscFree(a->chunk_slice_map)); 1496 PetscCall(PetscMalloc1(totalchunks, &a->chunk_slice_map)); 1497 a->totalchunks = totalchunks; 1498 } 1499 j = 0; 1500 for (i = 0; i < totalchunks; i++) { 1501 while (a->sliidx[j + 1] <= i * a->chunksize && j < a->totalslices) j++; 1502 a->chunk_slice_map[i] = j; 1503 } 1504 #endif 1505 PetscFunctionReturn(PETSC_SUCCESS); 1506 } 1507 1508 PetscErrorCode MatGetInfo_SeqSELL(Mat A, MatInfoType flag, MatInfo *info) 1509 { 1510 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1511 1512 PetscFunctionBegin; 1513 info->block_size = 1.0; 1514 info->nz_allocated = a->maxallocmat; 1515 info->nz_used = a->sliidx[a->totalslices]; /* include padding zeros */ 1516 info->nz_unneeded = (a->maxallocmat - a->sliidx[a->totalslices]); 1517 info->assemblies = A->num_ass; 1518 info->mallocs = A->info.mallocs; 1519 info->memory = 0; /* REVIEW ME */ 1520 if (A->factortype) { 1521 info->fill_ratio_given = A->info.fill_ratio_given; 1522 info->fill_ratio_needed = A->info.fill_ratio_needed; 1523 info->factor_mallocs = A->info.factor_mallocs; 1524 } else { 1525 info->fill_ratio_given = 0; 1526 info->fill_ratio_needed = 0; 1527 info->factor_mallocs = 0; 1528 } 1529 PetscFunctionReturn(PETSC_SUCCESS); 1530 } 1531 1532 PetscErrorCode MatSetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is) 1533 { 1534 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1535 PetscInt shift, i, k, l, low, high, t, ii, row, col, nrow; 1536 PetscInt *cp, nonew = a->nonew, lastcol = -1; 1537 MatScalar *vp, value; 1538 #if defined(PETSC_HAVE_CUDA) 1539 PetscBool inserted = PETSC_FALSE; 1540 PetscInt mul = DEVICE_MEM_ALIGN / a->sliceheight; 1541 #endif 1542 1543 PetscFunctionBegin; 1544 for (k = 0; k < m; k++) { /* loop over added rows */ 1545 row = im[k]; 1546 if (row < 0) continue; 1547 PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1); 1548 shift = a->sliidx[row / a->sliceheight] + row % a->sliceheight; /* starting index of the row */ 1549 cp = a->colidx + shift; /* pointer to the row */ 1550 vp = a->val + shift; /* pointer to the row */ 1551 nrow = a->rlen[row]; 1552 low = 0; 1553 high = nrow; 1554 1555 for (l = 0; l < n; l++) { /* loop over added columns */ 1556 col = in[l]; 1557 if (col < 0) continue; 1558 PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1); 1559 if (a->roworiented) { 1560 value = v[l + k * n]; 1561 } else { 1562 value = v[k + l * m]; 1563 } 1564 if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue; 1565 1566 /* search in this row for the specified column, i indicates the column to be set */ 1567 if (col <= lastcol) low = 0; 1568 else high = nrow; 1569 lastcol = col; 1570 while (high - low > 5) { 1571 t = (low + high) / 2; 1572 if (*(cp + a->sliceheight * t) > col) high = t; 1573 else low = t; 1574 } 1575 for (i = low; i < high; i++) { 1576 if (*(cp + a->sliceheight * i) > col) break; 1577 if (*(cp + a->sliceheight * i) == col) { 1578 if (is == ADD_VALUES) *(vp + a->sliceheight * i) += value; 1579 else *(vp + a->sliceheight * i) = value; 1580 #if defined(PETSC_HAVE_CUDA) 1581 inserted = PETSC_TRUE; 1582 #endif 1583 low = i + 1; 1584 goto noinsert; 1585 } 1586 } 1587 if (value == 0.0 && a->ignorezeroentries) goto noinsert; 1588 if (nonew == 1) goto noinsert; 1589 PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col); 1590 #if defined(PETSC_HAVE_CUDA) 1591 MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar, mul); 1592 #else 1593 /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */ 1594 MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar, 1); 1595 #endif 1596 /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */ 1597 for (ii = nrow - 1; ii >= i; ii--) { 1598 *(cp + a->sliceheight * (ii + 1)) = *(cp + a->sliceheight * ii); 1599 *(vp + a->sliceheight * (ii + 1)) = *(vp + a->sliceheight * ii); 1600 } 1601 a->rlen[row]++; 1602 *(cp + a->sliceheight * i) = col; 1603 *(vp + a->sliceheight * i) = value; 1604 a->nz++; 1605 A->nonzerostate++; 1606 #if defined(PETSC_HAVE_CUDA) 1607 inserted = PETSC_TRUE; 1608 #endif 1609 low = i + 1; 1610 high++; 1611 nrow++; 1612 noinsert:; 1613 } 1614 a->rlen[row] = nrow; 1615 } 1616 #if defined(PETSC_HAVE_CUDA) 1617 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && inserted) A->offloadmask = PETSC_OFFLOAD_CPU; 1618 #endif 1619 PetscFunctionReturn(PETSC_SUCCESS); 1620 } 1621 1622 PetscErrorCode MatCopy_SeqSELL(Mat A, Mat B, MatStructure str) 1623 { 1624 PetscFunctionBegin; 1625 /* If the two matrices have the same copy implementation, use fast copy. */ 1626 if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) { 1627 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1628 Mat_SeqSELL *b = (Mat_SeqSELL *)B->data; 1629 1630 PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices], PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Number of nonzeros in two matrices are different"); 1631 PetscCall(PetscArraycpy(b->val, a->val, a->sliidx[a->totalslices])); 1632 } else { 1633 PetscCall(MatCopy_Basic(A, B, str)); 1634 } 1635 PetscFunctionReturn(PETSC_SUCCESS); 1636 } 1637 1638 PetscErrorCode MatSetUp_SeqSELL(Mat A) 1639 { 1640 PetscFunctionBegin; 1641 PetscCall(MatSeqSELLSetPreallocation(A, PETSC_DEFAULT, NULL)); 1642 PetscFunctionReturn(PETSC_SUCCESS); 1643 } 1644 1645 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A, PetscScalar *array[]) 1646 { 1647 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1648 1649 PetscFunctionBegin; 1650 *array = a->val; 1651 PetscFunctionReturn(PETSC_SUCCESS); 1652 } 1653 1654 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A, PetscScalar *array[]) 1655 { 1656 PetscFunctionBegin; 1657 PetscFunctionReturn(PETSC_SUCCESS); 1658 } 1659 1660 PetscErrorCode MatScale_SeqSELL(Mat inA, PetscScalar alpha) 1661 { 1662 Mat_SeqSELL *a = (Mat_SeqSELL *)inA->data; 1663 MatScalar *aval = a->val; 1664 PetscScalar oalpha = alpha; 1665 PetscBLASInt one = 1, size; 1666 1667 PetscFunctionBegin; 1668 PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices], &size)); 1669 PetscCallBLAS("BLASscal", BLASscal_(&size, &oalpha, aval, &one)); 1670 PetscCall(PetscLogFlops(a->nz)); 1671 PetscCall(MatSeqSELLInvalidateDiagonal(inA)); 1672 #if defined(PETSC_HAVE_CUDA) 1673 if (inA->offloadmask != PETSC_OFFLOAD_UNALLOCATED) inA->offloadmask = PETSC_OFFLOAD_CPU; 1674 #endif 1675 PetscFunctionReturn(PETSC_SUCCESS); 1676 } 1677 1678 PetscErrorCode MatShift_SeqSELL(Mat Y, PetscScalar a) 1679 { 1680 Mat_SeqSELL *y = (Mat_SeqSELL *)Y->data; 1681 1682 PetscFunctionBegin; 1683 if (!Y->preallocated || !y->nz) PetscCall(MatSeqSELLSetPreallocation(Y, 1, NULL)); 1684 PetscCall(MatShift_Basic(Y, a)); 1685 PetscFunctionReturn(PETSC_SUCCESS); 1686 } 1687 1688 PetscErrorCode MatSOR_SeqSELL(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx) 1689 { 1690 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1691 PetscScalar *x, sum, *t; 1692 const MatScalar *idiag = NULL, *mdiag; 1693 const PetscScalar *b, *xb; 1694 PetscInt n, m = A->rmap->n, i, j, shift; 1695 const PetscInt *diag; 1696 1697 PetscFunctionBegin; 1698 its = its * lits; 1699 1700 if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */ 1701 if (!a->idiagvalid) PetscCall(MatInvertDiagonal_SeqSELL(A, omega, fshift)); 1702 a->fshift = fshift; 1703 a->omega = omega; 1704 1705 diag = a->diag; 1706 t = a->ssor_work; 1707 idiag = a->idiag; 1708 mdiag = a->mdiag; 1709 1710 PetscCall(VecGetArray(xx, &x)); 1711 PetscCall(VecGetArrayRead(bb, &b)); 1712 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 1713 PetscCheck(flag != SOR_APPLY_UPPER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_UPPER is not implemented"); 1714 PetscCheck(flag != SOR_APPLY_LOWER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_LOWER is not implemented"); 1715 PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat"); 1716 1717 if (flag & SOR_ZERO_INITIAL_GUESS) { 1718 if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) { 1719 for (i = 0; i < m; i++) { 1720 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1721 sum = b[i]; 1722 n = (diag[i] - shift) / a->sliceheight; 1723 for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]]; 1724 t[i] = sum; 1725 x[i] = sum * idiag[i]; 1726 } 1727 xb = t; 1728 PetscCall(PetscLogFlops(a->nz)); 1729 } else xb = b; 1730 if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1731 for (i = m - 1; i >= 0; i--) { 1732 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1733 sum = xb[i]; 1734 n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1; 1735 for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]]; 1736 if (xb == b) { 1737 x[i] = sum * idiag[i]; 1738 } else { 1739 x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */ 1740 } 1741 } 1742 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */ 1743 } 1744 its--; 1745 } 1746 while (its--) { 1747 if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) { 1748 for (i = 0; i < m; i++) { 1749 /* lower */ 1750 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1751 sum = b[i]; 1752 n = (diag[i] - shift) / a->sliceheight; 1753 for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]]; 1754 t[i] = sum; /* save application of the lower-triangular part */ 1755 /* upper */ 1756 n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1; 1757 for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]]; 1758 x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */ 1759 } 1760 xb = t; 1761 PetscCall(PetscLogFlops(2.0 * a->nz)); 1762 } else xb = b; 1763 if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1764 for (i = m - 1; i >= 0; i--) { 1765 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1766 sum = xb[i]; 1767 if (xb == b) { 1768 /* whole matrix (no checkpointing available) */ 1769 n = a->rlen[i]; 1770 for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]]; 1771 x[i] = (1. - omega) * x[i] + (sum + mdiag[i] * x[i]) * idiag[i]; 1772 } else { /* lower-triangular part has been saved, so only apply upper-triangular */ 1773 n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1; 1774 for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]]; 1775 x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */ 1776 } 1777 } 1778 if (xb == b) { 1779 PetscCall(PetscLogFlops(2.0 * a->nz)); 1780 } else { 1781 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */ 1782 } 1783 } 1784 } 1785 PetscCall(VecRestoreArray(xx, &x)); 1786 PetscCall(VecRestoreArrayRead(bb, &b)); 1787 PetscFunctionReturn(PETSC_SUCCESS); 1788 } 1789 1790 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL, 1791 MatGetRow_SeqSELL, 1792 MatRestoreRow_SeqSELL, 1793 MatMult_SeqSELL, 1794 /* 4*/ MatMultAdd_SeqSELL, 1795 MatMultTranspose_SeqSELL, 1796 MatMultTransposeAdd_SeqSELL, 1797 NULL, 1798 NULL, 1799 NULL, 1800 /* 10*/ NULL, 1801 NULL, 1802 NULL, 1803 MatSOR_SeqSELL, 1804 NULL, 1805 /* 15*/ MatGetInfo_SeqSELL, 1806 MatEqual_SeqSELL, 1807 MatGetDiagonal_SeqSELL, 1808 MatDiagonalScale_SeqSELL, 1809 NULL, 1810 /* 20*/ NULL, 1811 MatAssemblyEnd_SeqSELL, 1812 MatSetOption_SeqSELL, 1813 MatZeroEntries_SeqSELL, 1814 /* 24*/ NULL, 1815 NULL, 1816 NULL, 1817 NULL, 1818 NULL, 1819 /* 29*/ MatSetUp_SeqSELL, 1820 NULL, 1821 NULL, 1822 NULL, 1823 NULL, 1824 /* 34*/ MatDuplicate_SeqSELL, 1825 NULL, 1826 NULL, 1827 NULL, 1828 NULL, 1829 /* 39*/ NULL, 1830 NULL, 1831 NULL, 1832 MatGetValues_SeqSELL, 1833 MatCopy_SeqSELL, 1834 /* 44*/ NULL, 1835 MatScale_SeqSELL, 1836 MatShift_SeqSELL, 1837 NULL, 1838 NULL, 1839 /* 49*/ NULL, 1840 NULL, 1841 NULL, 1842 NULL, 1843 NULL, 1844 /* 54*/ MatFDColoringCreate_SeqXAIJ, 1845 NULL, 1846 NULL, 1847 NULL, 1848 NULL, 1849 /* 59*/ NULL, 1850 MatDestroy_SeqSELL, 1851 MatView_SeqSELL, 1852 NULL, 1853 NULL, 1854 /* 64*/ NULL, 1855 NULL, 1856 NULL, 1857 NULL, 1858 NULL, 1859 /* 69*/ NULL, 1860 NULL, 1861 NULL, 1862 NULL, 1863 NULL, 1864 /* 74*/ NULL, 1865 MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */ 1866 NULL, 1867 NULL, 1868 NULL, 1869 /* 79*/ NULL, 1870 NULL, 1871 NULL, 1872 NULL, 1873 NULL, 1874 /* 84*/ NULL, 1875 NULL, 1876 NULL, 1877 NULL, 1878 NULL, 1879 /* 89*/ NULL, 1880 NULL, 1881 NULL, 1882 NULL, 1883 NULL, 1884 /* 94*/ NULL, 1885 NULL, 1886 NULL, 1887 NULL, 1888 NULL, 1889 /* 99*/ NULL, 1890 NULL, 1891 NULL, 1892 MatConjugate_SeqSELL, 1893 NULL, 1894 /*104*/ NULL, 1895 NULL, 1896 NULL, 1897 NULL, 1898 NULL, 1899 /*109*/ NULL, 1900 NULL, 1901 NULL, 1902 NULL, 1903 MatMissingDiagonal_SeqSELL, 1904 /*114*/ NULL, 1905 NULL, 1906 NULL, 1907 NULL, 1908 NULL, 1909 /*119*/ NULL, 1910 NULL, 1911 NULL, 1912 NULL, 1913 NULL, 1914 /*124*/ NULL, 1915 NULL, 1916 NULL, 1917 NULL, 1918 NULL, 1919 /*129*/ NULL, 1920 NULL, 1921 NULL, 1922 NULL, 1923 NULL, 1924 /*134*/ NULL, 1925 NULL, 1926 NULL, 1927 NULL, 1928 NULL, 1929 /*139*/ NULL, 1930 NULL, 1931 NULL, 1932 MatFDColoringSetUp_SeqXAIJ, 1933 NULL, 1934 /*144*/ NULL, 1935 NULL, 1936 NULL, 1937 NULL, 1938 NULL, 1939 NULL, 1940 /*150*/ NULL, 1941 NULL, 1942 NULL}; 1943 1944 static PetscErrorCode MatStoreValues_SeqSELL(Mat mat) 1945 { 1946 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1947 1948 PetscFunctionBegin; 1949 PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first"); 1950 1951 /* allocate space for values if not already there */ 1952 if (!a->saved_values) PetscCall(PetscMalloc1(a->sliidx[a->totalslices] + 1, &a->saved_values)); 1953 1954 /* copy values over */ 1955 PetscCall(PetscArraycpy(a->saved_values, a->val, a->sliidx[a->totalslices])); 1956 PetscFunctionReturn(PETSC_SUCCESS); 1957 } 1958 1959 static PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat) 1960 { 1961 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1962 1963 PetscFunctionBegin; 1964 PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first"); 1965 PetscCheck(a->saved_values, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatStoreValues(A);first"); 1966 PetscCall(PetscArraycpy(a->val, a->saved_values, a->sliidx[a->totalslices])); 1967 PetscFunctionReturn(PETSC_SUCCESS); 1968 } 1969 1970 static PetscErrorCode MatSeqSELLGetFillRatio_SeqSELL(Mat mat, PetscReal *ratio) 1971 { 1972 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1973 1974 PetscFunctionBegin; 1975 if (a->totalslices && a->sliidx[a->totalslices]) { 1976 *ratio = (PetscReal)(a->sliidx[a->totalslices] - a->nz) / a->sliidx[a->totalslices]; 1977 } else { 1978 *ratio = 0.0; 1979 } 1980 PetscFunctionReturn(PETSC_SUCCESS); 1981 } 1982 1983 static PetscErrorCode MatSeqSELLGetMaxSliceWidth_SeqSELL(Mat mat, PetscInt *slicewidth) 1984 { 1985 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1986 PetscInt i, current_slicewidth; 1987 1988 PetscFunctionBegin; 1989 *slicewidth = 0; 1990 for (i = 0; i < a->totalslices; i++) { 1991 current_slicewidth = (a->sliidx[i + 1] - a->sliidx[i]) / a->sliceheight; 1992 if (current_slicewidth > *slicewidth) *slicewidth = current_slicewidth; 1993 } 1994 PetscFunctionReturn(PETSC_SUCCESS); 1995 } 1996 1997 static PetscErrorCode MatSeqSELLGetAvgSliceWidth_SeqSELL(Mat mat, PetscReal *slicewidth) 1998 { 1999 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 2000 2001 PetscFunctionBegin; 2002 *slicewidth = 0; 2003 if (a->totalslices) { *slicewidth = (PetscReal)a->sliidx[a->totalslices] / a->sliceheight / a->totalslices; } 2004 PetscFunctionReturn(PETSC_SUCCESS); 2005 } 2006 2007 static PetscErrorCode MatSeqSELLGetVarSliceSize_SeqSELL(Mat mat, PetscReal *variance) 2008 { 2009 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 2010 PetscReal mean; 2011 PetscInt i, totalslices = a->totalslices, *sliidx = a->sliidx; 2012 2013 PetscFunctionBegin; 2014 *variance = 0; 2015 if (totalslices) { 2016 mean = (PetscReal)sliidx[totalslices] / totalslices; 2017 for (i = 1; i <= totalslices; i++) { *variance += ((PetscReal)(sliidx[i] - sliidx[i - 1]) - mean) * ((PetscReal)(sliidx[i] - sliidx[i - 1]) - mean) / totalslices; } 2018 } 2019 PetscFunctionReturn(PETSC_SUCCESS); 2020 } 2021 2022 static PetscErrorCode MatSeqSELLSetSliceHeight_SeqSELL(Mat A, PetscInt sliceheight) 2023 { 2024 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 2025 2026 PetscFunctionBegin; 2027 if (A->preallocated) PetscFunctionReturn(PETSC_SUCCESS); 2028 PetscCheck(a->sliceheight <= 0 || a->sliceheight == sliceheight, PETSC_COMM_SELF, PETSC_ERR_SUP, "Cannot change slice height %" PetscInt_FMT " to %" PetscInt_FMT, a->sliceheight, sliceheight); 2029 a->sliceheight = sliceheight; 2030 #if defined(PETSC_HAVE_CUDA) 2031 PetscCheck(DEVICE_MEM_ALIGN % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "DEVICE_MEM_ALIGN is not divisible by the slice height %" PetscInt_FMT, sliceheight); 2032 #endif 2033 PetscFunctionReturn(PETSC_SUCCESS); 2034 } 2035 2036 /*@C 2037 MatSeqSELLGetFillRatio - returns a ratio that indicates the irregularity of the matrix. 2038 2039 Not Collective 2040 2041 Input Parameter: 2042 . A - a MATSEQSELL matrix 2043 2044 Output Parameter: 2045 . ratio - ratio of number of padded zeros to number of allocated elements 2046 2047 Level: intermediate 2048 2049 .seealso: `MATSEQSELL`, `MatSeqSELLGetAvgSliceWidth()` 2050 @*/ 2051 PetscErrorCode MatSeqSELLGetFillRatio(Mat A, PetscReal *ratio) 2052 { 2053 PetscFunctionBegin; 2054 PetscUseMethod(A, "MatSeqSELLGetFillRatio_C", (Mat, PetscReal *), (A, ratio)); 2055 PetscFunctionReturn(PETSC_SUCCESS); 2056 } 2057 2058 /*@C 2059 MatSeqSELLGetMaxSliceWidth - returns the maximum slice width. 2060 2061 Not Collective 2062 2063 Input Parameter: 2064 . A - a MATSEQSELL matrix 2065 2066 Output Parameter: 2067 . slicewidth - maximum slice width 2068 2069 Level: intermediate 2070 2071 .seealso: `MATSEQSELL`, `MatSeqSELLGetAvgSliceWidth()` 2072 @*/ 2073 PetscErrorCode MatSeqSELLGetMaxSliceWidth(Mat A, PetscInt *slicewidth) 2074 { 2075 PetscFunctionBegin; 2076 PetscUseMethod(A, "MatSeqSELLGetMaxSliceWidth_C", (Mat, PetscInt *), (A, slicewidth)); 2077 PetscFunctionReturn(PETSC_SUCCESS); 2078 } 2079 2080 /*@C 2081 MatSeqSELLGetAvgSliceWidth - returns the average slice width. 2082 2083 Not Collective 2084 2085 Input Parameter: 2086 . A - a MATSEQSELL matrix 2087 2088 Output Parameter: 2089 . slicewidth - average slice width 2090 2091 Level: intermediate 2092 2093 .seealso: `MATSEQSELL`, `MatSeqSELLGetMaxSliceWidth()` 2094 @*/ 2095 PetscErrorCode MatSeqSELLGetAvgSliceWidth(Mat A, PetscReal *slicewidth) 2096 { 2097 PetscFunctionBegin; 2098 PetscUseMethod(A, "MatSeqSELLGetAvgSliceWidth_C", (Mat, PetscReal *), (A, slicewidth)); 2099 PetscFunctionReturn(PETSC_SUCCESS); 2100 } 2101 2102 /*@C 2103 MatSeqSELLSetSliceHeight - sets the slice height. 2104 2105 Not Collective 2106 2107 Input Parameters: 2108 + A - a MATSEQSELL matrix 2109 - sliceheight - slice height 2110 2111 Notes: 2112 You cannot change the slice height once it have been set. 2113 2114 The slice height must be set before MatSetUp() or MatXXXSetPreallocation() is called. 2115 2116 Level: intermediate 2117 2118 .seealso: `MATSEQSELL`, `MatSeqSELLGetVarSliceSize()` 2119 @*/ 2120 PetscErrorCode MatSeqSELLSetSliceHeight(Mat A, PetscInt sliceheight) 2121 { 2122 PetscFunctionBegin; 2123 PetscUseMethod(A, "MatSeqSELLSetSliceHeight_C", (Mat, PetscInt), (A, sliceheight)); 2124 PetscFunctionReturn(PETSC_SUCCESS); 2125 } 2126 2127 /*@C 2128 MatSeqSELLGetVarSliceSize - returns the variance of the slice size. 2129 2130 Not Collective 2131 2132 Input Parameter: 2133 . A - a MATSEQSELL matrix 2134 2135 Output Parameter: 2136 . variance - variance of the slice size 2137 2138 Level: intermediate 2139 2140 .seealso: `MATSEQSELL`, `MatSeqSELLSetSliceHeight()` 2141 @*/ 2142 PetscErrorCode MatSeqSELLGetVarSliceSize(Mat A, PetscReal *variance) 2143 { 2144 PetscFunctionBegin; 2145 PetscUseMethod(A, "MatSeqSELLGetVarSliceSize_C", (Mat, PetscReal *), (A, variance)); 2146 PetscFunctionReturn(PETSC_SUCCESS); 2147 } 2148 2149 #if defined(PETSC_HAVE_CUDA) 2150 PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat); 2151 #endif 2152 2153 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B) 2154 { 2155 Mat_SeqSELL *b; 2156 PetscMPIInt size; 2157 2158 PetscFunctionBegin; 2159 PetscCall(PetscCitationsRegister(citation, &cited)); 2160 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size)); 2161 PetscCheck(size <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Comm must be of size 1"); 2162 2163 PetscCall(PetscNew(&b)); 2164 2165 B->data = (void *)b; 2166 B->ops[0] = MatOps_Values; 2167 2168 b->row = NULL; 2169 b->col = NULL; 2170 b->icol = NULL; 2171 b->reallocs = 0; 2172 b->ignorezeroentries = PETSC_FALSE; 2173 b->roworiented = PETSC_TRUE; 2174 b->nonew = 0; 2175 b->diag = NULL; 2176 b->solve_work = NULL; 2177 B->spptr = NULL; 2178 b->saved_values = NULL; 2179 b->idiag = NULL; 2180 b->mdiag = NULL; 2181 b->ssor_work = NULL; 2182 b->omega = 1.0; 2183 b->fshift = 0.0; 2184 b->idiagvalid = PETSC_FALSE; 2185 b->keepnonzeropattern = PETSC_FALSE; 2186 b->sliceheight = 0; 2187 2188 PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELL)); 2189 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetArray_C", MatSeqSELLGetArray_SeqSELL)); 2190 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLRestoreArray_C", MatSeqSELLRestoreArray_SeqSELL)); 2191 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqSELL)); 2192 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqSELL)); 2193 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetPreallocation_C", MatSeqSELLSetPreallocation_SeqSELL)); 2194 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqaij_C", MatConvert_SeqSELL_SeqAIJ)); 2195 #if defined(PETSC_HAVE_CUDA) 2196 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqsellcuda_C", MatConvert_SeqSELL_SeqSELLCUDA)); 2197 #endif 2198 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetFillRatio_C", MatSeqSELLGetFillRatio_SeqSELL)); 2199 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetMaxSliceWidth_C", MatSeqSELLGetMaxSliceWidth_SeqSELL)); 2200 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetAvgSliceWidth_C", MatSeqSELLGetAvgSliceWidth_SeqSELL)); 2201 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetVarSliceSize_C", MatSeqSELLGetVarSliceSize_SeqSELL)); 2202 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetSliceHeight_C", MatSeqSELLSetSliceHeight_SeqSELL)); 2203 2204 PetscObjectOptionsBegin((PetscObject)B); 2205 { 2206 PetscInt newsh = -1; 2207 PetscBool flg; 2208 #if defined(PETSC_HAVE_CUDA) 2209 PetscInt chunksize = 0; 2210 #endif 2211 2212 PetscCall(PetscOptionsInt("-mat_sell_slice_height", "Set the slice height used to store SELL matrix", "MatSELLSetSliceHeight", newsh, &newsh, &flg)); 2213 if (flg) { PetscCall(MatSeqSELLSetSliceHeight(B, newsh)); } 2214 #if defined(PETSC_HAVE_CUDA) 2215 PetscCall(PetscOptionsInt("-mat_sell_chunk_size", "Set the chunksize for load-balanced CUDA kernels. Choices include 64,128,256,512,1024", NULL, chunksize, &chunksize, &flg)); 2216 if (flg) { 2217 PetscCheck(chunksize >= 64 && chunksize <= 1024 && chunksize % 64 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "chunksize must be a number in {64,128,256,512,1024}: value %" PetscInt_FMT, chunksize); 2218 b->chunksize = chunksize; 2219 } 2220 #endif 2221 } 2222 PetscOptionsEnd(); 2223 PetscFunctionReturn(PETSC_SUCCESS); 2224 } 2225 2226 /* 2227 Given a matrix generated with MatGetFactor() duplicates all the information in A into B 2228 */ 2229 static PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace) 2230 { 2231 Mat_SeqSELL *c = (Mat_SeqSELL *)C->data, *a = (Mat_SeqSELL *)A->data; 2232 PetscInt i, m = A->rmap->n; 2233 PetscInt totalslices = a->totalslices; 2234 2235 PetscFunctionBegin; 2236 C->factortype = A->factortype; 2237 c->row = NULL; 2238 c->col = NULL; 2239 c->icol = NULL; 2240 c->reallocs = 0; 2241 C->assembled = PETSC_TRUE; 2242 2243 PetscCall(PetscLayoutReference(A->rmap, &C->rmap)); 2244 PetscCall(PetscLayoutReference(A->cmap, &C->cmap)); 2245 2246 PetscCall(PetscMalloc1(a->sliceheight * totalslices, &c->rlen)); 2247 PetscCall(PetscMalloc1(totalslices + 1, &c->sliidx)); 2248 2249 for (i = 0; i < m; i++) c->rlen[i] = a->rlen[i]; 2250 for (i = 0; i < totalslices + 1; i++) c->sliidx[i] = a->sliidx[i]; 2251 2252 /* allocate the matrix space */ 2253 if (mallocmatspace) { 2254 PetscCall(PetscMalloc2(a->maxallocmat, &c->val, a->maxallocmat, &c->colidx)); 2255 2256 c->singlemalloc = PETSC_TRUE; 2257 2258 if (m > 0) { 2259 PetscCall(PetscArraycpy(c->colidx, a->colidx, a->maxallocmat)); 2260 if (cpvalues == MAT_COPY_VALUES) { 2261 PetscCall(PetscArraycpy(c->val, a->val, a->maxallocmat)); 2262 } else { 2263 PetscCall(PetscArrayzero(c->val, a->maxallocmat)); 2264 } 2265 } 2266 } 2267 2268 c->ignorezeroentries = a->ignorezeroentries; 2269 c->roworiented = a->roworiented; 2270 c->nonew = a->nonew; 2271 if (a->diag) { 2272 PetscCall(PetscMalloc1(m, &c->diag)); 2273 for (i = 0; i < m; i++) c->diag[i] = a->diag[i]; 2274 } else c->diag = NULL; 2275 2276 c->solve_work = NULL; 2277 c->saved_values = NULL; 2278 c->idiag = NULL; 2279 c->ssor_work = NULL; 2280 c->keepnonzeropattern = a->keepnonzeropattern; 2281 c->free_val = PETSC_TRUE; 2282 c->free_colidx = PETSC_TRUE; 2283 2284 c->maxallocmat = a->maxallocmat; 2285 c->maxallocrow = a->maxallocrow; 2286 c->rlenmax = a->rlenmax; 2287 c->nz = a->nz; 2288 C->preallocated = PETSC_TRUE; 2289 2290 c->nonzerorowcnt = a->nonzerorowcnt; 2291 C->nonzerostate = A->nonzerostate; 2292 2293 PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist)); 2294 PetscFunctionReturn(PETSC_SUCCESS); 2295 } 2296 2297 PetscErrorCode MatDuplicate_SeqSELL(Mat A, MatDuplicateOption cpvalues, Mat *B) 2298 { 2299 PetscFunctionBegin; 2300 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 2301 PetscCall(MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n)); 2302 if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 2303 PetscCall(MatSetType(*B, ((PetscObject)A)->type_name)); 2304 PetscCall(MatDuplicateNoCreate_SeqSELL(*B, A, cpvalues, PETSC_TRUE)); 2305 PetscFunctionReturn(PETSC_SUCCESS); 2306 } 2307 2308 /*MC 2309 MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices, 2310 based on the sliced Ellpack format, {cite}`zhangellpack2018` 2311 2312 Options Database Key: 2313 . -mat_type seqsell - sets the matrix type to "`MATSEQELL` during a call to `MatSetFromOptions()` 2314 2315 Level: beginner 2316 2317 .seealso: `Mat`, `MatCreateSeqSell()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ` 2318 M*/ 2319 2320 /*MC 2321 MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices, {cite}`zhangellpack2018` 2322 2323 This matrix type is identical to `MATSEQSELL` when constructed with a single process communicator, 2324 and `MATMPISELL` otherwise. As a result, for single process communicators, 2325 `MatSeqSELLSetPreallocation()` is supported, and similarly `MatMPISELLSetPreallocation()` is supported 2326 for communicators controlling multiple processes. It is recommended that you call both of 2327 the above preallocation routines for simplicity. 2328 2329 Options Database Key: 2330 . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions() 2331 2332 Level: beginner 2333 2334 Notes: 2335 This format is only supported for real scalars, double precision, and 32-bit indices (the defaults). 2336 2337 It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of 2338 non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement. 2339 2340 Developer Notes: 2341 On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance. 2342 2343 The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8 2344 .vb 2345 (2 0 3 4) 2346 Consider the matrix A = (5 0 6 0) 2347 (0 0 7 8) 2348 (0 0 9 9) 2349 2350 symbolically the Ellpack format can be written as 2351 2352 (2 3 4 |) (0 2 3 |) 2353 v = (5 6 0 |) colidx = (0 2 2 |) 2354 -------- --------- 2355 (7 8 |) (2 3 |) 2356 (9 9 |) (2 3 |) 2357 2358 The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case). 2359 Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and 2360 zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice. 2361 2362 The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9) and for colidx is (0 0 2 2 3 2 2 2 3 3) 2363 2364 .ve 2365 2366 See `MatMult_SeqSELL()` for how this format is used with the SIMD operations to achieve high performance. 2367 2368 .seealso: `Mat`, `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSell()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ` 2369 M*/ 2370 2371 /*@C 2372 MatCreateSeqSELL - Creates a sparse matrix in `MATSEQSELL` format. 2373 2374 Collective 2375 2376 Input Parameters: 2377 + comm - MPI communicator, set to `PETSC_COMM_SELF` 2378 . m - number of rows 2379 . n - number of columns 2380 . rlenmax - maximum number of nonzeros in a row, ignored if `rlen` is provided 2381 - rlen - array containing the number of nonzeros in the various rows (possibly different for each row) or NULL 2382 2383 Output Parameter: 2384 . A - the matrix 2385 2386 Level: intermediate 2387 2388 Notes: 2389 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 2390 MatXXXXSetPreallocation() paradigm instead of this routine directly. 2391 [MatXXXXSetPreallocation() is, for example, `MatSeqSELLSetPreallocation()`] 2392 2393 Specify the preallocated storage with either `rlenmax` or `rlen` (not both). 2394 Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory 2395 allocation. 2396 2397 .seealso: `Mat`, `MATSEQSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATMPISELL` 2398 @*/ 2399 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt rlenmax, const PetscInt rlen[], Mat *A) 2400 { 2401 PetscFunctionBegin; 2402 PetscCall(MatCreate(comm, A)); 2403 PetscCall(MatSetSizes(*A, m, n, m, n)); 2404 PetscCall(MatSetType(*A, MATSEQSELL)); 2405 PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A, rlenmax, rlen)); 2406 PetscFunctionReturn(PETSC_SUCCESS); 2407 } 2408 2409 PetscErrorCode MatEqual_SeqSELL(Mat A, Mat B, PetscBool *flg) 2410 { 2411 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data, *b = (Mat_SeqSELL *)B->data; 2412 PetscInt totalslices = a->totalslices; 2413 2414 PetscFunctionBegin; 2415 /* If the matrix dimensions are not equal,or no of nonzeros */ 2416 if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) || (a->nz != b->nz) || (a->rlenmax != b->rlenmax)) { 2417 *flg = PETSC_FALSE; 2418 PetscFunctionReturn(PETSC_SUCCESS); 2419 } 2420 /* if the a->colidx are the same */ 2421 PetscCall(PetscArraycmp(a->colidx, b->colidx, a->sliidx[totalslices], flg)); 2422 if (!*flg) PetscFunctionReturn(PETSC_SUCCESS); 2423 /* if a->val are the same */ 2424 PetscCall(PetscArraycmp(a->val, b->val, a->sliidx[totalslices], flg)); 2425 PetscFunctionReturn(PETSC_SUCCESS); 2426 } 2427 2428 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A) 2429 { 2430 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 2431 2432 PetscFunctionBegin; 2433 a->idiagvalid = PETSC_FALSE; 2434 PetscFunctionReturn(PETSC_SUCCESS); 2435 } 2436 2437 PetscErrorCode MatConjugate_SeqSELL(Mat A) 2438 { 2439 #if defined(PETSC_USE_COMPLEX) 2440 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 2441 PetscInt i; 2442 PetscScalar *val = a->val; 2443 2444 PetscFunctionBegin; 2445 for (i = 0; i < a->sliidx[a->totalslices]; i++) { val[i] = PetscConj(val[i]); } 2446 #if defined(PETSC_HAVE_CUDA) 2447 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU; 2448 #endif 2449 #else 2450 PetscFunctionBegin; 2451 #endif 2452 PetscFunctionReturn(PETSC_SUCCESS); 2453 } 2454