1 /* 2 Defines the basic matrix operations for the SELL matrix storage format. 3 */ 4 #include <../src/mat/impls/sell/seq/sell.h> /*I "petscmat.h" I*/ 5 #include <petscblaslapack.h> 6 #include <petsc/private/kernels/blocktranspose.h> 7 8 static PetscBool cited = PETSC_FALSE; 9 static const char citation[] = "@inproceedings{ZhangELLPACK2018,\n" 10 " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n" 11 " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n" 12 " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n" 13 " year = 2018\n" 14 "}\n"; 15 16 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 17 18 #include <immintrin.h> 19 20 #if !defined(_MM_SCALE_8) 21 #define _MM_SCALE_8 8 22 #endif 23 24 #if defined(__AVX512F__) 25 /* these do not work 26 vec_idx = _mm512_loadunpackhi_epi32(vec_idx,acolidx); 27 vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval); 28 */ 29 #define AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \ 30 /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \ 31 vec_idx = _mm256_loadu_si256((__m256i const *)acolidx); \ 32 vec_vals = _mm512_loadu_pd(aval); \ 33 vec_x = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); \ 34 vec_y = _mm512_fmadd_pd(vec_x, vec_vals, vec_y) 35 #elif defined(__AVX2__) && defined(__FMA__) 36 #define AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \ 37 vec_vals = _mm256_loadu_pd(aval); \ 38 vec_idx = _mm_loadu_si128((__m128i const *)acolidx); /* SSE2 */ \ 39 vec_x = _mm256_i32gather_pd(x, vec_idx, _MM_SCALE_8); \ 40 vec_y = _mm256_fmadd_pd(vec_x, vec_vals, vec_y) 41 #endif 42 #endif /* PETSC_HAVE_IMMINTRIN_H */ 43 44 /*@C 45 MatSeqSELLSetPreallocation - For good matrix assembly performance 46 the user should preallocate the matrix storage by setting the parameter `nz` 47 (or the array `nnz`). 48 49 Collective 50 51 Input Parameters: 52 + B - The `MATSEQSELL` matrix 53 . rlenmax - number of nonzeros per row (same for all rows), ignored if `rlen` is provided 54 - rlen - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 55 56 Level: intermediate 57 58 Notes: 59 Specify the preallocated storage with either `rlenmax` or `rlen` (not both). 60 Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory 61 allocation. 62 63 You can call `MatGetInfo()` to get information on how effective the preallocation was; 64 for example the fields mallocs,nz_allocated,nz_used,nz_unneeded; 65 You can also run with the option `-info` and look for messages with the string 66 malloc in them to see if additional memory allocation was needed. 67 68 Developer Notes: 69 Use `rlenmax` of `MAT_SKIP_ALLOCATION` to not allocate any space for the matrix 70 entries or columns indices. 71 72 The maximum number of nonzeos in any row should be as accurate as possible. 73 If it is underestimated, you will get bad performance due to reallocation 74 (`MatSeqXSELLReallocateSELL()`). 75 76 .seealso: `Mat`, `MATSEQSELL`, `MATSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()` 77 @*/ 78 PetscErrorCode MatSeqSELLSetPreallocation(Mat B, PetscInt rlenmax, const PetscInt rlen[]) 79 { 80 PetscFunctionBegin; 81 PetscValidHeaderSpecific(B, MAT_CLASSID, 1); 82 PetscValidType(B, 1); 83 PetscTryMethod(B, "MatSeqSELLSetPreallocation_C", (Mat, PetscInt, const PetscInt[]), (B, rlenmax, rlen)); 84 PetscFunctionReturn(PETSC_SUCCESS); 85 } 86 87 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B, PetscInt maxallocrow, const PetscInt rlen[]) 88 { 89 Mat_SeqSELL *b; 90 PetscInt i, j, totalslices; 91 #if defined(PETSC_HAVE_CUDA) 92 PetscInt rlenmax = 0; 93 #endif 94 PetscBool skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE; 95 96 PetscFunctionBegin; 97 if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE; 98 if (maxallocrow == MAT_SKIP_ALLOCATION) { 99 skipallocation = PETSC_TRUE; 100 maxallocrow = 0; 101 } 102 103 PetscCall(PetscLayoutSetUp(B->rmap)); 104 PetscCall(PetscLayoutSetUp(B->cmap)); 105 106 /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */ 107 if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5; 108 PetscCheck(maxallocrow >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "maxallocrow cannot be less than 0: value %" PetscInt_FMT, maxallocrow); 109 if (rlen) { 110 for (i = 0; i < B->rmap->n; i++) { 111 PetscCheck(rlen[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, rlen[i]); 112 PetscCheck(rlen[i] <= B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT, i, rlen[i], B->cmap->n); 113 } 114 } 115 116 B->preallocated = PETSC_TRUE; 117 118 b = (Mat_SeqSELL *)B->data; 119 120 if (!b->sliceheight) { /* not set yet */ 121 #if defined(PETSC_HAVE_CUDA) 122 b->sliceheight = 16; 123 #else 124 b->sliceheight = 8; 125 #endif 126 } 127 totalslices = PetscCeilInt(B->rmap->n, b->sliceheight); 128 b->totalslices = totalslices; 129 if (!skipallocation) { 130 if (B->rmap->n % b->sliceheight) PetscCall(PetscInfo(B, "Padding rows to the SEQSELL matrix because the number of rows is not the multiple of the slice height (value %" PetscInt_FMT ")\n", B->rmap->n)); 131 132 if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */ 133 PetscCall(PetscMalloc1(totalslices + 1, &b->sliidx)); 134 } 135 if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */ 136 if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10; 137 else if (maxallocrow < 0) maxallocrow = 1; 138 #if defined(PETSC_HAVE_CUDA) 139 rlenmax = maxallocrow; 140 /* Pad the slice to DEVICE_MEM_ALIGN */ 141 while (b->sliceheight * maxallocrow % DEVICE_MEM_ALIGN) maxallocrow++; 142 #endif 143 for (i = 0; i <= totalslices; i++) b->sliidx[i] = b->sliceheight * i * maxallocrow; 144 } else { 145 #if defined(PETSC_HAVE_CUDA) 146 PetscInt mul = DEVICE_MEM_ALIGN / b->sliceheight; 147 #endif 148 maxallocrow = 0; 149 b->sliidx[0] = 0; 150 for (i = 1; i < totalslices; i++) { 151 b->sliidx[i] = 0; 152 for (j = 0; j < b->sliceheight; j++) { b->sliidx[i] = PetscMax(b->sliidx[i], rlen[b->sliceheight * (i - 1) + j]); } 153 #if defined(PETSC_HAVE_CUDA) 154 rlenmax = PetscMax(b->sliidx[i], rlenmax); 155 /* Pad the slice to DEVICE_MEM_ALIGN */ 156 b->sliidx[i] = ((b->sliidx[i] - 1) / mul + 1) * mul; 157 #endif 158 maxallocrow = PetscMax(b->sliidx[i], maxallocrow); 159 PetscCall(PetscIntSumError(b->sliidx[i - 1], b->sliceheight * b->sliidx[i], &b->sliidx[i])); 160 } 161 /* last slice */ 162 b->sliidx[totalslices] = 0; 163 for (j = b->sliceheight * (totalslices - 1); j < B->rmap->n; j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices], rlen[j]); 164 #if defined(PETSC_HAVE_CUDA) 165 rlenmax = PetscMax(b->sliidx[i], rlenmax); 166 b->sliidx[totalslices] = ((b->sliidx[totalslices] - 1) / mul + 1) * mul; 167 #endif 168 maxallocrow = PetscMax(b->sliidx[totalslices], maxallocrow); 169 b->sliidx[totalslices] = b->sliidx[totalslices - 1] + b->sliceheight * b->sliidx[totalslices]; 170 } 171 172 /* allocate space for val, colidx, rlen */ 173 /* FIXME: should B's old memory be unlogged? */ 174 PetscCall(MatSeqXSELLFreeSELL(B, &b->val, &b->colidx)); 175 /* FIXME: assuming an element of the bit array takes 8 bits */ 176 PetscCall(PetscMalloc2(b->sliidx[totalslices], &b->val, b->sliidx[totalslices], &b->colidx)); 177 /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */ 178 PetscCall(PetscCalloc1(b->sliceheight * totalslices, &b->rlen)); 179 180 b->singlemalloc = PETSC_TRUE; 181 b->free_val = PETSC_TRUE; 182 b->free_colidx = PETSC_TRUE; 183 } else { 184 b->free_val = PETSC_FALSE; 185 b->free_colidx = PETSC_FALSE; 186 } 187 188 b->nz = 0; 189 b->maxallocrow = maxallocrow; 190 #if defined(PETSC_HAVE_CUDA) 191 b->rlenmax = rlenmax; 192 #else 193 b->rlenmax = maxallocrow; 194 #endif 195 b->maxallocmat = b->sliidx[totalslices]; 196 B->info.nz_unneeded = (double)b->maxallocmat; 197 if (realalloc) PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE)); 198 PetscFunctionReturn(PETSC_SUCCESS); 199 } 200 201 static PetscErrorCode MatGetRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v) 202 { 203 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 204 PetscInt shift; 205 206 PetscFunctionBegin; 207 PetscCheck(row >= 0 && row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range", row); 208 if (nz) *nz = a->rlen[row]; 209 shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight); 210 if (!a->getrowcols) { PetscCall(PetscMalloc2(a->rlenmax, &a->getrowcols, a->rlenmax, &a->getrowvals)); } 211 if (idx) { 212 PetscInt j; 213 for (j = 0; j < a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift + a->sliceheight * j]; 214 *idx = a->getrowcols; 215 } 216 if (v) { 217 PetscInt j; 218 for (j = 0; j < a->rlen[row]; j++) a->getrowvals[j] = a->val[shift + a->sliceheight * j]; 219 *v = a->getrowvals; 220 } 221 PetscFunctionReturn(PETSC_SUCCESS); 222 } 223 224 static PetscErrorCode MatRestoreRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v) 225 { 226 PetscFunctionBegin; 227 PetscFunctionReturn(PETSC_SUCCESS); 228 } 229 230 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat) 231 { 232 Mat B; 233 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 234 PetscInt i; 235 236 PetscFunctionBegin; 237 if (reuse == MAT_REUSE_MATRIX) { 238 B = *newmat; 239 PetscCall(MatZeroEntries(B)); 240 } else { 241 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B)); 242 PetscCall(MatSetSizes(B, A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N)); 243 PetscCall(MatSetType(B, MATSEQAIJ)); 244 PetscCall(MatSeqAIJSetPreallocation(B, 0, a->rlen)); 245 } 246 247 for (i = 0; i < A->rmap->n; i++) { 248 PetscInt nz = 0, *cols = NULL; 249 PetscScalar *vals = NULL; 250 251 PetscCall(MatGetRow_SeqSELL(A, i, &nz, &cols, &vals)); 252 PetscCall(MatSetValues(B, 1, &i, nz, cols, vals, INSERT_VALUES)); 253 PetscCall(MatRestoreRow_SeqSELL(A, i, &nz, &cols, &vals)); 254 } 255 256 PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY)); 257 PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY)); 258 B->rmap->bs = A->rmap->bs; 259 260 if (reuse == MAT_INPLACE_MATRIX) { 261 PetscCall(MatHeaderReplace(A, &B)); 262 } else { 263 *newmat = B; 264 } 265 PetscFunctionReturn(PETSC_SUCCESS); 266 } 267 268 #include <../src/mat/impls/aij/seq/aij.h> 269 270 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A, MatType newtype, MatReuse reuse, Mat *newmat) 271 { 272 Mat B; 273 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 274 PetscInt *ai = a->i, m = A->rmap->N, n = A->cmap->N, i, *rowlengths, row, ncols; 275 const PetscInt *cols; 276 const PetscScalar *vals; 277 278 PetscFunctionBegin; 279 280 if (reuse == MAT_REUSE_MATRIX) { 281 B = *newmat; 282 } else { 283 if (PetscDefined(USE_DEBUG) || !a->ilen) { 284 PetscCall(PetscMalloc1(m, &rowlengths)); 285 for (i = 0; i < m; i++) rowlengths[i] = ai[i + 1] - ai[i]; 286 } 287 if (PetscDefined(USE_DEBUG) && a->ilen) { 288 PetscBool eq; 289 PetscCall(PetscMemcmp(rowlengths, a->ilen, m * sizeof(PetscInt), &eq)); 290 PetscCheck(eq, PETSC_COMM_SELF, PETSC_ERR_PLIB, "SeqAIJ ilen array incorrect"); 291 PetscCall(PetscFree(rowlengths)); 292 rowlengths = a->ilen; 293 } else if (a->ilen) rowlengths = a->ilen; 294 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B)); 295 PetscCall(MatSetSizes(B, m, n, m, n)); 296 PetscCall(MatSetType(B, MATSEQSELL)); 297 PetscCall(MatSeqSELLSetPreallocation(B, 0, rowlengths)); 298 if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths)); 299 } 300 301 for (row = 0; row < m; row++) { 302 PetscCall(MatGetRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals)); 303 PetscCall(MatSetValues_SeqSELL(B, 1, &row, ncols, cols, vals, INSERT_VALUES)); 304 PetscCall(MatRestoreRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals)); 305 } 306 PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY)); 307 PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY)); 308 B->rmap->bs = A->rmap->bs; 309 310 if (reuse == MAT_INPLACE_MATRIX) { 311 PetscCall(MatHeaderReplace(A, &B)); 312 } else { 313 *newmat = B; 314 } 315 PetscFunctionReturn(PETSC_SUCCESS); 316 } 317 318 PetscErrorCode MatMult_SeqSELL(Mat A, Vec xx, Vec yy) 319 { 320 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 321 PetscScalar *y; 322 const PetscScalar *x; 323 const MatScalar *aval = a->val; 324 PetscInt totalslices = a->totalslices; 325 const PetscInt *acolidx = a->colidx; 326 PetscInt i, j; 327 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 328 __m512d vec_x, vec_y, vec_vals; 329 __m256i vec_idx; 330 __mmask8 mask; 331 __m512d vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4; 332 __m256i vec_idx2, vec_idx3, vec_idx4; 333 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 334 __m128i vec_idx; 335 __m256d vec_x, vec_y, vec_y2, vec_vals; 336 MatScalar yval; 337 PetscInt r, rows_left, row, nnz_in_row; 338 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 339 __m128d vec_x_tmp; 340 __m256d vec_x, vec_y, vec_y2, vec_vals; 341 MatScalar yval; 342 PetscInt r, rows_left, row, nnz_in_row; 343 #else 344 PetscInt k, sliceheight = a->sliceheight; 345 PetscScalar *sum; 346 #endif 347 348 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 349 #pragma disjoint(*x, *y, *aval) 350 #endif 351 352 PetscFunctionBegin; 353 PetscCall(VecGetArrayRead(xx, &x)); 354 PetscCall(VecGetArray(yy, &y)); 355 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 356 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 357 for (i = 0; i < totalslices; i++) { /* loop over slices */ 358 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 359 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 360 361 vec_y = _mm512_setzero_pd(); 362 vec_y2 = _mm512_setzero_pd(); 363 vec_y3 = _mm512_setzero_pd(); 364 vec_y4 = _mm512_setzero_pd(); 365 366 j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */ 367 switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) { 368 case 3: 369 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 370 acolidx += 8; 371 aval += 8; 372 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 373 acolidx += 8; 374 aval += 8; 375 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 376 acolidx += 8; 377 aval += 8; 378 j += 3; 379 break; 380 case 2: 381 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 382 acolidx += 8; 383 aval += 8; 384 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 385 acolidx += 8; 386 aval += 8; 387 j += 2; 388 break; 389 case 1: 390 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 391 acolidx += 8; 392 aval += 8; 393 j += 1; 394 break; 395 } 396 #pragma novector 397 for (; j < (a->sliidx[i + 1] >> 3); j += 4) { 398 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 399 acolidx += 8; 400 aval += 8; 401 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 402 acolidx += 8; 403 aval += 8; 404 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 405 acolidx += 8; 406 aval += 8; 407 AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4); 408 acolidx += 8; 409 aval += 8; 410 } 411 412 vec_y = _mm512_add_pd(vec_y, vec_y2); 413 vec_y = _mm512_add_pd(vec_y, vec_y3); 414 vec_y = _mm512_add_pd(vec_y, vec_y4); 415 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 416 mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07))); 417 _mm512_mask_storeu_pd(&y[8 * i], mask, vec_y); 418 } else { 419 _mm512_storeu_pd(&y[8 * i], vec_y); 420 } 421 } 422 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 423 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 424 for (i = 0; i < totalslices; i++) { /* loop over full slices */ 425 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 426 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 427 428 /* last slice may have padding rows. Don't use vectorization. */ 429 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 430 rows_left = A->rmap->n - 8 * i; 431 for (r = 0; r < rows_left; ++r) { 432 yval = (MatScalar)0; 433 row = 8 * i + r; 434 nnz_in_row = a->rlen[row]; 435 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]]; 436 y[row] = yval; 437 } 438 break; 439 } 440 441 vec_y = _mm256_setzero_pd(); 442 vec_y2 = _mm256_setzero_pd(); 443 444 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 445 #pragma novector 446 #pragma unroll(2) 447 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 448 AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 449 aval += 4; 450 acolidx += 4; 451 AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y2); 452 aval += 4; 453 acolidx += 4; 454 } 455 456 _mm256_storeu_pd(y + i * 8, vec_y); 457 _mm256_storeu_pd(y + i * 8 + 4, vec_y2); 458 } 459 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 460 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 461 for (i = 0; i < totalslices; i++) { /* loop over full slices */ 462 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 463 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 464 465 vec_y = _mm256_setzero_pd(); 466 vec_y2 = _mm256_setzero_pd(); 467 468 /* last slice may have padding rows. Don't use vectorization. */ 469 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 470 rows_left = A->rmap->n - 8 * i; 471 for (r = 0; r < rows_left; ++r) { 472 yval = (MatScalar)0; 473 row = 8 * i + r; 474 nnz_in_row = a->rlen[row]; 475 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]]; 476 y[row] = yval; 477 } 478 break; 479 } 480 481 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 482 #pragma novector 483 #pragma unroll(2) 484 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 485 vec_vals = _mm256_loadu_pd(aval); 486 vec_x_tmp = _mm_setzero_pd(); 487 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 488 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 489 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 490 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 491 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 492 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 493 vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y); 494 aval += 4; 495 496 vec_vals = _mm256_loadu_pd(aval); 497 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 498 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 499 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 500 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 501 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 502 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 503 vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2); 504 aval += 4; 505 } 506 507 _mm256_storeu_pd(y + i * 8, vec_y); 508 _mm256_storeu_pd(y + i * 8 + 4, vec_y2); 509 } 510 #else 511 PetscCall(PetscMalloc1(sliceheight, &sum)); 512 for (i = 0; i < totalslices; i++) { /* loop over slices */ 513 for (j = 0; j < sliceheight; j++) { 514 sum[j] = 0.0; 515 for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]]; 516 } 517 if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { /* if last slice has padding rows */ 518 for (j = 0; j < (A->rmap->n % sliceheight); j++) y[sliceheight * i + j] = sum[j]; 519 } else { 520 for (j = 0; j < sliceheight; j++) y[sliceheight * i + j] = sum[j]; 521 } 522 } 523 PetscCall(PetscFree(sum)); 524 #endif 525 526 PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); /* theoretical minimal FLOPs */ 527 PetscCall(VecRestoreArrayRead(xx, &x)); 528 PetscCall(VecRestoreArray(yy, &y)); 529 PetscFunctionReturn(PETSC_SUCCESS); 530 } 531 532 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h> 533 PetscErrorCode MatMultAdd_SeqSELL(Mat A, Vec xx, Vec yy, Vec zz) 534 { 535 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 536 PetscScalar *y, *z; 537 const PetscScalar *x; 538 const MatScalar *aval = a->val; 539 PetscInt totalslices = a->totalslices; 540 const PetscInt *acolidx = a->colidx; 541 PetscInt i, j; 542 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 543 __m512d vec_x, vec_y, vec_vals; 544 __m256i vec_idx; 545 __mmask8 mask = 0; 546 __m512d vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4; 547 __m256i vec_idx2, vec_idx3, vec_idx4; 548 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 549 __m128d vec_x_tmp; 550 __m256d vec_x, vec_y, vec_y2, vec_vals; 551 MatScalar yval; 552 PetscInt r, row, nnz_in_row; 553 #else 554 PetscInt k, sliceheight = a->sliceheight; 555 PetscScalar *sum; 556 #endif 557 558 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 559 #pragma disjoint(*x, *y, *aval) 560 #endif 561 562 PetscFunctionBegin; 563 if (!a->nz) { 564 PetscCall(VecCopy(yy, zz)); 565 PetscFunctionReturn(PETSC_SUCCESS); 566 } 567 PetscCall(VecGetArrayRead(xx, &x)); 568 PetscCall(VecGetArrayPair(yy, zz, &y, &z)); 569 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 570 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 571 for (i = 0; i < totalslices; i++) { /* loop over slices */ 572 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 573 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 574 575 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 576 mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07))); 577 vec_y = _mm512_mask_loadu_pd(vec_y, mask, &y[8 * i]); 578 } else { 579 vec_y = _mm512_loadu_pd(&y[8 * i]); 580 } 581 vec_y2 = _mm512_setzero_pd(); 582 vec_y3 = _mm512_setzero_pd(); 583 vec_y4 = _mm512_setzero_pd(); 584 585 j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */ 586 switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) { 587 case 3: 588 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 589 acolidx += 8; 590 aval += 8; 591 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 592 acolidx += 8; 593 aval += 8; 594 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 595 acolidx += 8; 596 aval += 8; 597 j += 3; 598 break; 599 case 2: 600 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 601 acolidx += 8; 602 aval += 8; 603 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 604 acolidx += 8; 605 aval += 8; 606 j += 2; 607 break; 608 case 1: 609 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 610 acolidx += 8; 611 aval += 8; 612 j += 1; 613 break; 614 } 615 #pragma novector 616 for (; j < (a->sliidx[i + 1] >> 3); j += 4) { 617 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 618 acolidx += 8; 619 aval += 8; 620 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 621 acolidx += 8; 622 aval += 8; 623 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 624 acolidx += 8; 625 aval += 8; 626 AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4); 627 acolidx += 8; 628 aval += 8; 629 } 630 631 vec_y = _mm512_add_pd(vec_y, vec_y2); 632 vec_y = _mm512_add_pd(vec_y, vec_y3); 633 vec_y = _mm512_add_pd(vec_y, vec_y4); 634 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 635 _mm512_mask_storeu_pd(&z[8 * i], mask, vec_y); 636 } else { 637 _mm512_storeu_pd(&z[8 * i], vec_y); 638 } 639 } 640 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 641 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 642 for (i = 0; i < totalslices; i++) { /* loop over full slices */ 643 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 644 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 645 646 /* last slice may have padding rows. Don't use vectorization. */ 647 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 648 for (r = 0; r < (A->rmap->n & 0x07); ++r) { 649 row = 8 * i + r; 650 yval = (MatScalar)0.0; 651 nnz_in_row = a->rlen[row]; 652 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]]; 653 z[row] = y[row] + yval; 654 } 655 break; 656 } 657 658 vec_y = _mm256_loadu_pd(y + 8 * i); 659 vec_y2 = _mm256_loadu_pd(y + 8 * i + 4); 660 661 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 662 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 663 vec_vals = _mm256_loadu_pd(aval); 664 vec_x_tmp = _mm_setzero_pd(); 665 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 666 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 667 vec_x = _mm256_setzero_pd(); 668 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 669 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 670 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 671 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 672 vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y); 673 aval += 4; 674 675 vec_vals = _mm256_loadu_pd(aval); 676 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 677 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 678 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 679 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 680 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 681 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 682 vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2); 683 aval += 4; 684 } 685 686 _mm256_storeu_pd(z + i * 8, vec_y); 687 _mm256_storeu_pd(z + i * 8 + 4, vec_y2); 688 } 689 #else 690 PetscCall(PetscMalloc1(sliceheight, &sum)); 691 for (i = 0; i < totalslices; i++) { /* loop over slices */ 692 for (j = 0; j < sliceheight; j++) { 693 sum[j] = 0.0; 694 for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]]; 695 } 696 if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { 697 for (j = 0; j < (A->rmap->n % sliceheight); j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j]; 698 } else { 699 for (j = 0; j < sliceheight; j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j]; 700 } 701 } 702 PetscCall(PetscFree(sum)); 703 #endif 704 705 PetscCall(PetscLogFlops(2.0 * a->nz)); 706 PetscCall(VecRestoreArrayRead(xx, &x)); 707 PetscCall(VecRestoreArrayPair(yy, zz, &y, &z)); 708 PetscFunctionReturn(PETSC_SUCCESS); 709 } 710 711 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A, Vec xx, Vec zz, Vec yy) 712 { 713 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 714 PetscScalar *y; 715 const PetscScalar *x; 716 const MatScalar *aval = a->val; 717 const PetscInt *acolidx = a->colidx; 718 PetscInt i, j, r, row, nnz_in_row, totalslices = a->totalslices, sliceheight = a->sliceheight; 719 720 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 721 #pragma disjoint(*x, *y, *aval) 722 #endif 723 724 PetscFunctionBegin; 725 if (A->symmetric == PETSC_BOOL3_TRUE) { 726 PetscCall(MatMultAdd_SeqSELL(A, xx, zz, yy)); 727 PetscFunctionReturn(PETSC_SUCCESS); 728 } 729 if (zz != yy) PetscCall(VecCopy(zz, yy)); 730 731 if (a->nz) { 732 PetscCall(VecGetArrayRead(xx, &x)); 733 PetscCall(VecGetArray(yy, &y)); 734 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 735 if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { 736 for (r = 0; r < (A->rmap->n % sliceheight); ++r) { 737 row = sliceheight * i + r; 738 nnz_in_row = a->rlen[row]; 739 for (j = 0; j < nnz_in_row; ++j) y[acolidx[sliceheight * j + r]] += aval[sliceheight * j + r] * x[row]; 740 } 741 break; 742 } 743 for (r = 0; r < sliceheight; ++r) 744 for (j = a->sliidx[i] + r; j < a->sliidx[i + 1]; j += sliceheight) y[acolidx[j]] += aval[j] * x[sliceheight * i + r]; 745 } 746 PetscCall(PetscLogFlops(2.0 * a->nz)); 747 PetscCall(VecRestoreArrayRead(xx, &x)); 748 PetscCall(VecRestoreArray(yy, &y)); 749 } 750 PetscFunctionReturn(PETSC_SUCCESS); 751 } 752 753 PetscErrorCode MatMultTranspose_SeqSELL(Mat A, Vec xx, Vec yy) 754 { 755 PetscFunctionBegin; 756 if (A->symmetric == PETSC_BOOL3_TRUE) { 757 PetscCall(MatMult_SeqSELL(A, xx, yy)); 758 } else { 759 PetscCall(VecSet(yy, 0.0)); 760 PetscCall(MatMultTransposeAdd_SeqSELL(A, xx, yy, yy)); 761 } 762 PetscFunctionReturn(PETSC_SUCCESS); 763 } 764 765 /* 766 Checks for missing diagonals 767 */ 768 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A, PetscBool *missing, PetscInt *d) 769 { 770 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 771 PetscInt *diag, i; 772 773 PetscFunctionBegin; 774 *missing = PETSC_FALSE; 775 if (A->rmap->n > 0 && !(a->colidx)) { 776 *missing = PETSC_TRUE; 777 if (d) *d = 0; 778 PetscCall(PetscInfo(A, "Matrix has no entries therefore is missing diagonal\n")); 779 } else { 780 diag = a->diag; 781 for (i = 0; i < A->rmap->n; i++) { 782 if (diag[i] == -1) { 783 *missing = PETSC_TRUE; 784 if (d) *d = i; 785 PetscCall(PetscInfo(A, "Matrix is missing diagonal number %" PetscInt_FMT "\n", i)); 786 break; 787 } 788 } 789 } 790 PetscFunctionReturn(PETSC_SUCCESS); 791 } 792 793 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A) 794 { 795 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 796 PetscInt i, j, m = A->rmap->n, shift; 797 798 PetscFunctionBegin; 799 if (!a->diag) { 800 PetscCall(PetscMalloc1(m, &a->diag)); 801 a->free_diag = PETSC_TRUE; 802 } 803 for (i = 0; i < m; i++) { /* loop over rows */ 804 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 805 a->diag[i] = -1; 806 for (j = 0; j < a->rlen[i]; j++) { 807 if (a->colidx[shift + a->sliceheight * j] == i) { 808 a->diag[i] = shift + a->sliceheight * j; 809 break; 810 } 811 } 812 } 813 PetscFunctionReturn(PETSC_SUCCESS); 814 } 815 816 /* 817 Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways 818 */ 819 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A, PetscScalar omega, PetscScalar fshift) 820 { 821 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 822 PetscInt i, *diag, m = A->rmap->n; 823 MatScalar *val = a->val; 824 PetscScalar *idiag, *mdiag; 825 826 PetscFunctionBegin; 827 if (a->idiagvalid) PetscFunctionReturn(PETSC_SUCCESS); 828 PetscCall(MatMarkDiagonal_SeqSELL(A)); 829 diag = a->diag; 830 if (!a->idiag) { 831 PetscCall(PetscMalloc3(m, &a->idiag, m, &a->mdiag, m, &a->ssor_work)); 832 val = a->val; 833 } 834 mdiag = a->mdiag; 835 idiag = a->idiag; 836 837 if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) { 838 for (i = 0; i < m; i++) { 839 mdiag[i] = val[diag[i]]; 840 if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */ 841 PetscCheck(PetscRealPart(fshift), PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Zero diagonal on row %" PetscInt_FMT, i); 842 PetscCall(PetscInfo(A, "Zero diagonal on row %" PetscInt_FMT "\n", i)); 843 A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 844 A->factorerror_zeropivot_value = 0.0; 845 A->factorerror_zeropivot_row = i; 846 } 847 idiag[i] = 1.0 / val[diag[i]]; 848 } 849 PetscCall(PetscLogFlops(m)); 850 } else { 851 for (i = 0; i < m; i++) { 852 mdiag[i] = val[diag[i]]; 853 idiag[i] = omega / (fshift + val[diag[i]]); 854 } 855 PetscCall(PetscLogFlops(2.0 * m)); 856 } 857 a->idiagvalid = PETSC_TRUE; 858 PetscFunctionReturn(PETSC_SUCCESS); 859 } 860 861 PetscErrorCode MatZeroEntries_SeqSELL(Mat A) 862 { 863 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 864 865 PetscFunctionBegin; 866 PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices])); 867 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 868 PetscFunctionReturn(PETSC_SUCCESS); 869 } 870 871 PetscErrorCode MatDestroy_SeqSELL(Mat A) 872 { 873 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 874 875 PetscFunctionBegin; 876 PetscCall(PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->n, A->cmap->n, a->nz)); 877 PetscCall(MatSeqXSELLFreeSELL(A, &a->val, &a->colidx)); 878 PetscCall(ISDestroy(&a->row)); 879 PetscCall(ISDestroy(&a->col)); 880 PetscCall(PetscFree(a->diag)); 881 PetscCall(PetscFree(a->rlen)); 882 PetscCall(PetscFree(a->sliidx)); 883 PetscCall(PetscFree3(a->idiag, a->mdiag, a->ssor_work)); 884 PetscCall(PetscFree(a->solve_work)); 885 PetscCall(ISDestroy(&a->icol)); 886 PetscCall(PetscFree(a->saved_values)); 887 PetscCall(PetscFree2(a->getrowcols, a->getrowvals)); 888 PetscCall(PetscFree(A->data)); 889 #if defined(PETSC_HAVE_CUDA) 890 PetscCall(PetscFree(a->chunk_slice_map)); 891 #endif 892 893 PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL)); 894 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL)); 895 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL)); 896 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetPreallocation_C", NULL)); 897 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetArray_C", NULL)); 898 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLRestoreArray_C", NULL)); 899 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqaij_C", NULL)); 900 #if defined(PETSC_HAVE_CUDA) 901 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqsellcuda_C", NULL)); 902 #endif 903 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetFillRatio_C", NULL)); 904 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetMaxSliceWidth_C", NULL)); 905 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetAvgSliceWidth_C", NULL)); 906 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetVarSliceSize_C", NULL)); 907 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetSliceHeight_C", NULL)); 908 PetscFunctionReturn(PETSC_SUCCESS); 909 } 910 911 PetscErrorCode MatSetOption_SeqSELL(Mat A, MatOption op, PetscBool flg) 912 { 913 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 914 915 PetscFunctionBegin; 916 switch (op) { 917 case MAT_ROW_ORIENTED: 918 a->roworiented = flg; 919 break; 920 case MAT_KEEP_NONZERO_PATTERN: 921 a->keepnonzeropattern = flg; 922 break; 923 case MAT_NEW_NONZERO_LOCATIONS: 924 a->nonew = (flg ? 0 : 1); 925 break; 926 case MAT_NEW_NONZERO_LOCATION_ERR: 927 a->nonew = (flg ? -1 : 0); 928 break; 929 case MAT_NEW_NONZERO_ALLOCATION_ERR: 930 a->nonew = (flg ? -2 : 0); 931 break; 932 case MAT_UNUSED_NONZERO_LOCATION_ERR: 933 a->nounused = (flg ? -1 : 0); 934 break; 935 case MAT_FORCE_DIAGONAL_ENTRIES: 936 case MAT_IGNORE_OFF_PROC_ENTRIES: 937 case MAT_USE_HASH_TABLE: 938 case MAT_SORTED_FULL: 939 PetscCall(PetscInfo(A, "Option %s ignored\n", MatOptions[op])); 940 break; 941 case MAT_SPD: 942 case MAT_SYMMETRIC: 943 case MAT_STRUCTURALLY_SYMMETRIC: 944 case MAT_HERMITIAN: 945 case MAT_SYMMETRY_ETERNAL: 946 case MAT_STRUCTURAL_SYMMETRY_ETERNAL: 947 case MAT_SPD_ETERNAL: 948 /* These options are handled directly by MatSetOption() */ 949 break; 950 default: 951 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unknown option %d", op); 952 } 953 PetscFunctionReturn(PETSC_SUCCESS); 954 } 955 956 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A, Vec v) 957 { 958 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 959 PetscInt i, j, n, shift; 960 PetscScalar *x, zero = 0.0; 961 962 PetscFunctionBegin; 963 PetscCall(VecGetLocalSize(v, &n)); 964 PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 965 966 if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) { 967 PetscInt *diag = a->diag; 968 PetscCall(VecGetArray(v, &x)); 969 for (i = 0; i < n; i++) x[i] = 1.0 / a->val[diag[i]]; 970 PetscCall(VecRestoreArray(v, &x)); 971 PetscFunctionReturn(PETSC_SUCCESS); 972 } 973 974 PetscCall(VecSet(v, zero)); 975 PetscCall(VecGetArray(v, &x)); 976 for (i = 0; i < n; i++) { /* loop over rows */ 977 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 978 x[i] = 0; 979 for (j = 0; j < a->rlen[i]; j++) { 980 if (a->colidx[shift + a->sliceheight * j] == i) { 981 x[i] = a->val[shift + a->sliceheight * j]; 982 break; 983 } 984 } 985 } 986 PetscCall(VecRestoreArray(v, &x)); 987 PetscFunctionReturn(PETSC_SUCCESS); 988 } 989 990 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A, Vec ll, Vec rr) 991 { 992 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 993 const PetscScalar *l, *r; 994 PetscInt i, j, m, n, row; 995 996 PetscFunctionBegin; 997 if (ll) { 998 /* The local size is used so that VecMPI can be passed to this routine 999 by MatDiagonalScale_MPISELL */ 1000 PetscCall(VecGetLocalSize(ll, &m)); 1001 PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 1002 PetscCall(VecGetArrayRead(ll, &l)); 1003 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 1004 if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */ 1005 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) { 1006 if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= l[a->sliceheight * i + row]; 1007 } 1008 } else { 1009 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) { a->val[j] *= l[a->sliceheight * i + row]; } 1010 } 1011 } 1012 PetscCall(VecRestoreArrayRead(ll, &l)); 1013 PetscCall(PetscLogFlops(a->nz)); 1014 } 1015 if (rr) { 1016 PetscCall(VecGetLocalSize(rr, &n)); 1017 PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 1018 PetscCall(VecGetArrayRead(rr, &r)); 1019 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 1020 if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */ 1021 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) % a->sliceheight)) { 1022 if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= r[a->colidx[j]]; 1023 } 1024 } else { 1025 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j++) a->val[j] *= r[a->colidx[j]]; 1026 } 1027 } 1028 PetscCall(VecRestoreArrayRead(rr, &r)); 1029 PetscCall(PetscLogFlops(a->nz)); 1030 } 1031 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 1032 #if defined(PETSC_HAVE_CUDA) 1033 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU; 1034 #endif 1035 PetscFunctionReturn(PETSC_SUCCESS); 1036 } 1037 1038 PetscErrorCode MatGetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[]) 1039 { 1040 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1041 PetscInt *cp, i, k, low, high, t, row, col, l; 1042 PetscInt shift; 1043 MatScalar *vp; 1044 1045 PetscFunctionBegin; 1046 for (k = 0; k < m; k++) { /* loop over requested rows */ 1047 row = im[k]; 1048 if (row < 0) continue; 1049 PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1); 1050 shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight); /* starting index of the row */ 1051 cp = a->colidx + shift; /* pointer to the row */ 1052 vp = a->val + shift; /* pointer to the row */ 1053 for (l = 0; l < n; l++) { /* loop over requested columns */ 1054 col = in[l]; 1055 if (col < 0) continue; 1056 PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1); 1057 high = a->rlen[row]; 1058 low = 0; /* assume unsorted */ 1059 while (high - low > 5) { 1060 t = (low + high) / 2; 1061 if (*(cp + a->sliceheight * t) > col) high = t; 1062 else low = t; 1063 } 1064 for (i = low; i < high; i++) { 1065 if (*(cp + a->sliceheight * i) > col) break; 1066 if (*(cp + a->sliceheight * i) == col) { 1067 *v++ = *(vp + a->sliceheight * i); 1068 goto finished; 1069 } 1070 } 1071 *v++ = 0.0; 1072 finished:; 1073 } 1074 } 1075 PetscFunctionReturn(PETSC_SUCCESS); 1076 } 1077 1078 static PetscErrorCode MatView_SeqSELL_ASCII(Mat A, PetscViewer viewer) 1079 { 1080 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1081 PetscInt i, j, m = A->rmap->n, shift; 1082 const char *name; 1083 PetscViewerFormat format; 1084 1085 PetscFunctionBegin; 1086 PetscCall(PetscViewerGetFormat(viewer, &format)); 1087 if (format == PETSC_VIEWER_ASCII_MATLAB) { 1088 PetscInt nofinalvalue = 0; 1089 /* 1090 if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) { 1091 nofinalvalue = 1; 1092 } 1093 */ 1094 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1095 PetscCall(PetscViewerASCIIPrintf(viewer, "%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n", m, A->cmap->n)); 1096 PetscCall(PetscViewerASCIIPrintf(viewer, "%% Nonzeros = %" PetscInt_FMT " \n", a->nz)); 1097 #if defined(PETSC_USE_COMPLEX) 1098 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",4);\n", a->nz + nofinalvalue)); 1099 #else 1100 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",3);\n", a->nz + nofinalvalue)); 1101 #endif 1102 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = [\n")); 1103 1104 for (i = 0; i < m; i++) { 1105 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1106 for (j = 0; j < a->rlen[i]; j++) { 1107 #if defined(PETSC_USE_COMPLEX) 1108 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1109 #else 1110 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)a->val[shift + a->sliceheight * j])); 1111 #endif 1112 } 1113 } 1114 /* 1115 if (nofinalvalue) { 1116 #if defined(PETSC_USE_COMPLEX) 1117 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n",m,A->cmap->n,0.,0.)); 1118 #else 1119 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n",m,A->cmap->n,0.0)); 1120 #endif 1121 } 1122 */ 1123 PetscCall(PetscObjectGetName((PetscObject)A, &name)); 1124 PetscCall(PetscViewerASCIIPrintf(viewer, "];\n %s = spconvert(zzz);\n", name)); 1125 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1126 } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) { 1127 PetscFunctionReturn(PETSC_SUCCESS); 1128 } else if (format == PETSC_VIEWER_ASCII_COMMON) { 1129 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1130 for (i = 0; i < m; i++) { 1131 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1132 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1133 for (j = 0; j < a->rlen[i]; j++) { 1134 #if defined(PETSC_USE_COMPLEX) 1135 if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) { 1136 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1137 } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) { 1138 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1139 } else if (PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) { 1140 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]))); 1141 } 1142 #else 1143 if (a->val[shift + a->sliceheight * j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j])); 1144 #endif 1145 } 1146 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1147 } 1148 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1149 } else if (format == PETSC_VIEWER_ASCII_DENSE) { 1150 PetscInt cnt = 0, jcnt; 1151 PetscScalar value; 1152 #if defined(PETSC_USE_COMPLEX) 1153 PetscBool realonly = PETSC_TRUE; 1154 for (i = 0; i < a->sliidx[a->totalslices]; i++) { 1155 if (PetscImaginaryPart(a->val[i]) != 0.0) { 1156 realonly = PETSC_FALSE; 1157 break; 1158 } 1159 } 1160 #endif 1161 1162 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1163 for (i = 0; i < m; i++) { 1164 jcnt = 0; 1165 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1166 for (j = 0; j < A->cmap->n; j++) { 1167 if (jcnt < a->rlen[i] && j == a->colidx[shift + a->sliceheight * j]) { 1168 value = a->val[cnt++]; 1169 jcnt++; 1170 } else { 1171 value = 0.0; 1172 } 1173 #if defined(PETSC_USE_COMPLEX) 1174 if (realonly) { 1175 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)PetscRealPart(value))); 1176 } else { 1177 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e+%7.5e i ", (double)PetscRealPart(value), (double)PetscImaginaryPart(value))); 1178 } 1179 #else 1180 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)value)); 1181 #endif 1182 } 1183 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1184 } 1185 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1186 } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) { 1187 PetscInt fshift = 1; 1188 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1189 #if defined(PETSC_USE_COMPLEX) 1190 PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate complex general\n")); 1191 #else 1192 PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate real general\n")); 1193 #endif 1194 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz)); 1195 for (i = 0; i < m; i++) { 1196 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1197 for (j = 0; j < a->rlen[i]; j++) { 1198 #if defined(PETSC_USE_COMPLEX) 1199 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1200 #else 1201 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)a->val[shift + a->sliceheight * j])); 1202 #endif 1203 } 1204 } 1205 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1206 } else if (format == PETSC_VIEWER_NATIVE) { 1207 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 1208 PetscInt row; 1209 PetscCall(PetscViewerASCIIPrintf(viewer, "slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n", i, a->sliidx[i], a->sliidx[i + 1])); 1210 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) { 1211 #if defined(PETSC_USE_COMPLEX) 1212 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1213 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1214 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1215 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), -(double)PetscImaginaryPart(a->val[j]))); 1216 } else { 1217 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]))); 1218 } 1219 #else 1220 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)a->val[j])); 1221 #endif 1222 } 1223 } 1224 } else { 1225 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1226 if (A->factortype) { 1227 for (i = 0; i < m; i++) { 1228 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1229 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1230 /* L part */ 1231 for (j = shift; j < a->diag[i]; j += a->sliceheight) { 1232 #if defined(PETSC_USE_COMPLEX) 1233 if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0) { 1234 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1235 } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0) { 1236 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j])))); 1237 } else { 1238 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j]))); 1239 } 1240 #else 1241 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j])); 1242 #endif 1243 } 1244 /* diagonal */ 1245 j = a->diag[i]; 1246 #if defined(PETSC_USE_COMPLEX) 1247 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1248 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)PetscImaginaryPart(1.0 / a->val[j]))); 1249 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1250 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)(-PetscImaginaryPart(1.0 / a->val[j])))); 1251 } else { 1252 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]))); 1253 } 1254 #else 1255 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)(1.0 / a->val[j]))); 1256 #endif 1257 1258 /* U part */ 1259 for (j = a->diag[i] + 1; j < shift + a->sliceheight * a->rlen[i]; j += a->sliceheight) { 1260 #if defined(PETSC_USE_COMPLEX) 1261 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1262 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1263 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1264 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j])))); 1265 } else { 1266 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j]))); 1267 } 1268 #else 1269 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j])); 1270 #endif 1271 } 1272 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1273 } 1274 } else { 1275 for (i = 0; i < m; i++) { 1276 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1277 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1278 for (j = 0; j < a->rlen[i]; j++) { 1279 #if defined(PETSC_USE_COMPLEX) 1280 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1281 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1282 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1283 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1284 } else { 1285 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]))); 1286 } 1287 #else 1288 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j])); 1289 #endif 1290 } 1291 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1292 } 1293 } 1294 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1295 } 1296 PetscCall(PetscViewerFlush(viewer)); 1297 PetscFunctionReturn(PETSC_SUCCESS); 1298 } 1299 1300 #include <petscdraw.h> 1301 static PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw, void *Aa) 1302 { 1303 Mat A = (Mat)Aa; 1304 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1305 PetscInt i, j, m = A->rmap->n, shift; 1306 int color; 1307 PetscReal xl, yl, xr, yr, x_l, x_r, y_l, y_r; 1308 PetscViewer viewer; 1309 PetscViewerFormat format; 1310 1311 PetscFunctionBegin; 1312 PetscCall(PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer)); 1313 PetscCall(PetscViewerGetFormat(viewer, &format)); 1314 PetscCall(PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr)); 1315 1316 /* loop over matrix elements drawing boxes */ 1317 1318 if (format != PETSC_VIEWER_DRAW_CONTOUR) { 1319 PetscDrawCollectiveBegin(draw); 1320 /* Blue for negative, Cyan for zero and Red for positive */ 1321 color = PETSC_DRAW_BLUE; 1322 for (i = 0; i < m; i++) { 1323 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1324 y_l = m - i - 1.0; 1325 y_r = y_l + 1.0; 1326 for (j = 0; j < a->rlen[i]; j++) { 1327 x_l = a->colidx[shift + a->sliceheight * j]; 1328 x_r = x_l + 1.0; 1329 if (PetscRealPart(a->val[shift + a->sliceheight * j]) >= 0.) continue; 1330 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1331 } 1332 } 1333 color = PETSC_DRAW_CYAN; 1334 for (i = 0; i < m; i++) { 1335 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1336 y_l = m - i - 1.0; 1337 y_r = y_l + 1.0; 1338 for (j = 0; j < a->rlen[i]; j++) { 1339 x_l = a->colidx[shift + a->sliceheight * j]; 1340 x_r = x_l + 1.0; 1341 if (a->val[shift + a->sliceheight * j] != 0.) continue; 1342 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1343 } 1344 } 1345 color = PETSC_DRAW_RED; 1346 for (i = 0; i < m; i++) { 1347 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1348 y_l = m - i - 1.0; 1349 y_r = y_l + 1.0; 1350 for (j = 0; j < a->rlen[i]; j++) { 1351 x_l = a->colidx[shift + a->sliceheight * j]; 1352 x_r = x_l + 1.0; 1353 if (PetscRealPart(a->val[shift + a->sliceheight * j]) <= 0.) continue; 1354 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1355 } 1356 } 1357 PetscDrawCollectiveEnd(draw); 1358 } else { 1359 /* use contour shading to indicate magnitude of values */ 1360 /* first determine max of all nonzero values */ 1361 PetscReal minv = 0.0, maxv = 0.0; 1362 PetscInt count = 0; 1363 PetscDraw popup; 1364 for (i = 0; i < a->sliidx[a->totalslices]; i++) { 1365 if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]); 1366 } 1367 if (minv >= maxv) maxv = minv + PETSC_SMALL; 1368 PetscCall(PetscDrawGetPopup(draw, &popup)); 1369 PetscCall(PetscDrawScalePopup(popup, minv, maxv)); 1370 1371 PetscDrawCollectiveBegin(draw); 1372 for (i = 0; i < m; i++) { 1373 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1374 y_l = m - i - 1.0; 1375 y_r = y_l + 1.0; 1376 for (j = 0; j < a->rlen[i]; j++) { 1377 x_l = a->colidx[shift + a->sliceheight * j]; 1378 x_r = x_l + 1.0; 1379 color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]), minv, maxv); 1380 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1381 count++; 1382 } 1383 } 1384 PetscDrawCollectiveEnd(draw); 1385 } 1386 PetscFunctionReturn(PETSC_SUCCESS); 1387 } 1388 1389 #include <petscdraw.h> 1390 static PetscErrorCode MatView_SeqSELL_Draw(Mat A, PetscViewer viewer) 1391 { 1392 PetscDraw draw; 1393 PetscReal xr, yr, xl, yl, h, w; 1394 PetscBool isnull; 1395 1396 PetscFunctionBegin; 1397 PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw)); 1398 PetscCall(PetscDrawIsNull(draw, &isnull)); 1399 if (isnull) PetscFunctionReturn(PETSC_SUCCESS); 1400 1401 xr = A->cmap->n; 1402 yr = A->rmap->n; 1403 h = yr / 10.0; 1404 w = xr / 10.0; 1405 xr += w; 1406 yr += h; 1407 xl = -w; 1408 yl = -h; 1409 PetscCall(PetscDrawSetCoordinates(draw, xl, yl, xr, yr)); 1410 PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer)); 1411 PetscCall(PetscDrawZoom(draw, MatView_SeqSELL_Draw_Zoom, A)); 1412 PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL)); 1413 PetscCall(PetscDrawSave(draw)); 1414 PetscFunctionReturn(PETSC_SUCCESS); 1415 } 1416 1417 PetscErrorCode MatView_SeqSELL(Mat A, PetscViewer viewer) 1418 { 1419 PetscBool iascii, isbinary, isdraw; 1420 1421 PetscFunctionBegin; 1422 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii)); 1423 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary)); 1424 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw)); 1425 if (iascii) { 1426 PetscCall(MatView_SeqSELL_ASCII(A, viewer)); 1427 } else if (isbinary) { 1428 /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */ 1429 } else if (isdraw) PetscCall(MatView_SeqSELL_Draw(A, viewer)); 1430 PetscFunctionReturn(PETSC_SUCCESS); 1431 } 1432 1433 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A, MatAssemblyType mode) 1434 { 1435 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1436 PetscInt i, shift, row_in_slice, row, nrow, *cp, lastcol, j, k; 1437 MatScalar *vp; 1438 #if defined(PETSC_HAVE_CUDA) 1439 PetscInt totalchunks = 0; 1440 #endif 1441 1442 PetscFunctionBegin; 1443 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 1444 /* To do: compress out the unused elements */ 1445 PetscCall(MatMarkDiagonal_SeqSELL(A)); 1446 PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n", A->rmap->n, A->cmap->n, a->maxallocmat, a->sliidx[a->totalslices], a->nz, a->sliidx[a->totalslices] - a->nz)); 1447 PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n", a->reallocs)); 1448 PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rlenmax)); 1449 a->nonzerorowcnt = 0; 1450 /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */ 1451 for (i = 0; i < a->totalslices; ++i) { 1452 shift = a->sliidx[i]; /* starting index of the slice */ 1453 cp = a->colidx + shift; /* pointer to the column indices of the slice */ 1454 vp = a->val + shift; /* pointer to the nonzero values of the slice */ 1455 for (row_in_slice = 0; row_in_slice < a->sliceheight; ++row_in_slice) { /* loop over rows in the slice */ 1456 row = a->sliceheight * i + row_in_slice; 1457 nrow = a->rlen[row]; /* number of nonzeros in row */ 1458 /* 1459 Search for the nearest nonzero. Normally setting the index to zero may cause extra communication. 1460 But if the entire slice are empty, it is fine to use 0 since the index will not be loaded. 1461 */ 1462 lastcol = 0; 1463 if (nrow > 0) { /* nonempty row */ 1464 a->nonzerorowcnt++; 1465 lastcol = cp[a->sliceheight * (nrow - 1) + row_in_slice]; /* use the index from the last nonzero at current row */ 1466 } else if (!row_in_slice) { /* first row of the correct slice is empty */ 1467 for (j = 1; j < a->sliceheight; j++) { 1468 if (a->rlen[a->sliceheight * i + j]) { 1469 lastcol = cp[j]; 1470 break; 1471 } 1472 } 1473 } else { 1474 if (a->sliidx[i + 1] != shift) lastcol = cp[row_in_slice - 1]; /* use the index from the previous row */ 1475 } 1476 1477 for (k = nrow; k < (a->sliidx[i + 1] - shift) / a->sliceheight; ++k) { 1478 cp[a->sliceheight * k + row_in_slice] = lastcol; 1479 vp[a->sliceheight * k + row_in_slice] = (MatScalar)0; 1480 } 1481 } 1482 } 1483 1484 A->info.mallocs += a->reallocs; 1485 a->reallocs = 0; 1486 1487 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 1488 #if defined(PETSC_HAVE_CUDA) 1489 if (!a->chunksize && a->totalslices) { 1490 a->chunksize = 64; 1491 while (a->chunksize < 1024 && 2 * a->chunksize <= a->sliidx[a->totalslices] / a->totalslices) a->chunksize *= 2; 1492 totalchunks = 1 + (a->sliidx[a->totalslices] - 1) / a->chunksize; 1493 } 1494 if (totalchunks != a->totalchunks) { 1495 PetscCall(PetscFree(a->chunk_slice_map)); 1496 PetscCall(PetscMalloc1(totalchunks, &a->chunk_slice_map)); 1497 a->totalchunks = totalchunks; 1498 } 1499 j = 0; 1500 for (i = 0; i < totalchunks; i++) { 1501 while (a->sliidx[j + 1] <= i * a->chunksize && j < a->totalslices) j++; 1502 a->chunk_slice_map[i] = j; 1503 } 1504 #endif 1505 PetscFunctionReturn(PETSC_SUCCESS); 1506 } 1507 1508 PetscErrorCode MatGetInfo_SeqSELL(Mat A, MatInfoType flag, MatInfo *info) 1509 { 1510 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1511 1512 PetscFunctionBegin; 1513 info->block_size = 1.0; 1514 info->nz_allocated = a->maxallocmat; 1515 info->nz_used = a->sliidx[a->totalslices]; /* include padding zeros */ 1516 info->nz_unneeded = (a->maxallocmat - a->sliidx[a->totalslices]); 1517 info->assemblies = A->num_ass; 1518 info->mallocs = A->info.mallocs; 1519 info->memory = 0; /* REVIEW ME */ 1520 if (A->factortype) { 1521 info->fill_ratio_given = A->info.fill_ratio_given; 1522 info->fill_ratio_needed = A->info.fill_ratio_needed; 1523 info->factor_mallocs = A->info.factor_mallocs; 1524 } else { 1525 info->fill_ratio_given = 0; 1526 info->fill_ratio_needed = 0; 1527 info->factor_mallocs = 0; 1528 } 1529 PetscFunctionReturn(PETSC_SUCCESS); 1530 } 1531 1532 PetscErrorCode MatSetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is) 1533 { 1534 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1535 PetscInt shift, i, k, l, low, high, t, ii, row, col, nrow; 1536 PetscInt *cp, nonew = a->nonew, lastcol = -1; 1537 MatScalar *vp, value; 1538 #if defined(PETSC_HAVE_CUDA) 1539 PetscBool inserted = PETSC_FALSE; 1540 PetscInt mul = DEVICE_MEM_ALIGN / a->sliceheight; 1541 #endif 1542 1543 PetscFunctionBegin; 1544 for (k = 0; k < m; k++) { /* loop over added rows */ 1545 row = im[k]; 1546 if (row < 0) continue; 1547 PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1); 1548 shift = a->sliidx[row / a->sliceheight] + row % a->sliceheight; /* starting index of the row */ 1549 cp = a->colidx + shift; /* pointer to the row */ 1550 vp = a->val + shift; /* pointer to the row */ 1551 nrow = a->rlen[row]; 1552 low = 0; 1553 high = nrow; 1554 1555 for (l = 0; l < n; l++) { /* loop over added columns */ 1556 col = in[l]; 1557 if (col < 0) continue; 1558 PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1); 1559 if (a->roworiented) { 1560 value = v[l + k * n]; 1561 } else { 1562 value = v[k + l * m]; 1563 } 1564 if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue; 1565 1566 /* search in this row for the specified column, i indicates the column to be set */ 1567 if (col <= lastcol) low = 0; 1568 else high = nrow; 1569 lastcol = col; 1570 while (high - low > 5) { 1571 t = (low + high) / 2; 1572 if (*(cp + a->sliceheight * t) > col) high = t; 1573 else low = t; 1574 } 1575 for (i = low; i < high; i++) { 1576 if (*(cp + a->sliceheight * i) > col) break; 1577 if (*(cp + a->sliceheight * i) == col) { 1578 if (is == ADD_VALUES) *(vp + a->sliceheight * i) += value; 1579 else *(vp + a->sliceheight * i) = value; 1580 #if defined(PETSC_HAVE_CUDA) 1581 inserted = PETSC_TRUE; 1582 #endif 1583 low = i + 1; 1584 goto noinsert; 1585 } 1586 } 1587 if (value == 0.0 && a->ignorezeroentries) goto noinsert; 1588 if (nonew == 1) goto noinsert; 1589 PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col); 1590 #if defined(PETSC_HAVE_CUDA) 1591 MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar, mul); 1592 #else 1593 /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */ 1594 MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar, 1); 1595 #endif 1596 /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */ 1597 for (ii = nrow - 1; ii >= i; ii--) { 1598 *(cp + a->sliceheight * (ii + 1)) = *(cp + a->sliceheight * ii); 1599 *(vp + a->sliceheight * (ii + 1)) = *(vp + a->sliceheight * ii); 1600 } 1601 a->rlen[row]++; 1602 *(cp + a->sliceheight * i) = col; 1603 *(vp + a->sliceheight * i) = value; 1604 a->nz++; 1605 A->nonzerostate++; 1606 #if defined(PETSC_HAVE_CUDA) 1607 inserted = PETSC_TRUE; 1608 #endif 1609 low = i + 1; 1610 high++; 1611 nrow++; 1612 noinsert:; 1613 } 1614 a->rlen[row] = nrow; 1615 } 1616 #if defined(PETSC_HAVE_CUDA) 1617 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && inserted) A->offloadmask = PETSC_OFFLOAD_CPU; 1618 #endif 1619 PetscFunctionReturn(PETSC_SUCCESS); 1620 } 1621 1622 PetscErrorCode MatCopy_SeqSELL(Mat A, Mat B, MatStructure str) 1623 { 1624 PetscFunctionBegin; 1625 /* If the two matrices have the same copy implementation, use fast copy. */ 1626 if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) { 1627 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1628 Mat_SeqSELL *b = (Mat_SeqSELL *)B->data; 1629 1630 PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices], PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Number of nonzeros in two matrices are different"); 1631 PetscCall(PetscArraycpy(b->val, a->val, a->sliidx[a->totalslices])); 1632 } else { 1633 PetscCall(MatCopy_Basic(A, B, str)); 1634 } 1635 PetscFunctionReturn(PETSC_SUCCESS); 1636 } 1637 1638 PetscErrorCode MatSetUp_SeqSELL(Mat A) 1639 { 1640 PetscFunctionBegin; 1641 PetscCall(MatSeqSELLSetPreallocation(A, PETSC_DEFAULT, NULL)); 1642 PetscFunctionReturn(PETSC_SUCCESS); 1643 } 1644 1645 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A, PetscScalar *array[]) 1646 { 1647 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1648 1649 PetscFunctionBegin; 1650 *array = a->val; 1651 PetscFunctionReturn(PETSC_SUCCESS); 1652 } 1653 1654 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A, PetscScalar *array[]) 1655 { 1656 PetscFunctionBegin; 1657 PetscFunctionReturn(PETSC_SUCCESS); 1658 } 1659 1660 PetscErrorCode MatScale_SeqSELL(Mat inA, PetscScalar alpha) 1661 { 1662 Mat_SeqSELL *a = (Mat_SeqSELL *)inA->data; 1663 MatScalar *aval = a->val; 1664 PetscScalar oalpha = alpha; 1665 PetscBLASInt one = 1, size; 1666 1667 PetscFunctionBegin; 1668 PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices], &size)); 1669 PetscCallBLAS("BLASscal", BLASscal_(&size, &oalpha, aval, &one)); 1670 PetscCall(PetscLogFlops(a->nz)); 1671 PetscCall(MatSeqSELLInvalidateDiagonal(inA)); 1672 #if defined(PETSC_HAVE_CUDA) 1673 if (inA->offloadmask != PETSC_OFFLOAD_UNALLOCATED) inA->offloadmask = PETSC_OFFLOAD_CPU; 1674 #endif 1675 PetscFunctionReturn(PETSC_SUCCESS); 1676 } 1677 1678 PetscErrorCode MatShift_SeqSELL(Mat Y, PetscScalar a) 1679 { 1680 Mat_SeqSELL *y = (Mat_SeqSELL *)Y->data; 1681 1682 PetscFunctionBegin; 1683 if (!Y->preallocated || !y->nz) PetscCall(MatSeqSELLSetPreallocation(Y, 1, NULL)); 1684 PetscCall(MatShift_Basic(Y, a)); 1685 PetscFunctionReturn(PETSC_SUCCESS); 1686 } 1687 1688 PetscErrorCode MatSOR_SeqSELL(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx) 1689 { 1690 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1691 PetscScalar *x, sum, *t; 1692 const MatScalar *idiag = NULL, *mdiag; 1693 const PetscScalar *b, *xb; 1694 PetscInt n, m = A->rmap->n, i, j, shift; 1695 const PetscInt *diag; 1696 1697 PetscFunctionBegin; 1698 its = its * lits; 1699 1700 if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */ 1701 if (!a->idiagvalid) PetscCall(MatInvertDiagonal_SeqSELL(A, omega, fshift)); 1702 a->fshift = fshift; 1703 a->omega = omega; 1704 1705 diag = a->diag; 1706 t = a->ssor_work; 1707 idiag = a->idiag; 1708 mdiag = a->mdiag; 1709 1710 PetscCall(VecGetArray(xx, &x)); 1711 PetscCall(VecGetArrayRead(bb, &b)); 1712 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 1713 PetscCheck(flag != SOR_APPLY_UPPER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_UPPER is not implemented"); 1714 PetscCheck(flag != SOR_APPLY_LOWER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_LOWER is not implemented"); 1715 PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat"); 1716 1717 if (flag & SOR_ZERO_INITIAL_GUESS) { 1718 if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) { 1719 for (i = 0; i < m; i++) { 1720 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1721 sum = b[i]; 1722 n = (diag[i] - shift) / a->sliceheight; 1723 for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]]; 1724 t[i] = sum; 1725 x[i] = sum * idiag[i]; 1726 } 1727 xb = t; 1728 PetscCall(PetscLogFlops(a->nz)); 1729 } else xb = b; 1730 if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1731 for (i = m - 1; i >= 0; i--) { 1732 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1733 sum = xb[i]; 1734 n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1; 1735 for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]]; 1736 if (xb == b) { 1737 x[i] = sum * idiag[i]; 1738 } else { 1739 x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */ 1740 } 1741 } 1742 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */ 1743 } 1744 its--; 1745 } 1746 while (its--) { 1747 if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) { 1748 for (i = 0; i < m; i++) { 1749 /* lower */ 1750 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1751 sum = b[i]; 1752 n = (diag[i] - shift) / a->sliceheight; 1753 for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]]; 1754 t[i] = sum; /* save application of the lower-triangular part */ 1755 /* upper */ 1756 n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1; 1757 for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]]; 1758 x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */ 1759 } 1760 xb = t; 1761 PetscCall(PetscLogFlops(2.0 * a->nz)); 1762 } else xb = b; 1763 if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1764 for (i = m - 1; i >= 0; i--) { 1765 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1766 sum = xb[i]; 1767 if (xb == b) { 1768 /* whole matrix (no checkpointing available) */ 1769 n = a->rlen[i]; 1770 for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]]; 1771 x[i] = (1. - omega) * x[i] + (sum + mdiag[i] * x[i]) * idiag[i]; 1772 } else { /* lower-triangular part has been saved, so only apply upper-triangular */ 1773 n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1; 1774 for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]]; 1775 x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */ 1776 } 1777 } 1778 if (xb == b) { 1779 PetscCall(PetscLogFlops(2.0 * a->nz)); 1780 } else { 1781 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */ 1782 } 1783 } 1784 } 1785 PetscCall(VecRestoreArray(xx, &x)); 1786 PetscCall(VecRestoreArrayRead(bb, &b)); 1787 PetscFunctionReturn(PETSC_SUCCESS); 1788 } 1789 1790 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL, 1791 MatGetRow_SeqSELL, 1792 MatRestoreRow_SeqSELL, 1793 MatMult_SeqSELL, 1794 /* 4*/ MatMultAdd_SeqSELL, 1795 MatMultTranspose_SeqSELL, 1796 MatMultTransposeAdd_SeqSELL, 1797 NULL, 1798 NULL, 1799 NULL, 1800 /* 10*/ NULL, 1801 NULL, 1802 NULL, 1803 MatSOR_SeqSELL, 1804 NULL, 1805 /* 15*/ MatGetInfo_SeqSELL, 1806 MatEqual_SeqSELL, 1807 MatGetDiagonal_SeqSELL, 1808 MatDiagonalScale_SeqSELL, 1809 NULL, 1810 /* 20*/ NULL, 1811 MatAssemblyEnd_SeqSELL, 1812 MatSetOption_SeqSELL, 1813 MatZeroEntries_SeqSELL, 1814 /* 24*/ NULL, 1815 NULL, 1816 NULL, 1817 NULL, 1818 NULL, 1819 /* 29*/ MatSetUp_SeqSELL, 1820 NULL, 1821 NULL, 1822 NULL, 1823 NULL, 1824 /* 34*/ MatDuplicate_SeqSELL, 1825 NULL, 1826 NULL, 1827 NULL, 1828 NULL, 1829 /* 39*/ NULL, 1830 NULL, 1831 NULL, 1832 MatGetValues_SeqSELL, 1833 MatCopy_SeqSELL, 1834 /* 44*/ NULL, 1835 MatScale_SeqSELL, 1836 MatShift_SeqSELL, 1837 NULL, 1838 NULL, 1839 /* 49*/ NULL, 1840 NULL, 1841 NULL, 1842 NULL, 1843 NULL, 1844 /* 54*/ MatFDColoringCreate_SeqXAIJ, 1845 NULL, 1846 NULL, 1847 NULL, 1848 NULL, 1849 /* 59*/ NULL, 1850 MatDestroy_SeqSELL, 1851 MatView_SeqSELL, 1852 NULL, 1853 NULL, 1854 /* 64*/ NULL, 1855 NULL, 1856 NULL, 1857 NULL, 1858 NULL, 1859 /* 69*/ NULL, 1860 NULL, 1861 NULL, 1862 NULL, 1863 NULL, 1864 /* 74*/ NULL, 1865 MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */ 1866 NULL, 1867 NULL, 1868 NULL, 1869 /* 79*/ NULL, 1870 NULL, 1871 NULL, 1872 NULL, 1873 NULL, 1874 /* 84*/ NULL, 1875 NULL, 1876 NULL, 1877 NULL, 1878 NULL, 1879 /* 89*/ NULL, 1880 NULL, 1881 NULL, 1882 NULL, 1883 NULL, 1884 /* 94*/ NULL, 1885 NULL, 1886 NULL, 1887 NULL, 1888 NULL, 1889 /* 99*/ NULL, 1890 NULL, 1891 NULL, 1892 MatConjugate_SeqSELL, 1893 NULL, 1894 /*104*/ NULL, 1895 NULL, 1896 NULL, 1897 NULL, 1898 NULL, 1899 /*109*/ NULL, 1900 NULL, 1901 NULL, 1902 NULL, 1903 MatMissingDiagonal_SeqSELL, 1904 /*114*/ NULL, 1905 NULL, 1906 NULL, 1907 NULL, 1908 NULL, 1909 /*119*/ NULL, 1910 NULL, 1911 NULL, 1912 NULL, 1913 NULL, 1914 /*124*/ NULL, 1915 NULL, 1916 NULL, 1917 NULL, 1918 NULL, 1919 /*129*/ NULL, 1920 NULL, 1921 NULL, 1922 NULL, 1923 NULL, 1924 /*134*/ NULL, 1925 NULL, 1926 NULL, 1927 NULL, 1928 NULL, 1929 /*139*/ NULL, 1930 NULL, 1931 NULL, 1932 MatFDColoringSetUp_SeqXAIJ, 1933 NULL, 1934 /*144*/ NULL, 1935 NULL, 1936 NULL, 1937 NULL, 1938 NULL, 1939 NULL, 1940 /*150*/ NULL, 1941 NULL}; 1942 1943 static PetscErrorCode MatStoreValues_SeqSELL(Mat mat) 1944 { 1945 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1946 1947 PetscFunctionBegin; 1948 PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first"); 1949 1950 /* allocate space for values if not already there */ 1951 if (!a->saved_values) PetscCall(PetscMalloc1(a->sliidx[a->totalslices] + 1, &a->saved_values)); 1952 1953 /* copy values over */ 1954 PetscCall(PetscArraycpy(a->saved_values, a->val, a->sliidx[a->totalslices])); 1955 PetscFunctionReturn(PETSC_SUCCESS); 1956 } 1957 1958 static PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat) 1959 { 1960 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1961 1962 PetscFunctionBegin; 1963 PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first"); 1964 PetscCheck(a->saved_values, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatStoreValues(A);first"); 1965 PetscCall(PetscArraycpy(a->val, a->saved_values, a->sliidx[a->totalslices])); 1966 PetscFunctionReturn(PETSC_SUCCESS); 1967 } 1968 1969 static PetscErrorCode MatSeqSELLGetFillRatio_SeqSELL(Mat mat, PetscReal *ratio) 1970 { 1971 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1972 1973 PetscFunctionBegin; 1974 if (a->totalslices && a->sliidx[a->totalslices]) { 1975 *ratio = (PetscReal)(a->sliidx[a->totalslices] - a->nz) / a->sliidx[a->totalslices]; 1976 } else { 1977 *ratio = 0.0; 1978 } 1979 PetscFunctionReturn(PETSC_SUCCESS); 1980 } 1981 1982 static PetscErrorCode MatSeqSELLGetMaxSliceWidth_SeqSELL(Mat mat, PetscInt *slicewidth) 1983 { 1984 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1985 PetscInt i, current_slicewidth; 1986 1987 PetscFunctionBegin; 1988 *slicewidth = 0; 1989 for (i = 0; i < a->totalslices; i++) { 1990 current_slicewidth = (a->sliidx[i + 1] - a->sliidx[i]) / a->sliceheight; 1991 if (current_slicewidth > *slicewidth) *slicewidth = current_slicewidth; 1992 } 1993 PetscFunctionReturn(PETSC_SUCCESS); 1994 } 1995 1996 static PetscErrorCode MatSeqSELLGetAvgSliceWidth_SeqSELL(Mat mat, PetscReal *slicewidth) 1997 { 1998 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1999 2000 PetscFunctionBegin; 2001 *slicewidth = 0; 2002 if (a->totalslices) { *slicewidth = (PetscReal)a->sliidx[a->totalslices] / a->sliceheight / a->totalslices; } 2003 PetscFunctionReturn(PETSC_SUCCESS); 2004 } 2005 2006 static PetscErrorCode MatSeqSELLGetVarSliceSize_SeqSELL(Mat mat, PetscReal *variance) 2007 { 2008 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 2009 PetscReal mean; 2010 PetscInt i, totalslices = a->totalslices, *sliidx = a->sliidx; 2011 2012 PetscFunctionBegin; 2013 *variance = 0; 2014 if (totalslices) { 2015 mean = (PetscReal)sliidx[totalslices] / totalslices; 2016 for (i = 1; i <= totalslices; i++) { *variance += ((PetscReal)(sliidx[i] - sliidx[i - 1]) - mean) * ((PetscReal)(sliidx[i] - sliidx[i - 1]) - mean) / totalslices; } 2017 } 2018 PetscFunctionReturn(PETSC_SUCCESS); 2019 } 2020 2021 static PetscErrorCode MatSeqSELLSetSliceHeight_SeqSELL(Mat A, PetscInt sliceheight) 2022 { 2023 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 2024 2025 PetscFunctionBegin; 2026 if (A->preallocated) PetscFunctionReturn(PETSC_SUCCESS); 2027 PetscCheck(a->sliceheight <= 0 || a->sliceheight == sliceheight, PETSC_COMM_SELF, PETSC_ERR_SUP, "Cannot change slice height %" PetscInt_FMT " to %" PetscInt_FMT, a->sliceheight, sliceheight); 2028 a->sliceheight = sliceheight; 2029 #if defined(PETSC_HAVE_CUDA) 2030 PetscCheck(DEVICE_MEM_ALIGN % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "DEVICE_MEM_ALIGN is not divisible by the slice height %" PetscInt_FMT, sliceheight); 2031 #endif 2032 PetscFunctionReturn(PETSC_SUCCESS); 2033 } 2034 2035 /*@C 2036 MatSeqSELLGetFillRatio - returns a ratio that indicates the irregularity of the matrix. 2037 2038 Not Collective 2039 2040 Input Parameter: 2041 . A - a MATSEQSELL matrix 2042 2043 Output Parameter: 2044 . ratio - ratio of number of padded zeros to number of allocated elements 2045 2046 Level: intermediate 2047 2048 .seealso: `MATSEQSELL`, `MatSeqSELLGetAvgSliceWidth()` 2049 @*/ 2050 PetscErrorCode MatSeqSELLGetFillRatio(Mat A, PetscReal *ratio) 2051 { 2052 PetscFunctionBegin; 2053 PetscUseMethod(A, "MatSeqSELLGetFillRatio_C", (Mat, PetscReal *), (A, ratio)); 2054 PetscFunctionReturn(PETSC_SUCCESS); 2055 } 2056 2057 /*@C 2058 MatSeqSELLGetMaxSliceWidth - returns the maximum slice width. 2059 2060 Not Collective 2061 2062 Input Parameter: 2063 . A - a MATSEQSELL matrix 2064 2065 Output Parameter: 2066 . slicewidth - maximum slice width 2067 2068 Level: intermediate 2069 2070 .seealso: `MATSEQSELL`, `MatSeqSELLGetAvgSliceWidth()` 2071 @*/ 2072 PetscErrorCode MatSeqSELLGetMaxSliceWidth(Mat A, PetscInt *slicewidth) 2073 { 2074 PetscFunctionBegin; 2075 PetscUseMethod(A, "MatSeqSELLGetMaxSliceWidth_C", (Mat, PetscInt *), (A, slicewidth)); 2076 PetscFunctionReturn(PETSC_SUCCESS); 2077 } 2078 2079 /*@C 2080 MatSeqSELLGetAvgSliceWidth - returns the average slice width. 2081 2082 Not Collective 2083 2084 Input Parameter: 2085 . A - a MATSEQSELL matrix 2086 2087 Output Parameter: 2088 . slicewidth - average slice width 2089 2090 Level: intermediate 2091 2092 .seealso: `MATSEQSELL`, `MatSeqSELLGetMaxSliceWidth()` 2093 @*/ 2094 PetscErrorCode MatSeqSELLGetAvgSliceWidth(Mat A, PetscReal *slicewidth) 2095 { 2096 PetscFunctionBegin; 2097 PetscUseMethod(A, "MatSeqSELLGetAvgSliceWidth_C", (Mat, PetscReal *), (A, slicewidth)); 2098 PetscFunctionReturn(PETSC_SUCCESS); 2099 } 2100 2101 /*@C 2102 MatSeqSELLSetSliceHeight - sets the slice height. 2103 2104 Not Collective 2105 2106 Input Parameters: 2107 + A - a MATSEQSELL matrix 2108 - sliceheight - slice height 2109 2110 Notes: 2111 You cannot change the slice height once it have been set. 2112 2113 The slice height must be set before MatSetUp() or MatXXXSetPreallocation() is called. 2114 2115 Level: intermediate 2116 2117 .seealso: `MATSEQSELL`, `MatSeqSELLGetVarSliceSize()` 2118 @*/ 2119 PetscErrorCode MatSeqSELLSetSliceHeight(Mat A, PetscInt sliceheight) 2120 { 2121 PetscFunctionBegin; 2122 PetscUseMethod(A, "MatSeqSELLSetSliceHeight_C", (Mat, PetscInt), (A, sliceheight)); 2123 PetscFunctionReturn(PETSC_SUCCESS); 2124 } 2125 2126 /*@C 2127 MatSeqSELLGetVarSliceSize - returns the variance of the slice size. 2128 2129 Not Collective 2130 2131 Input Parameter: 2132 . A - a MATSEQSELL matrix 2133 2134 Output Parameter: 2135 . variance - variance of the slice size 2136 2137 Level: intermediate 2138 2139 .seealso: `MATSEQSELL`, `MatSeqSELLSetSliceHeight()` 2140 @*/ 2141 PetscErrorCode MatSeqSELLGetVarSliceSize(Mat A, PetscReal *variance) 2142 { 2143 PetscFunctionBegin; 2144 PetscUseMethod(A, "MatSeqSELLGetVarSliceSize_C", (Mat, PetscReal *), (A, variance)); 2145 PetscFunctionReturn(PETSC_SUCCESS); 2146 } 2147 2148 #if defined(PETSC_HAVE_CUDA) 2149 PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat); 2150 #endif 2151 2152 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B) 2153 { 2154 Mat_SeqSELL *b; 2155 PetscMPIInt size; 2156 2157 PetscFunctionBegin; 2158 PetscCall(PetscCitationsRegister(citation, &cited)); 2159 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size)); 2160 PetscCheck(size <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Comm must be of size 1"); 2161 2162 PetscCall(PetscNew(&b)); 2163 2164 B->data = (void *)b; 2165 B->ops[0] = MatOps_Values; 2166 2167 b->row = NULL; 2168 b->col = NULL; 2169 b->icol = NULL; 2170 b->reallocs = 0; 2171 b->ignorezeroentries = PETSC_FALSE; 2172 b->roworiented = PETSC_TRUE; 2173 b->nonew = 0; 2174 b->diag = NULL; 2175 b->solve_work = NULL; 2176 B->spptr = NULL; 2177 b->saved_values = NULL; 2178 b->idiag = NULL; 2179 b->mdiag = NULL; 2180 b->ssor_work = NULL; 2181 b->omega = 1.0; 2182 b->fshift = 0.0; 2183 b->idiagvalid = PETSC_FALSE; 2184 b->keepnonzeropattern = PETSC_FALSE; 2185 b->sliceheight = 0; 2186 2187 PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELL)); 2188 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetArray_C", MatSeqSELLGetArray_SeqSELL)); 2189 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLRestoreArray_C", MatSeqSELLRestoreArray_SeqSELL)); 2190 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqSELL)); 2191 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqSELL)); 2192 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetPreallocation_C", MatSeqSELLSetPreallocation_SeqSELL)); 2193 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqaij_C", MatConvert_SeqSELL_SeqAIJ)); 2194 #if defined(PETSC_HAVE_CUDA) 2195 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqsellcuda_C", MatConvert_SeqSELL_SeqSELLCUDA)); 2196 #endif 2197 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetFillRatio_C", MatSeqSELLGetFillRatio_SeqSELL)); 2198 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetMaxSliceWidth_C", MatSeqSELLGetMaxSliceWidth_SeqSELL)); 2199 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetAvgSliceWidth_C", MatSeqSELLGetAvgSliceWidth_SeqSELL)); 2200 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetVarSliceSize_C", MatSeqSELLGetVarSliceSize_SeqSELL)); 2201 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetSliceHeight_C", MatSeqSELLSetSliceHeight_SeqSELL)); 2202 2203 PetscObjectOptionsBegin((PetscObject)B); 2204 { 2205 PetscInt newsh = -1; 2206 PetscBool flg; 2207 #if defined(PETSC_HAVE_CUDA) 2208 PetscInt chunksize = 0; 2209 #endif 2210 2211 PetscCall(PetscOptionsInt("-mat_sell_slice_height", "Set the slice height used to store SELL matrix", "MatSELLSetSliceHeight", newsh, &newsh, &flg)); 2212 if (flg) { PetscCall(MatSeqSELLSetSliceHeight(B, newsh)); } 2213 #if defined(PETSC_HAVE_CUDA) 2214 PetscCall(PetscOptionsInt("-mat_sell_chunk_size", "Set the chunksize for load-balanced CUDA kernels. Choices include 64,128,256,512,1024", NULL, chunksize, &chunksize, &flg)); 2215 if (flg) { 2216 PetscCheck(chunksize >= 64 && chunksize <= 1024 && chunksize % 64 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "chunksize must be a number in {64,128,256,512,1024}: value %" PetscInt_FMT, chunksize); 2217 b->chunksize = chunksize; 2218 } 2219 #endif 2220 } 2221 PetscOptionsEnd(); 2222 PetscFunctionReturn(PETSC_SUCCESS); 2223 } 2224 2225 /* 2226 Given a matrix generated with MatGetFactor() duplicates all the information in A into B 2227 */ 2228 static PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace) 2229 { 2230 Mat_SeqSELL *c = (Mat_SeqSELL *)C->data, *a = (Mat_SeqSELL *)A->data; 2231 PetscInt i, m = A->rmap->n; 2232 PetscInt totalslices = a->totalslices; 2233 2234 PetscFunctionBegin; 2235 C->factortype = A->factortype; 2236 c->row = NULL; 2237 c->col = NULL; 2238 c->icol = NULL; 2239 c->reallocs = 0; 2240 C->assembled = PETSC_TRUE; 2241 2242 PetscCall(PetscLayoutReference(A->rmap, &C->rmap)); 2243 PetscCall(PetscLayoutReference(A->cmap, &C->cmap)); 2244 2245 PetscCall(PetscMalloc1(a->sliceheight * totalslices, &c->rlen)); 2246 PetscCall(PetscMalloc1(totalslices + 1, &c->sliidx)); 2247 2248 for (i = 0; i < m; i++) c->rlen[i] = a->rlen[i]; 2249 for (i = 0; i < totalslices + 1; i++) c->sliidx[i] = a->sliidx[i]; 2250 2251 /* allocate the matrix space */ 2252 if (mallocmatspace) { 2253 PetscCall(PetscMalloc2(a->maxallocmat, &c->val, a->maxallocmat, &c->colidx)); 2254 2255 c->singlemalloc = PETSC_TRUE; 2256 2257 if (m > 0) { 2258 PetscCall(PetscArraycpy(c->colidx, a->colidx, a->maxallocmat)); 2259 if (cpvalues == MAT_COPY_VALUES) { 2260 PetscCall(PetscArraycpy(c->val, a->val, a->maxallocmat)); 2261 } else { 2262 PetscCall(PetscArrayzero(c->val, a->maxallocmat)); 2263 } 2264 } 2265 } 2266 2267 c->ignorezeroentries = a->ignorezeroentries; 2268 c->roworiented = a->roworiented; 2269 c->nonew = a->nonew; 2270 if (a->diag) { 2271 PetscCall(PetscMalloc1(m, &c->diag)); 2272 for (i = 0; i < m; i++) c->diag[i] = a->diag[i]; 2273 } else c->diag = NULL; 2274 2275 c->solve_work = NULL; 2276 c->saved_values = NULL; 2277 c->idiag = NULL; 2278 c->ssor_work = NULL; 2279 c->keepnonzeropattern = a->keepnonzeropattern; 2280 c->free_val = PETSC_TRUE; 2281 c->free_colidx = PETSC_TRUE; 2282 2283 c->maxallocmat = a->maxallocmat; 2284 c->maxallocrow = a->maxallocrow; 2285 c->rlenmax = a->rlenmax; 2286 c->nz = a->nz; 2287 C->preallocated = PETSC_TRUE; 2288 2289 c->nonzerorowcnt = a->nonzerorowcnt; 2290 C->nonzerostate = A->nonzerostate; 2291 2292 PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist)); 2293 PetscFunctionReturn(PETSC_SUCCESS); 2294 } 2295 2296 PetscErrorCode MatDuplicate_SeqSELL(Mat A, MatDuplicateOption cpvalues, Mat *B) 2297 { 2298 PetscFunctionBegin; 2299 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 2300 PetscCall(MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n)); 2301 if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 2302 PetscCall(MatSetType(*B, ((PetscObject)A)->type_name)); 2303 PetscCall(MatDuplicateNoCreate_SeqSELL(*B, A, cpvalues, PETSC_TRUE)); 2304 PetscFunctionReturn(PETSC_SUCCESS); 2305 } 2306 2307 /*MC 2308 MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices, 2309 based on the sliced Ellpack format 2310 2311 Options Database Key: 2312 . -mat_type seqsell - sets the matrix type to "`MATSEQELL` during a call to `MatSetFromOptions()` 2313 2314 Level: beginner 2315 2316 .seealso: `Mat`, `MatCreateSeqSell()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ` 2317 M*/ 2318 2319 /*MC 2320 MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices. 2321 2322 This matrix type is identical to `MATSEQSELL` when constructed with a single process communicator, 2323 and `MATMPISELL` otherwise. As a result, for single process communicators, 2324 `MatSeqSELLSetPreallocation()` is supported, and similarly `MatMPISELLSetPreallocation()` is supported 2325 for communicators controlling multiple processes. It is recommended that you call both of 2326 the above preallocation routines for simplicity. 2327 2328 Options Database Key: 2329 . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions() 2330 2331 Level: beginner 2332 2333 Notes: 2334 This format is only supported for real scalars, double precision, and 32-bit indices (the defaults). 2335 2336 It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of 2337 non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement. 2338 2339 Developer Notes: 2340 On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance. 2341 2342 The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8 2343 .vb 2344 (2 0 3 4) 2345 Consider the matrix A = (5 0 6 0) 2346 (0 0 7 8) 2347 (0 0 9 9) 2348 2349 symbolically the Ellpack format can be written as 2350 2351 (2 3 4 |) (0 2 3 |) 2352 v = (5 6 0 |) colidx = (0 2 2 |) 2353 -------- --------- 2354 (7 8 |) (2 3 |) 2355 (9 9 |) (2 3 |) 2356 2357 The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case). 2358 Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and 2359 zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice. 2360 2361 The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9) and for colidx is (0 0 2 2 3 2 2 2 3 3) 2362 2363 .ve 2364 2365 See MatMult_SeqSELL() for how this format is used with the SIMD operations to achieve high performance. 2366 2367 References: 2368 . * - Hong Zhang, Richard T. Mills, Karl Rupp, and Barry F. Smith, Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}, 2369 Proceedings of the 47th International Conference on Parallel Processing, 2018. 2370 2371 .seealso: `Mat`, `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSell()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ` 2372 M*/ 2373 2374 /*@C 2375 MatCreateSeqSELL - Creates a sparse matrix in `MATSEQSELL` format. 2376 2377 Collective 2378 2379 Input Parameters: 2380 + comm - MPI communicator, set to `PETSC_COMM_SELF` 2381 . m - number of rows 2382 . n - number of columns 2383 . rlenmax - maximum number of nonzeros in a row, ignored if `rlen` is provided 2384 - rlen - array containing the number of nonzeros in the various rows (possibly different for each row) or NULL 2385 2386 Output Parameter: 2387 . A - the matrix 2388 2389 Level: intermediate 2390 2391 Notes: 2392 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 2393 MatXXXXSetPreallocation() paradigm instead of this routine directly. 2394 [MatXXXXSetPreallocation() is, for example, `MatSeqSELLSetPreallocation()`] 2395 2396 Specify the preallocated storage with either `rlenmax` or `rlen` (not both). 2397 Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory 2398 allocation. 2399 2400 .seealso: `Mat`, `MATSEQSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATMPISELL` 2401 @*/ 2402 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt rlenmax, const PetscInt rlen[], Mat *A) 2403 { 2404 PetscFunctionBegin; 2405 PetscCall(MatCreate(comm, A)); 2406 PetscCall(MatSetSizes(*A, m, n, m, n)); 2407 PetscCall(MatSetType(*A, MATSEQSELL)); 2408 PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A, rlenmax, rlen)); 2409 PetscFunctionReturn(PETSC_SUCCESS); 2410 } 2411 2412 PetscErrorCode MatEqual_SeqSELL(Mat A, Mat B, PetscBool *flg) 2413 { 2414 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data, *b = (Mat_SeqSELL *)B->data; 2415 PetscInt totalslices = a->totalslices; 2416 2417 PetscFunctionBegin; 2418 /* If the matrix dimensions are not equal,or no of nonzeros */ 2419 if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) || (a->nz != b->nz) || (a->rlenmax != b->rlenmax)) { 2420 *flg = PETSC_FALSE; 2421 PetscFunctionReturn(PETSC_SUCCESS); 2422 } 2423 /* if the a->colidx are the same */ 2424 PetscCall(PetscArraycmp(a->colidx, b->colidx, a->sliidx[totalslices], flg)); 2425 if (!*flg) PetscFunctionReturn(PETSC_SUCCESS); 2426 /* if a->val are the same */ 2427 PetscCall(PetscArraycmp(a->val, b->val, a->sliidx[totalslices], flg)); 2428 PetscFunctionReturn(PETSC_SUCCESS); 2429 } 2430 2431 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A) 2432 { 2433 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 2434 2435 PetscFunctionBegin; 2436 a->idiagvalid = PETSC_FALSE; 2437 PetscFunctionReturn(PETSC_SUCCESS); 2438 } 2439 2440 PetscErrorCode MatConjugate_SeqSELL(Mat A) 2441 { 2442 #if defined(PETSC_USE_COMPLEX) 2443 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 2444 PetscInt i; 2445 PetscScalar *val = a->val; 2446 2447 PetscFunctionBegin; 2448 for (i = 0; i < a->sliidx[a->totalslices]; i++) { val[i] = PetscConj(val[i]); } 2449 #if defined(PETSC_HAVE_CUDA) 2450 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU; 2451 #endif 2452 #else 2453 PetscFunctionBegin; 2454 #endif 2455 PetscFunctionReturn(PETSC_SUCCESS); 2456 } 2457