1 2 /* 3 Defines the basic matrix operations for the SELL matrix storage format. 4 */ 5 #include <../src/mat/impls/sell/seq/sell.h> /*I "petscmat.h" I*/ 6 #include <petscblaslapack.h> 7 #include <petsc/private/kernels/blocktranspose.h> 8 9 static PetscBool cited = PETSC_FALSE; 10 static const char citation[] = "@inproceedings{ZhangELLPACK2018,\n" 11 " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n" 12 " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n" 13 " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n" 14 " year = 2018\n" 15 "}\n"; 16 17 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 18 19 #include <immintrin.h> 20 21 #if !defined(_MM_SCALE_8) 22 #define _MM_SCALE_8 8 23 #endif 24 25 #if defined(__AVX512F__) 26 /* these do not work 27 vec_idx = _mm512_loadunpackhi_epi32(vec_idx,acolidx); 28 vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval); 29 */ 30 #define AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \ 31 /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \ 32 vec_idx = _mm256_loadu_si256((__m256i const *)acolidx); \ 33 vec_vals = _mm512_loadu_pd(aval); \ 34 vec_x = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); \ 35 vec_y = _mm512_fmadd_pd(vec_x, vec_vals, vec_y) 36 #elif defined(__AVX2__) && defined(__FMA__) 37 #define AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \ 38 vec_vals = _mm256_loadu_pd(aval); \ 39 vec_idx = _mm_loadu_si128((__m128i const *)acolidx); /* SSE2 */ \ 40 vec_x = _mm256_i32gather_pd(x, vec_idx, _MM_SCALE_8); \ 41 vec_y = _mm256_fmadd_pd(vec_x, vec_vals, vec_y) 42 #endif 43 #endif /* PETSC_HAVE_IMMINTRIN_H */ 44 45 /*@C 46 MatSeqSELLSetPreallocation - For good matrix assembly performance 47 the user should preallocate the matrix storage by setting the parameter `nz` 48 (or the array `nnz`). 49 50 Collective 51 52 Input Parameters: 53 + B - The `MATSEQSELL` matrix 54 . rlenmax - number of nonzeros per row (same for all rows), ignored if `rlen` is provided 55 - rlen - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 56 57 Level: intermediate 58 59 Notes: 60 Specify the preallocated storage with either `rlenmax` or `rlen` (not both). 61 Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory 62 allocation. 63 64 You can call `MatGetInfo()` to get information on how effective the preallocation was; 65 for example the fields mallocs,nz_allocated,nz_used,nz_unneeded; 66 You can also run with the option `-info` and look for messages with the string 67 malloc in them to see if additional memory allocation was needed. 68 69 Developer Note: 70 Use `rlenmax` of `MAT_SKIP_ALLOCATION` to not allocate any space for the matrix 71 entries or columns indices. 72 73 The maximum number of nonzeos in any row should be as accurate as possible. 74 If it is underestimated, you will get bad performance due to reallocation 75 (`MatSeqXSELLReallocateSELL()`). 76 77 .seealso: `Mat`, `MATSEQSELL`, `MATSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()` 78 @*/ 79 PetscErrorCode MatSeqSELLSetPreallocation(Mat B, PetscInt rlenmax, const PetscInt rlen[]) 80 { 81 PetscFunctionBegin; 82 PetscValidHeaderSpecific(B, MAT_CLASSID, 1); 83 PetscValidType(B, 1); 84 PetscTryMethod(B, "MatSeqSELLSetPreallocation_C", (Mat, PetscInt, const PetscInt[]), (B, rlenmax, rlen)); 85 PetscFunctionReturn(PETSC_SUCCESS); 86 } 87 88 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B, PetscInt maxallocrow, const PetscInt rlen[]) 89 { 90 Mat_SeqSELL *b; 91 PetscInt i, j, totalslices; 92 PetscBool skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE; 93 94 PetscFunctionBegin; 95 if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE; 96 if (maxallocrow == MAT_SKIP_ALLOCATION) { 97 skipallocation = PETSC_TRUE; 98 maxallocrow = 0; 99 } 100 101 PetscCall(PetscLayoutSetUp(B->rmap)); 102 PetscCall(PetscLayoutSetUp(B->cmap)); 103 104 /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */ 105 if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5; 106 PetscCheck(maxallocrow >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "maxallocrow cannot be less than 0: value %" PetscInt_FMT, maxallocrow); 107 if (rlen) { 108 for (i = 0; i < B->rmap->n; i++) { 109 PetscCheck(rlen[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, rlen[i]); 110 PetscCheck(rlen[i] <= B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT, i, rlen[i], B->cmap->n); 111 } 112 } 113 114 B->preallocated = PETSC_TRUE; 115 116 b = (Mat_SeqSELL *)B->data; 117 118 totalslices = PetscCeilInt(B->rmap->n, 8); 119 b->totalslices = totalslices; 120 if (!skipallocation) { 121 if (B->rmap->n & 0x07) PetscCall(PetscInfo(B, "Padding rows to the SEQSELL matrix because the number of rows is not the multiple of 8 (value %" PetscInt_FMT ")\n", B->rmap->n)); 122 123 if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */ 124 PetscCall(PetscMalloc1(totalslices + 1, &b->sliidx)); 125 } 126 if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */ 127 if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10; 128 else if (maxallocrow < 0) maxallocrow = 1; 129 for (i = 0; i <= totalslices; i++) b->sliidx[i] = i * 8 * maxallocrow; 130 } else { 131 maxallocrow = 0; 132 b->sliidx[0] = 0; 133 for (i = 1; i < totalslices; i++) { 134 b->sliidx[i] = 0; 135 for (j = 0; j < 8; j++) b->sliidx[i] = PetscMax(b->sliidx[i], rlen[8 * (i - 1) + j]); 136 maxallocrow = PetscMax(b->sliidx[i], maxallocrow); 137 PetscCall(PetscIntSumError(b->sliidx[i - 1], 8 * b->sliidx[i], &b->sliidx[i])); 138 } 139 /* last slice */ 140 b->sliidx[totalslices] = 0; 141 for (j = (totalslices - 1) * 8; j < B->rmap->n; j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices], rlen[j]); 142 maxallocrow = PetscMax(b->sliidx[totalslices], maxallocrow); 143 b->sliidx[totalslices] = b->sliidx[totalslices - 1] + 8 * b->sliidx[totalslices]; 144 } 145 146 /* allocate space for val, colidx, rlen */ 147 /* FIXME: should B's old memory be unlogged? */ 148 PetscCall(MatSeqXSELLFreeSELL(B, &b->val, &b->colidx)); 149 /* FIXME: assuming an element of the bit array takes 8 bits */ 150 PetscCall(PetscMalloc2(b->sliidx[totalslices], &b->val, b->sliidx[totalslices], &b->colidx)); 151 /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */ 152 PetscCall(PetscCalloc1(8 * totalslices, &b->rlen)); 153 154 b->singlemalloc = PETSC_TRUE; 155 b->free_val = PETSC_TRUE; 156 b->free_colidx = PETSC_TRUE; 157 } else { 158 b->free_val = PETSC_FALSE; 159 b->free_colidx = PETSC_FALSE; 160 } 161 162 b->nz = 0; 163 b->maxallocrow = maxallocrow; 164 b->rlenmax = maxallocrow; 165 b->maxallocmat = b->sliidx[totalslices]; 166 B->info.nz_unneeded = (double)b->maxallocmat; 167 if (realalloc) PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE)); 168 PetscFunctionReturn(PETSC_SUCCESS); 169 } 170 171 PetscErrorCode MatGetRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v) 172 { 173 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 174 PetscInt shift; 175 176 PetscFunctionBegin; 177 PetscCheck(row >= 0 && row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range", row); 178 if (nz) *nz = a->rlen[row]; 179 shift = a->sliidx[row >> 3] + (row & 0x07); 180 if (!a->getrowcols) PetscCall(PetscMalloc2(a->rlenmax, &a->getrowcols, a->rlenmax, &a->getrowvals)); 181 if (idx) { 182 PetscInt j; 183 for (j = 0; j < a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift + 8 * j]; 184 *idx = a->getrowcols; 185 } 186 if (v) { 187 PetscInt j; 188 for (j = 0; j < a->rlen[row]; j++) a->getrowvals[j] = a->val[shift + 8 * j]; 189 *v = a->getrowvals; 190 } 191 PetscFunctionReturn(PETSC_SUCCESS); 192 } 193 194 PetscErrorCode MatRestoreRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v) 195 { 196 PetscFunctionBegin; 197 PetscFunctionReturn(PETSC_SUCCESS); 198 } 199 200 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat) 201 { 202 Mat B; 203 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 204 PetscInt i; 205 206 PetscFunctionBegin; 207 if (reuse == MAT_REUSE_MATRIX) { 208 B = *newmat; 209 PetscCall(MatZeroEntries(B)); 210 } else { 211 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B)); 212 PetscCall(MatSetSizes(B, A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N)); 213 PetscCall(MatSetType(B, MATSEQAIJ)); 214 PetscCall(MatSeqAIJSetPreallocation(B, 0, a->rlen)); 215 } 216 217 for (i = 0; i < A->rmap->n; i++) { 218 PetscInt nz = 0, *cols = NULL; 219 PetscScalar *vals = NULL; 220 221 PetscCall(MatGetRow_SeqSELL(A, i, &nz, &cols, &vals)); 222 PetscCall(MatSetValues(B, 1, &i, nz, cols, vals, INSERT_VALUES)); 223 PetscCall(MatRestoreRow_SeqSELL(A, i, &nz, &cols, &vals)); 224 } 225 226 PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY)); 227 PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY)); 228 B->rmap->bs = A->rmap->bs; 229 230 if (reuse == MAT_INPLACE_MATRIX) { 231 PetscCall(MatHeaderReplace(A, &B)); 232 } else { 233 *newmat = B; 234 } 235 PetscFunctionReturn(PETSC_SUCCESS); 236 } 237 238 #include <../src/mat/impls/aij/seq/aij.h> 239 240 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A, MatType newtype, MatReuse reuse, Mat *newmat) 241 { 242 Mat B; 243 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 244 PetscInt *ai = a->i, m = A->rmap->N, n = A->cmap->N, i, *rowlengths, row, ncols; 245 const PetscInt *cols; 246 const PetscScalar *vals; 247 248 PetscFunctionBegin; 249 250 if (reuse == MAT_REUSE_MATRIX) { 251 B = *newmat; 252 } else { 253 if (PetscDefined(USE_DEBUG) || !a->ilen) { 254 PetscCall(PetscMalloc1(m, &rowlengths)); 255 for (i = 0; i < m; i++) rowlengths[i] = ai[i + 1] - ai[i]; 256 } 257 if (PetscDefined(USE_DEBUG) && a->ilen) { 258 PetscBool eq; 259 PetscCall(PetscMemcmp(rowlengths, a->ilen, m * sizeof(PetscInt), &eq)); 260 PetscCheck(eq, PETSC_COMM_SELF, PETSC_ERR_PLIB, "SeqAIJ ilen array incorrect"); 261 PetscCall(PetscFree(rowlengths)); 262 rowlengths = a->ilen; 263 } else if (a->ilen) rowlengths = a->ilen; 264 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B)); 265 PetscCall(MatSetSizes(B, m, n, m, n)); 266 PetscCall(MatSetType(B, MATSEQSELL)); 267 PetscCall(MatSeqSELLSetPreallocation(B, 0, rowlengths)); 268 if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths)); 269 } 270 271 for (row = 0; row < m; row++) { 272 PetscCall(MatGetRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals)); 273 PetscCall(MatSetValues_SeqSELL(B, 1, &row, ncols, cols, vals, INSERT_VALUES)); 274 PetscCall(MatRestoreRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals)); 275 } 276 PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY)); 277 PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY)); 278 B->rmap->bs = A->rmap->bs; 279 280 if (reuse == MAT_INPLACE_MATRIX) { 281 PetscCall(MatHeaderReplace(A, &B)); 282 } else { 283 *newmat = B; 284 } 285 PetscFunctionReturn(PETSC_SUCCESS); 286 } 287 288 PetscErrorCode MatMult_SeqSELL(Mat A, Vec xx, Vec yy) 289 { 290 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 291 PetscScalar *y; 292 const PetscScalar *x; 293 const MatScalar *aval = a->val; 294 PetscInt totalslices = a->totalslices; 295 const PetscInt *acolidx = a->colidx; 296 PetscInt i, j; 297 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 298 __m512d vec_x, vec_y, vec_vals; 299 __m256i vec_idx; 300 __mmask8 mask; 301 __m512d vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4; 302 __m256i vec_idx2, vec_idx3, vec_idx4; 303 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 304 __m128i vec_idx; 305 __m256d vec_x, vec_y, vec_y2, vec_vals; 306 MatScalar yval; 307 PetscInt r, rows_left, row, nnz_in_row; 308 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 309 __m128d vec_x_tmp; 310 __m256d vec_x, vec_y, vec_y2, vec_vals; 311 MatScalar yval; 312 PetscInt r, rows_left, row, nnz_in_row; 313 #else 314 PetscScalar sum[8]; 315 #endif 316 317 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 318 #pragma disjoint(*x, *y, *aval) 319 #endif 320 321 PetscFunctionBegin; 322 PetscCall(VecGetArrayRead(xx, &x)); 323 PetscCall(VecGetArray(yy, &y)); 324 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 325 for (i = 0; i < totalslices; i++) { /* loop over slices */ 326 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 327 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 328 329 vec_y = _mm512_setzero_pd(); 330 vec_y2 = _mm512_setzero_pd(); 331 vec_y3 = _mm512_setzero_pd(); 332 vec_y4 = _mm512_setzero_pd(); 333 334 j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */ 335 switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) { 336 case 3: 337 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 338 acolidx += 8; 339 aval += 8; 340 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 341 acolidx += 8; 342 aval += 8; 343 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 344 acolidx += 8; 345 aval += 8; 346 j += 3; 347 break; 348 case 2: 349 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 350 acolidx += 8; 351 aval += 8; 352 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 353 acolidx += 8; 354 aval += 8; 355 j += 2; 356 break; 357 case 1: 358 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 359 acolidx += 8; 360 aval += 8; 361 j += 1; 362 break; 363 } 364 #pragma novector 365 for (; j < (a->sliidx[i + 1] >> 3); j += 4) { 366 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 367 acolidx += 8; 368 aval += 8; 369 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 370 acolidx += 8; 371 aval += 8; 372 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 373 acolidx += 8; 374 aval += 8; 375 AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4); 376 acolidx += 8; 377 aval += 8; 378 } 379 380 vec_y = _mm512_add_pd(vec_y, vec_y2); 381 vec_y = _mm512_add_pd(vec_y, vec_y3); 382 vec_y = _mm512_add_pd(vec_y, vec_y4); 383 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 384 mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07))); 385 _mm512_mask_storeu_pd(&y[8 * i], mask, vec_y); 386 } else { 387 _mm512_storeu_pd(&y[8 * i], vec_y); 388 } 389 } 390 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 391 for (i = 0; i < totalslices; i++) { /* loop over full slices */ 392 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 393 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 394 395 /* last slice may have padding rows. Don't use vectorization. */ 396 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 397 rows_left = A->rmap->n - 8 * i; 398 for (r = 0; r < rows_left; ++r) { 399 yval = (MatScalar)0; 400 row = 8 * i + r; 401 nnz_in_row = a->rlen[row]; 402 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]]; 403 y[row] = yval; 404 } 405 break; 406 } 407 408 vec_y = _mm256_setzero_pd(); 409 vec_y2 = _mm256_setzero_pd(); 410 411 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 412 #pragma novector 413 #pragma unroll(2) 414 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 415 AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 416 aval += 4; 417 acolidx += 4; 418 AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y2); 419 aval += 4; 420 acolidx += 4; 421 } 422 423 _mm256_storeu_pd(y + i * 8, vec_y); 424 _mm256_storeu_pd(y + i * 8 + 4, vec_y2); 425 } 426 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 427 for (i = 0; i < totalslices; i++) { /* loop over full slices */ 428 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 429 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 430 431 vec_y = _mm256_setzero_pd(); 432 vec_y2 = _mm256_setzero_pd(); 433 434 /* last slice may have padding rows. Don't use vectorization. */ 435 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 436 rows_left = A->rmap->n - 8 * i; 437 for (r = 0; r < rows_left; ++r) { 438 yval = (MatScalar)0; 439 row = 8 * i + r; 440 nnz_in_row = a->rlen[row]; 441 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]]; 442 y[row] = yval; 443 } 444 break; 445 } 446 447 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 448 #pragma novector 449 #pragma unroll(2) 450 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 451 vec_vals = _mm256_loadu_pd(aval); 452 vec_x_tmp = _mm_setzero_pd(); 453 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 454 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 455 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 456 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 457 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 458 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 459 vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y); 460 aval += 4; 461 462 vec_vals = _mm256_loadu_pd(aval); 463 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 464 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 465 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 466 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 467 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 468 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 469 vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2); 470 aval += 4; 471 } 472 473 _mm256_storeu_pd(y + i * 8, vec_y); 474 _mm256_storeu_pd(y + i * 8 + 4, vec_y2); 475 } 476 #else 477 for (i = 0; i < totalslices; i++) { /* loop over slices */ 478 for (j = 0; j < 8; j++) sum[j] = 0.0; 479 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 480 sum[0] += aval[j] * x[acolidx[j]]; 481 sum[1] += aval[j + 1] * x[acolidx[j + 1]]; 482 sum[2] += aval[j + 2] * x[acolidx[j + 2]]; 483 sum[3] += aval[j + 3] * x[acolidx[j + 3]]; 484 sum[4] += aval[j + 4] * x[acolidx[j + 4]]; 485 sum[5] += aval[j + 5] * x[acolidx[j + 5]]; 486 sum[6] += aval[j + 6] * x[acolidx[j + 6]]; 487 sum[7] += aval[j + 7] * x[acolidx[j + 7]]; 488 } 489 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */ 490 for (j = 0; j < (A->rmap->n & 0x07); j++) y[8 * i + j] = sum[j]; 491 } else { 492 for (j = 0; j < 8; j++) y[8 * i + j] = sum[j]; 493 } 494 } 495 #endif 496 497 PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); /* theoretical minimal FLOPs */ 498 PetscCall(VecRestoreArrayRead(xx, &x)); 499 PetscCall(VecRestoreArray(yy, &y)); 500 PetscFunctionReturn(PETSC_SUCCESS); 501 } 502 503 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h> 504 PetscErrorCode MatMultAdd_SeqSELL(Mat A, Vec xx, Vec yy, Vec zz) 505 { 506 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 507 PetscScalar *y, *z; 508 const PetscScalar *x; 509 const MatScalar *aval = a->val; 510 PetscInt totalslices = a->totalslices; 511 const PetscInt *acolidx = a->colidx; 512 PetscInt i, j; 513 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 514 __m512d vec_x, vec_y, vec_vals; 515 __m256i vec_idx; 516 __mmask8 mask; 517 __m512d vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4; 518 __m256i vec_idx2, vec_idx3, vec_idx4; 519 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 520 __m128d vec_x_tmp; 521 __m256d vec_x, vec_y, vec_y2, vec_vals; 522 MatScalar yval; 523 PetscInt r, row, nnz_in_row; 524 #else 525 PetscScalar sum[8]; 526 #endif 527 528 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 529 #pragma disjoint(*x, *y, *aval) 530 #endif 531 532 PetscFunctionBegin; 533 PetscCall(VecGetArrayRead(xx, &x)); 534 PetscCall(VecGetArrayPair(yy, zz, &y, &z)); 535 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 536 for (i = 0; i < totalslices; i++) { /* loop over slices */ 537 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 538 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 539 540 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 541 mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07))); 542 vec_y = _mm512_mask_loadu_pd(vec_y, mask, &y[8 * i]); 543 } else { 544 vec_y = _mm512_loadu_pd(&y[8 * i]); 545 } 546 vec_y2 = _mm512_setzero_pd(); 547 vec_y3 = _mm512_setzero_pd(); 548 vec_y4 = _mm512_setzero_pd(); 549 550 j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */ 551 switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) { 552 case 3: 553 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 554 acolidx += 8; 555 aval += 8; 556 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 557 acolidx += 8; 558 aval += 8; 559 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 560 acolidx += 8; 561 aval += 8; 562 j += 3; 563 break; 564 case 2: 565 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 566 acolidx += 8; 567 aval += 8; 568 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 569 acolidx += 8; 570 aval += 8; 571 j += 2; 572 break; 573 case 1: 574 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 575 acolidx += 8; 576 aval += 8; 577 j += 1; 578 break; 579 } 580 #pragma novector 581 for (; j < (a->sliidx[i + 1] >> 3); j += 4) { 582 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 583 acolidx += 8; 584 aval += 8; 585 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 586 acolidx += 8; 587 aval += 8; 588 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 589 acolidx += 8; 590 aval += 8; 591 AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4); 592 acolidx += 8; 593 aval += 8; 594 } 595 596 vec_y = _mm512_add_pd(vec_y, vec_y2); 597 vec_y = _mm512_add_pd(vec_y, vec_y3); 598 vec_y = _mm512_add_pd(vec_y, vec_y4); 599 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 600 _mm512_mask_storeu_pd(&z[8 * i], mask, vec_y); 601 } else { 602 _mm512_storeu_pd(&z[8 * i], vec_y); 603 } 604 } 605 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 606 for (i = 0; i < totalslices; i++) { /* loop over full slices */ 607 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 608 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 609 610 /* last slice may have padding rows. Don't use vectorization. */ 611 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 612 for (r = 0; r < (A->rmap->n & 0x07); ++r) { 613 row = 8 * i + r; 614 yval = (MatScalar)0.0; 615 nnz_in_row = a->rlen[row]; 616 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]]; 617 z[row] = y[row] + yval; 618 } 619 break; 620 } 621 622 vec_y = _mm256_loadu_pd(y + 8 * i); 623 vec_y2 = _mm256_loadu_pd(y + 8 * i + 4); 624 625 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 626 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 627 vec_vals = _mm256_loadu_pd(aval); 628 vec_x_tmp = _mm_setzero_pd(); 629 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 630 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 631 vec_x = _mm256_setzero_pd(); 632 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 633 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 634 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 635 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 636 vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y); 637 aval += 4; 638 639 vec_vals = _mm256_loadu_pd(aval); 640 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 641 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 642 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 643 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 644 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 645 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 646 vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2); 647 aval += 4; 648 } 649 650 _mm256_storeu_pd(z + i * 8, vec_y); 651 _mm256_storeu_pd(z + i * 8 + 4, vec_y2); 652 } 653 #else 654 for (i = 0; i < totalslices; i++) { /* loop over slices */ 655 for (j = 0; j < 8; j++) sum[j] = 0.0; 656 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 657 sum[0] += aval[j] * x[acolidx[j]]; 658 sum[1] += aval[j + 1] * x[acolidx[j + 1]]; 659 sum[2] += aval[j + 2] * x[acolidx[j + 2]]; 660 sum[3] += aval[j + 3] * x[acolidx[j + 3]]; 661 sum[4] += aval[j + 4] * x[acolidx[j + 4]]; 662 sum[5] += aval[j + 5] * x[acolidx[j + 5]]; 663 sum[6] += aval[j + 6] * x[acolidx[j + 6]]; 664 sum[7] += aval[j + 7] * x[acolidx[j + 7]]; 665 } 666 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 667 for (j = 0; j < (A->rmap->n & 0x07); j++) z[8 * i + j] = y[8 * i + j] + sum[j]; 668 } else { 669 for (j = 0; j < 8; j++) z[8 * i + j] = y[8 * i + j] + sum[j]; 670 } 671 } 672 #endif 673 674 PetscCall(PetscLogFlops(2.0 * a->nz)); 675 PetscCall(VecRestoreArrayRead(xx, &x)); 676 PetscCall(VecRestoreArrayPair(yy, zz, &y, &z)); 677 PetscFunctionReturn(PETSC_SUCCESS); 678 } 679 680 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A, Vec xx, Vec zz, Vec yy) 681 { 682 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 683 PetscScalar *y; 684 const PetscScalar *x; 685 const MatScalar *aval = a->val; 686 const PetscInt *acolidx = a->colidx; 687 PetscInt i, j, r, row, nnz_in_row, totalslices = a->totalslices; 688 689 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 690 #pragma disjoint(*x, *y, *aval) 691 #endif 692 693 PetscFunctionBegin; 694 if (A->symmetric == PETSC_BOOL3_TRUE) { 695 PetscCall(MatMultAdd_SeqSELL(A, xx, zz, yy)); 696 PetscFunctionReturn(PETSC_SUCCESS); 697 } 698 if (zz != yy) PetscCall(VecCopy(zz, yy)); 699 PetscCall(VecGetArrayRead(xx, &x)); 700 PetscCall(VecGetArray(yy, &y)); 701 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 702 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 703 for (r = 0; r < (A->rmap->n & 0x07); ++r) { 704 row = 8 * i + r; 705 nnz_in_row = a->rlen[row]; 706 for (j = 0; j < nnz_in_row; ++j) y[acolidx[8 * j + r]] += aval[8 * j + r] * x[row]; 707 } 708 break; 709 } 710 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 711 y[acolidx[j]] += aval[j] * x[8 * i]; 712 y[acolidx[j + 1]] += aval[j + 1] * x[8 * i + 1]; 713 y[acolidx[j + 2]] += aval[j + 2] * x[8 * i + 2]; 714 y[acolidx[j + 3]] += aval[j + 3] * x[8 * i + 3]; 715 y[acolidx[j + 4]] += aval[j + 4] * x[8 * i + 4]; 716 y[acolidx[j + 5]] += aval[j + 5] * x[8 * i + 5]; 717 y[acolidx[j + 6]] += aval[j + 6] * x[8 * i + 6]; 718 y[acolidx[j + 7]] += aval[j + 7] * x[8 * i + 7]; 719 } 720 } 721 PetscCall(PetscLogFlops(2.0 * a->sliidx[a->totalslices])); 722 PetscCall(VecRestoreArrayRead(xx, &x)); 723 PetscCall(VecRestoreArray(yy, &y)); 724 PetscFunctionReturn(PETSC_SUCCESS); 725 } 726 727 PetscErrorCode MatMultTranspose_SeqSELL(Mat A, Vec xx, Vec yy) 728 { 729 PetscFunctionBegin; 730 if (A->symmetric == PETSC_BOOL3_TRUE) { 731 PetscCall(MatMult_SeqSELL(A, xx, yy)); 732 } else { 733 PetscCall(VecSet(yy, 0.0)); 734 PetscCall(MatMultTransposeAdd_SeqSELL(A, xx, yy, yy)); 735 } 736 PetscFunctionReturn(PETSC_SUCCESS); 737 } 738 739 /* 740 Checks for missing diagonals 741 */ 742 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A, PetscBool *missing, PetscInt *d) 743 { 744 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 745 PetscInt *diag, i; 746 747 PetscFunctionBegin; 748 *missing = PETSC_FALSE; 749 if (A->rmap->n > 0 && !(a->colidx)) { 750 *missing = PETSC_TRUE; 751 if (d) *d = 0; 752 PetscCall(PetscInfo(A, "Matrix has no entries therefore is missing diagonal\n")); 753 } else { 754 diag = a->diag; 755 for (i = 0; i < A->rmap->n; i++) { 756 if (diag[i] == -1) { 757 *missing = PETSC_TRUE; 758 if (d) *d = i; 759 PetscCall(PetscInfo(A, "Matrix is missing diagonal number %" PetscInt_FMT "\n", i)); 760 break; 761 } 762 } 763 } 764 PetscFunctionReturn(PETSC_SUCCESS); 765 } 766 767 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A) 768 { 769 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 770 PetscInt i, j, m = A->rmap->n, shift; 771 772 PetscFunctionBegin; 773 if (!a->diag) { 774 PetscCall(PetscMalloc1(m, &a->diag)); 775 a->free_diag = PETSC_TRUE; 776 } 777 for (i = 0; i < m; i++) { /* loop over rows */ 778 shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */ 779 a->diag[i] = -1; 780 for (j = 0; j < a->rlen[i]; j++) { 781 if (a->colidx[shift + j * 8] == i) { 782 a->diag[i] = shift + j * 8; 783 break; 784 } 785 } 786 } 787 PetscFunctionReturn(PETSC_SUCCESS); 788 } 789 790 /* 791 Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways 792 */ 793 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A, PetscScalar omega, PetscScalar fshift) 794 { 795 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 796 PetscInt i, *diag, m = A->rmap->n; 797 MatScalar *val = a->val; 798 PetscScalar *idiag, *mdiag; 799 800 PetscFunctionBegin; 801 if (a->idiagvalid) PetscFunctionReturn(PETSC_SUCCESS); 802 PetscCall(MatMarkDiagonal_SeqSELL(A)); 803 diag = a->diag; 804 if (!a->idiag) { 805 PetscCall(PetscMalloc3(m, &a->idiag, m, &a->mdiag, m, &a->ssor_work)); 806 val = a->val; 807 } 808 mdiag = a->mdiag; 809 idiag = a->idiag; 810 811 if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) { 812 for (i = 0; i < m; i++) { 813 mdiag[i] = val[diag[i]]; 814 if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */ 815 PetscCheck(PetscRealPart(fshift), PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Zero diagonal on row %" PetscInt_FMT, i); 816 PetscCall(PetscInfo(A, "Zero diagonal on row %" PetscInt_FMT "\n", i)); 817 A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 818 A->factorerror_zeropivot_value = 0.0; 819 A->factorerror_zeropivot_row = i; 820 } 821 idiag[i] = 1.0 / val[diag[i]]; 822 } 823 PetscCall(PetscLogFlops(m)); 824 } else { 825 for (i = 0; i < m; i++) { 826 mdiag[i] = val[diag[i]]; 827 idiag[i] = omega / (fshift + val[diag[i]]); 828 } 829 PetscCall(PetscLogFlops(2.0 * m)); 830 } 831 a->idiagvalid = PETSC_TRUE; 832 PetscFunctionReturn(PETSC_SUCCESS); 833 } 834 835 PetscErrorCode MatZeroEntries_SeqSELL(Mat A) 836 { 837 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 838 839 PetscFunctionBegin; 840 PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices])); 841 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 842 PetscFunctionReturn(PETSC_SUCCESS); 843 } 844 845 PetscErrorCode MatDestroy_SeqSELL(Mat A) 846 { 847 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 848 849 PetscFunctionBegin; 850 #if defined(PETSC_USE_LOG) 851 PetscCall(PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->n, A->cmap->n, a->nz)); 852 #endif 853 PetscCall(MatSeqXSELLFreeSELL(A, &a->val, &a->colidx)); 854 PetscCall(ISDestroy(&a->row)); 855 PetscCall(ISDestroy(&a->col)); 856 PetscCall(PetscFree(a->diag)); 857 PetscCall(PetscFree(a->rlen)); 858 PetscCall(PetscFree(a->sliidx)); 859 PetscCall(PetscFree3(a->idiag, a->mdiag, a->ssor_work)); 860 PetscCall(PetscFree(a->solve_work)); 861 PetscCall(ISDestroy(&a->icol)); 862 PetscCall(PetscFree(a->saved_values)); 863 PetscCall(PetscFree2(a->getrowcols, a->getrowvals)); 864 865 PetscCall(PetscFree(A->data)); 866 867 PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL)); 868 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL)); 869 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL)); 870 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetPreallocation_C", NULL)); 871 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetArray_C", NULL)); 872 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLRestoreArray_C", NULL)); 873 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqaij_C", NULL)); 874 PetscFunctionReturn(PETSC_SUCCESS); 875 } 876 877 PetscErrorCode MatSetOption_SeqSELL(Mat A, MatOption op, PetscBool flg) 878 { 879 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 880 881 PetscFunctionBegin; 882 switch (op) { 883 case MAT_ROW_ORIENTED: 884 a->roworiented = flg; 885 break; 886 case MAT_KEEP_NONZERO_PATTERN: 887 a->keepnonzeropattern = flg; 888 break; 889 case MAT_NEW_NONZERO_LOCATIONS: 890 a->nonew = (flg ? 0 : 1); 891 break; 892 case MAT_NEW_NONZERO_LOCATION_ERR: 893 a->nonew = (flg ? -1 : 0); 894 break; 895 case MAT_NEW_NONZERO_ALLOCATION_ERR: 896 a->nonew = (flg ? -2 : 0); 897 break; 898 case MAT_UNUSED_NONZERO_LOCATION_ERR: 899 a->nounused = (flg ? -1 : 0); 900 break; 901 case MAT_FORCE_DIAGONAL_ENTRIES: 902 case MAT_IGNORE_OFF_PROC_ENTRIES: 903 case MAT_USE_HASH_TABLE: 904 case MAT_SORTED_FULL: 905 PetscCall(PetscInfo(A, "Option %s ignored\n", MatOptions[op])); 906 break; 907 case MAT_SPD: 908 case MAT_SYMMETRIC: 909 case MAT_STRUCTURALLY_SYMMETRIC: 910 case MAT_HERMITIAN: 911 case MAT_SYMMETRY_ETERNAL: 912 case MAT_STRUCTURAL_SYMMETRY_ETERNAL: 913 case MAT_SPD_ETERNAL: 914 /* These options are handled directly by MatSetOption() */ 915 break; 916 default: 917 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unknown option %d", op); 918 } 919 PetscFunctionReturn(PETSC_SUCCESS); 920 } 921 922 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A, Vec v) 923 { 924 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 925 PetscInt i, j, n, shift; 926 PetscScalar *x, zero = 0.0; 927 928 PetscFunctionBegin; 929 PetscCall(VecGetLocalSize(v, &n)); 930 PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 931 932 if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) { 933 PetscInt *diag = a->diag; 934 PetscCall(VecGetArray(v, &x)); 935 for (i = 0; i < n; i++) x[i] = 1.0 / a->val[diag[i]]; 936 PetscCall(VecRestoreArray(v, &x)); 937 PetscFunctionReturn(PETSC_SUCCESS); 938 } 939 940 PetscCall(VecSet(v, zero)); 941 PetscCall(VecGetArray(v, &x)); 942 for (i = 0; i < n; i++) { /* loop over rows */ 943 shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */ 944 x[i] = 0; 945 for (j = 0; j < a->rlen[i]; j++) { 946 if (a->colidx[shift + j * 8] == i) { 947 x[i] = a->val[shift + j * 8]; 948 break; 949 } 950 } 951 } 952 PetscCall(VecRestoreArray(v, &x)); 953 PetscFunctionReturn(PETSC_SUCCESS); 954 } 955 956 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A, Vec ll, Vec rr) 957 { 958 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 959 const PetscScalar *l, *r; 960 PetscInt i, j, m, n, row; 961 962 PetscFunctionBegin; 963 if (ll) { 964 /* The local size is used so that VecMPI can be passed to this routine 965 by MatDiagonalScale_MPISELL */ 966 PetscCall(VecGetLocalSize(ll, &m)); 967 PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 968 PetscCall(VecGetArrayRead(ll, &l)); 969 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 970 if (i == a->totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */ 971 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) { 972 if (row < (A->rmap->n & 0x07)) a->val[j] *= l[8 * i + row]; 973 } 974 } else { 975 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) a->val[j] *= l[8 * i + row]; 976 } 977 } 978 PetscCall(VecRestoreArrayRead(ll, &l)); 979 PetscCall(PetscLogFlops(a->nz)); 980 } 981 if (rr) { 982 PetscCall(VecGetLocalSize(rr, &n)); 983 PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 984 PetscCall(VecGetArrayRead(rr, &r)); 985 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 986 if (i == a->totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */ 987 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) { 988 if (row < (A->rmap->n & 0x07)) a->val[j] *= r[a->colidx[j]]; 989 } 990 } else { 991 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j++) a->val[j] *= r[a->colidx[j]]; 992 } 993 } 994 PetscCall(VecRestoreArrayRead(rr, &r)); 995 PetscCall(PetscLogFlops(a->nz)); 996 } 997 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 998 PetscFunctionReturn(PETSC_SUCCESS); 999 } 1000 1001 PetscErrorCode MatGetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[]) 1002 { 1003 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1004 PetscInt *cp, i, k, low, high, t, row, col, l; 1005 PetscInt shift; 1006 MatScalar *vp; 1007 1008 PetscFunctionBegin; 1009 for (k = 0; k < m; k++) { /* loop over requested rows */ 1010 row = im[k]; 1011 if (row < 0) continue; 1012 PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1); 1013 shift = a->sliidx[row >> 3] + (row & 0x07); /* starting index of the row */ 1014 cp = a->colidx + shift; /* pointer to the row */ 1015 vp = a->val + shift; /* pointer to the row */ 1016 for (l = 0; l < n; l++) { /* loop over requested columns */ 1017 col = in[l]; 1018 if (col < 0) continue; 1019 PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1); 1020 high = a->rlen[row]; 1021 low = 0; /* assume unsorted */ 1022 while (high - low > 5) { 1023 t = (low + high) / 2; 1024 if (*(cp + t * 8) > col) high = t; 1025 else low = t; 1026 } 1027 for (i = low; i < high; i++) { 1028 if (*(cp + 8 * i) > col) break; 1029 if (*(cp + 8 * i) == col) { 1030 *v++ = *(vp + 8 * i); 1031 goto finished; 1032 } 1033 } 1034 *v++ = 0.0; 1035 finished:; 1036 } 1037 } 1038 PetscFunctionReturn(PETSC_SUCCESS); 1039 } 1040 1041 PetscErrorCode MatView_SeqSELL_ASCII(Mat A, PetscViewer viewer) 1042 { 1043 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1044 PetscInt i, j, m = A->rmap->n, shift; 1045 const char *name; 1046 PetscViewerFormat format; 1047 1048 PetscFunctionBegin; 1049 PetscCall(PetscViewerGetFormat(viewer, &format)); 1050 if (format == PETSC_VIEWER_ASCII_MATLAB) { 1051 PetscInt nofinalvalue = 0; 1052 /* 1053 if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) { 1054 nofinalvalue = 1; 1055 } 1056 */ 1057 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1058 PetscCall(PetscViewerASCIIPrintf(viewer, "%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n", m, A->cmap->n)); 1059 PetscCall(PetscViewerASCIIPrintf(viewer, "%% Nonzeros = %" PetscInt_FMT " \n", a->nz)); 1060 #if defined(PETSC_USE_COMPLEX) 1061 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",4);\n", a->nz + nofinalvalue)); 1062 #else 1063 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",3);\n", a->nz + nofinalvalue)); 1064 #endif 1065 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = [\n")); 1066 1067 for (i = 0; i < m; i++) { 1068 shift = a->sliidx[i >> 3] + (i & 0x07); 1069 for (j = 0; j < a->rlen[i]; j++) { 1070 #if defined(PETSC_USE_COMPLEX) 1071 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n", i + 1, a->colidx[shift + 8 * j] + 1, (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j]))); 1072 #else 1073 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n", i + 1, a->colidx[shift + 8 * j] + 1, (double)a->val[shift + 8 * j])); 1074 #endif 1075 } 1076 } 1077 /* 1078 if (nofinalvalue) { 1079 #if defined(PETSC_USE_COMPLEX) 1080 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n",m,A->cmap->n,0.,0.)); 1081 #else 1082 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n",m,A->cmap->n,0.0)); 1083 #endif 1084 } 1085 */ 1086 PetscCall(PetscObjectGetName((PetscObject)A, &name)); 1087 PetscCall(PetscViewerASCIIPrintf(viewer, "];\n %s = spconvert(zzz);\n", name)); 1088 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1089 } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) { 1090 PetscFunctionReturn(PETSC_SUCCESS); 1091 } else if (format == PETSC_VIEWER_ASCII_COMMON) { 1092 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1093 for (i = 0; i < m; i++) { 1094 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1095 shift = a->sliidx[i >> 3] + (i & 0x07); 1096 for (j = 0; j < a->rlen[i]; j++) { 1097 #if defined(PETSC_USE_COMPLEX) 1098 if (PetscImaginaryPart(a->val[shift + 8 * j]) > 0.0 && PetscRealPart(a->val[shift + 8 * j]) != 0.0) { 1099 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j]))); 1100 } else if (PetscImaginaryPart(a->val[shift + 8 * j]) < 0.0 && PetscRealPart(a->val[shift + 8 * j]) != 0.0) { 1101 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)-PetscImaginaryPart(a->val[shift + 8 * j]))); 1102 } else if (PetscRealPart(a->val[shift + 8 * j]) != 0.0) { 1103 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]))); 1104 } 1105 #else 1106 if (a->val[shift + 8 * j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)a->val[shift + 8 * j])); 1107 #endif 1108 } 1109 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1110 } 1111 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1112 } else if (format == PETSC_VIEWER_ASCII_DENSE) { 1113 PetscInt cnt = 0, jcnt; 1114 PetscScalar value; 1115 #if defined(PETSC_USE_COMPLEX) 1116 PetscBool realonly = PETSC_TRUE; 1117 for (i = 0; i < a->sliidx[a->totalslices]; i++) { 1118 if (PetscImaginaryPart(a->val[i]) != 0.0) { 1119 realonly = PETSC_FALSE; 1120 break; 1121 } 1122 } 1123 #endif 1124 1125 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1126 for (i = 0; i < m; i++) { 1127 jcnt = 0; 1128 shift = a->sliidx[i >> 3] + (i & 0x07); 1129 for (j = 0; j < A->cmap->n; j++) { 1130 if (jcnt < a->rlen[i] && j == a->colidx[shift + 8 * j]) { 1131 value = a->val[cnt++]; 1132 jcnt++; 1133 } else { 1134 value = 0.0; 1135 } 1136 #if defined(PETSC_USE_COMPLEX) 1137 if (realonly) { 1138 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)PetscRealPart(value))); 1139 } else { 1140 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e+%7.5e i ", (double)PetscRealPart(value), (double)PetscImaginaryPart(value))); 1141 } 1142 #else 1143 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)value)); 1144 #endif 1145 } 1146 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1147 } 1148 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1149 } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) { 1150 PetscInt fshift = 1; 1151 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1152 #if defined(PETSC_USE_COMPLEX) 1153 PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate complex general\n")); 1154 #else 1155 PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate real general\n")); 1156 #endif 1157 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz)); 1158 for (i = 0; i < m; i++) { 1159 shift = a->sliidx[i >> 3] + (i & 0x07); 1160 for (j = 0; j < a->rlen[i]; j++) { 1161 #if defined(PETSC_USE_COMPLEX) 1162 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n", i + fshift, a->colidx[shift + 8 * j] + fshift, (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j]))); 1163 #else 1164 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g\n", i + fshift, a->colidx[shift + 8 * j] + fshift, (double)a->val[shift + 8 * j])); 1165 #endif 1166 } 1167 } 1168 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1169 } else if (format == PETSC_VIEWER_NATIVE) { 1170 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 1171 PetscInt row; 1172 PetscCall(PetscViewerASCIIPrintf(viewer, "slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n", i, a->sliidx[i], a->sliidx[i + 1])); 1173 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) { 1174 #if defined(PETSC_USE_COMPLEX) 1175 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1176 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1177 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1178 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), -(double)PetscImaginaryPart(a->val[j]))); 1179 } else { 1180 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]))); 1181 } 1182 #else 1183 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", 8 * i + row, a->colidx[j], (double)a->val[j])); 1184 #endif 1185 } 1186 } 1187 } else { 1188 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1189 if (A->factortype) { 1190 for (i = 0; i < m; i++) { 1191 shift = a->sliidx[i >> 3] + (i & 0x07); 1192 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1193 /* L part */ 1194 for (j = shift; j < a->diag[i]; j += 8) { 1195 #if defined(PETSC_USE_COMPLEX) 1196 if (PetscImaginaryPart(a->val[shift + 8 * j]) > 0.0) { 1197 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1198 } else if (PetscImaginaryPart(a->val[shift + 8 * j]) < 0.0) { 1199 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j])))); 1200 } else { 1201 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j]))); 1202 } 1203 #else 1204 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j])); 1205 #endif 1206 } 1207 /* diagonal */ 1208 j = a->diag[i]; 1209 #if defined(PETSC_USE_COMPLEX) 1210 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1211 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)PetscImaginaryPart(1.0 / a->val[j]))); 1212 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1213 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)(-PetscImaginaryPart(1.0 / a->val[j])))); 1214 } else { 1215 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]))); 1216 } 1217 #else 1218 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)(1.0 / a->val[j]))); 1219 #endif 1220 1221 /* U part */ 1222 for (j = a->diag[i] + 1; j < shift + 8 * a->rlen[i]; j += 8) { 1223 #if defined(PETSC_USE_COMPLEX) 1224 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1225 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1226 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1227 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j])))); 1228 } else { 1229 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j]))); 1230 } 1231 #else 1232 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j])); 1233 #endif 1234 } 1235 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1236 } 1237 } else { 1238 for (i = 0; i < m; i++) { 1239 shift = a->sliidx[i >> 3] + (i & 0x07); 1240 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1241 for (j = 0; j < a->rlen[i]; j++) { 1242 #if defined(PETSC_USE_COMPLEX) 1243 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1244 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j]))); 1245 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1246 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)-PetscImaginaryPart(a->val[shift + 8 * j]))); 1247 } else { 1248 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]))); 1249 } 1250 #else 1251 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)a->val[shift + 8 * j])); 1252 #endif 1253 } 1254 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1255 } 1256 } 1257 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1258 } 1259 PetscCall(PetscViewerFlush(viewer)); 1260 PetscFunctionReturn(PETSC_SUCCESS); 1261 } 1262 1263 #include <petscdraw.h> 1264 PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw, void *Aa) 1265 { 1266 Mat A = (Mat)Aa; 1267 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1268 PetscInt i, j, m = A->rmap->n, shift; 1269 int color; 1270 PetscReal xl, yl, xr, yr, x_l, x_r, y_l, y_r; 1271 PetscViewer viewer; 1272 PetscViewerFormat format; 1273 1274 PetscFunctionBegin; 1275 PetscCall(PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer)); 1276 PetscCall(PetscViewerGetFormat(viewer, &format)); 1277 PetscCall(PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr)); 1278 1279 /* loop over matrix elements drawing boxes */ 1280 1281 if (format != PETSC_VIEWER_DRAW_CONTOUR) { 1282 PetscDrawCollectiveBegin(draw); 1283 /* Blue for negative, Cyan for zero and Red for positive */ 1284 color = PETSC_DRAW_BLUE; 1285 for (i = 0; i < m; i++) { 1286 shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */ 1287 y_l = m - i - 1.0; 1288 y_r = y_l + 1.0; 1289 for (j = 0; j < a->rlen[i]; j++) { 1290 x_l = a->colidx[shift + j * 8]; 1291 x_r = x_l + 1.0; 1292 if (PetscRealPart(a->val[shift + 8 * j]) >= 0.) continue; 1293 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1294 } 1295 } 1296 color = PETSC_DRAW_CYAN; 1297 for (i = 0; i < m; i++) { 1298 shift = a->sliidx[i >> 3] + (i & 0x07); 1299 y_l = m - i - 1.0; 1300 y_r = y_l + 1.0; 1301 for (j = 0; j < a->rlen[i]; j++) { 1302 x_l = a->colidx[shift + j * 8]; 1303 x_r = x_l + 1.0; 1304 if (a->val[shift + 8 * j] != 0.) continue; 1305 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1306 } 1307 } 1308 color = PETSC_DRAW_RED; 1309 for (i = 0; i < m; i++) { 1310 shift = a->sliidx[i >> 3] + (i & 0x07); 1311 y_l = m - i - 1.0; 1312 y_r = y_l + 1.0; 1313 for (j = 0; j < a->rlen[i]; j++) { 1314 x_l = a->colidx[shift + j * 8]; 1315 x_r = x_l + 1.0; 1316 if (PetscRealPart(a->val[shift + 8 * j]) <= 0.) continue; 1317 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1318 } 1319 } 1320 PetscDrawCollectiveEnd(draw); 1321 } else { 1322 /* use contour shading to indicate magnitude of values */ 1323 /* first determine max of all nonzero values */ 1324 PetscReal minv = 0.0, maxv = 0.0; 1325 PetscInt count = 0; 1326 PetscDraw popup; 1327 for (i = 0; i < a->sliidx[a->totalslices]; i++) { 1328 if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]); 1329 } 1330 if (minv >= maxv) maxv = minv + PETSC_SMALL; 1331 PetscCall(PetscDrawGetPopup(draw, &popup)); 1332 PetscCall(PetscDrawScalePopup(popup, minv, maxv)); 1333 1334 PetscDrawCollectiveBegin(draw); 1335 for (i = 0; i < m; i++) { 1336 shift = a->sliidx[i >> 3] + (i & 0x07); 1337 y_l = m - i - 1.0; 1338 y_r = y_l + 1.0; 1339 for (j = 0; j < a->rlen[i]; j++) { 1340 x_l = a->colidx[shift + j * 8]; 1341 x_r = x_l + 1.0; 1342 color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]), minv, maxv); 1343 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1344 count++; 1345 } 1346 } 1347 PetscDrawCollectiveEnd(draw); 1348 } 1349 PetscFunctionReturn(PETSC_SUCCESS); 1350 } 1351 1352 #include <petscdraw.h> 1353 PetscErrorCode MatView_SeqSELL_Draw(Mat A, PetscViewer viewer) 1354 { 1355 PetscDraw draw; 1356 PetscReal xr, yr, xl, yl, h, w; 1357 PetscBool isnull; 1358 1359 PetscFunctionBegin; 1360 PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw)); 1361 PetscCall(PetscDrawIsNull(draw, &isnull)); 1362 if (isnull) PetscFunctionReturn(PETSC_SUCCESS); 1363 1364 xr = A->cmap->n; 1365 yr = A->rmap->n; 1366 h = yr / 10.0; 1367 w = xr / 10.0; 1368 xr += w; 1369 yr += h; 1370 xl = -w; 1371 yl = -h; 1372 PetscCall(PetscDrawSetCoordinates(draw, xl, yl, xr, yr)); 1373 PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer)); 1374 PetscCall(PetscDrawZoom(draw, MatView_SeqSELL_Draw_Zoom, A)); 1375 PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL)); 1376 PetscCall(PetscDrawSave(draw)); 1377 PetscFunctionReturn(PETSC_SUCCESS); 1378 } 1379 1380 PetscErrorCode MatView_SeqSELL(Mat A, PetscViewer viewer) 1381 { 1382 PetscBool iascii, isbinary, isdraw; 1383 1384 PetscFunctionBegin; 1385 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii)); 1386 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary)); 1387 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw)); 1388 if (iascii) { 1389 PetscCall(MatView_SeqSELL_ASCII(A, viewer)); 1390 } else if (isbinary) { 1391 /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */ 1392 } else if (isdraw) PetscCall(MatView_SeqSELL_Draw(A, viewer)); 1393 PetscFunctionReturn(PETSC_SUCCESS); 1394 } 1395 1396 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A, MatAssemblyType mode) 1397 { 1398 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1399 PetscInt i, shift, row_in_slice, row, nrow, *cp, lastcol, j, k; 1400 MatScalar *vp; 1401 1402 PetscFunctionBegin; 1403 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 1404 /* To do: compress out the unused elements */ 1405 PetscCall(MatMarkDiagonal_SeqSELL(A)); 1406 PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n", A->rmap->n, A->cmap->n, a->maxallocmat, a->sliidx[a->totalslices], a->nz, a->sliidx[a->totalslices] - a->nz)); 1407 PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n", a->reallocs)); 1408 PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rlenmax)); 1409 /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */ 1410 for (i = 0; i < a->totalslices; ++i) { 1411 shift = a->sliidx[i]; /* starting index of the slice */ 1412 cp = a->colidx + shift; /* pointer to the column indices of the slice */ 1413 vp = a->val + shift; /* pointer to the nonzero values of the slice */ 1414 for (row_in_slice = 0; row_in_slice < 8; ++row_in_slice) { /* loop over rows in the slice */ 1415 row = 8 * i + row_in_slice; 1416 nrow = a->rlen[row]; /* number of nonzeros in row */ 1417 /* 1418 Search for the nearest nonzero. Normally setting the index to zero may cause extra communication. 1419 But if the entire slice are empty, it is fine to use 0 since the index will not be loaded. 1420 */ 1421 lastcol = 0; 1422 if (nrow > 0) { /* nonempty row */ 1423 lastcol = cp[8 * (nrow - 1) + row_in_slice]; /* use the index from the last nonzero at current row */ 1424 } else if (!row_in_slice) { /* first row of the correct slice is empty */ 1425 for (j = 1; j < 8; j++) { 1426 if (a->rlen[8 * i + j]) { 1427 lastcol = cp[j]; 1428 break; 1429 } 1430 } 1431 } else { 1432 if (a->sliidx[i + 1] != shift) lastcol = cp[row_in_slice - 1]; /* use the index from the previous row */ 1433 } 1434 1435 for (k = nrow; k < (a->sliidx[i + 1] - shift) / 8; ++k) { 1436 cp[8 * k + row_in_slice] = lastcol; 1437 vp[8 * k + row_in_slice] = (MatScalar)0; 1438 } 1439 } 1440 } 1441 1442 A->info.mallocs += a->reallocs; 1443 a->reallocs = 0; 1444 1445 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 1446 PetscFunctionReturn(PETSC_SUCCESS); 1447 } 1448 1449 PetscErrorCode MatGetInfo_SeqSELL(Mat A, MatInfoType flag, MatInfo *info) 1450 { 1451 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1452 1453 PetscFunctionBegin; 1454 info->block_size = 1.0; 1455 info->nz_allocated = a->maxallocmat; 1456 info->nz_used = a->sliidx[a->totalslices]; /* include padding zeros */ 1457 info->nz_unneeded = (a->maxallocmat - a->sliidx[a->totalslices]); 1458 info->assemblies = A->num_ass; 1459 info->mallocs = A->info.mallocs; 1460 info->memory = 0; /* REVIEW ME */ 1461 if (A->factortype) { 1462 info->fill_ratio_given = A->info.fill_ratio_given; 1463 info->fill_ratio_needed = A->info.fill_ratio_needed; 1464 info->factor_mallocs = A->info.factor_mallocs; 1465 } else { 1466 info->fill_ratio_given = 0; 1467 info->fill_ratio_needed = 0; 1468 info->factor_mallocs = 0; 1469 } 1470 PetscFunctionReturn(PETSC_SUCCESS); 1471 } 1472 1473 PetscErrorCode MatSetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is) 1474 { 1475 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1476 PetscInt shift, i, k, l, low, high, t, ii, row, col, nrow; 1477 PetscInt *cp, nonew = a->nonew, lastcol = -1; 1478 MatScalar *vp, value; 1479 1480 PetscFunctionBegin; 1481 for (k = 0; k < m; k++) { /* loop over added rows */ 1482 row = im[k]; 1483 if (row < 0) continue; 1484 PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1); 1485 shift = a->sliidx[row >> 3] + (row & 0x07); /* starting index of the row */ 1486 cp = a->colidx + shift; /* pointer to the row */ 1487 vp = a->val + shift; /* pointer to the row */ 1488 nrow = a->rlen[row]; 1489 low = 0; 1490 high = nrow; 1491 1492 for (l = 0; l < n; l++) { /* loop over added columns */ 1493 col = in[l]; 1494 if (col < 0) continue; 1495 PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1); 1496 if (a->roworiented) { 1497 value = v[l + k * n]; 1498 } else { 1499 value = v[k + l * m]; 1500 } 1501 if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue; 1502 1503 /* search in this row for the specified column, i indicates the column to be set */ 1504 if (col <= lastcol) low = 0; 1505 else high = nrow; 1506 lastcol = col; 1507 while (high - low > 5) { 1508 t = (low + high) / 2; 1509 if (*(cp + t * 8) > col) high = t; 1510 else low = t; 1511 } 1512 for (i = low; i < high; i++) { 1513 if (*(cp + i * 8) > col) break; 1514 if (*(cp + i * 8) == col) { 1515 if (is == ADD_VALUES) *(vp + i * 8) += value; 1516 else *(vp + i * 8) = value; 1517 low = i + 1; 1518 goto noinsert; 1519 } 1520 } 1521 if (value == 0.0 && a->ignorezeroentries) goto noinsert; 1522 if (nonew == 1) goto noinsert; 1523 PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col); 1524 /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */ 1525 MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, row / 8, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar); 1526 /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */ 1527 for (ii = nrow - 1; ii >= i; ii--) { 1528 *(cp + (ii + 1) * 8) = *(cp + ii * 8); 1529 *(vp + (ii + 1) * 8) = *(vp + ii * 8); 1530 } 1531 a->rlen[row]++; 1532 *(cp + i * 8) = col; 1533 *(vp + i * 8) = value; 1534 a->nz++; 1535 A->nonzerostate++; 1536 low = i + 1; 1537 high++; 1538 nrow++; 1539 noinsert:; 1540 } 1541 a->rlen[row] = nrow; 1542 } 1543 PetscFunctionReturn(PETSC_SUCCESS); 1544 } 1545 1546 PetscErrorCode MatCopy_SeqSELL(Mat A, Mat B, MatStructure str) 1547 { 1548 PetscFunctionBegin; 1549 /* If the two matrices have the same copy implementation, use fast copy. */ 1550 if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) { 1551 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1552 Mat_SeqSELL *b = (Mat_SeqSELL *)B->data; 1553 1554 PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices], PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Number of nonzeros in two matrices are different"); 1555 PetscCall(PetscArraycpy(b->val, a->val, a->sliidx[a->totalslices])); 1556 } else { 1557 PetscCall(MatCopy_Basic(A, B, str)); 1558 } 1559 PetscFunctionReturn(PETSC_SUCCESS); 1560 } 1561 1562 PetscErrorCode MatSetUp_SeqSELL(Mat A) 1563 { 1564 PetscFunctionBegin; 1565 PetscCall(MatSeqSELLSetPreallocation(A, PETSC_DEFAULT, NULL)); 1566 PetscFunctionReturn(PETSC_SUCCESS); 1567 } 1568 1569 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A, PetscScalar *array[]) 1570 { 1571 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1572 1573 PetscFunctionBegin; 1574 *array = a->val; 1575 PetscFunctionReturn(PETSC_SUCCESS); 1576 } 1577 1578 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A, PetscScalar *array[]) 1579 { 1580 PetscFunctionBegin; 1581 PetscFunctionReturn(PETSC_SUCCESS); 1582 } 1583 1584 PetscErrorCode MatRealPart_SeqSELL(Mat A) 1585 { 1586 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1587 PetscInt i; 1588 MatScalar *aval = a->val; 1589 1590 PetscFunctionBegin; 1591 for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscRealPart(aval[i]); 1592 PetscFunctionReturn(PETSC_SUCCESS); 1593 } 1594 1595 PetscErrorCode MatImaginaryPart_SeqSELL(Mat A) 1596 { 1597 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1598 PetscInt i; 1599 MatScalar *aval = a->val; 1600 1601 PetscFunctionBegin; 1602 for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]); 1603 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 1604 PetscFunctionReturn(PETSC_SUCCESS); 1605 } 1606 1607 PetscErrorCode MatScale_SeqSELL(Mat inA, PetscScalar alpha) 1608 { 1609 Mat_SeqSELL *a = (Mat_SeqSELL *)inA->data; 1610 MatScalar *aval = a->val; 1611 PetscScalar oalpha = alpha; 1612 PetscBLASInt one = 1, size; 1613 1614 PetscFunctionBegin; 1615 PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices], &size)); 1616 PetscCallBLAS("BLASscal", BLASscal_(&size, &oalpha, aval, &one)); 1617 PetscCall(PetscLogFlops(a->nz)); 1618 PetscCall(MatSeqSELLInvalidateDiagonal(inA)); 1619 PetscFunctionReturn(PETSC_SUCCESS); 1620 } 1621 1622 PetscErrorCode MatShift_SeqSELL(Mat Y, PetscScalar a) 1623 { 1624 Mat_SeqSELL *y = (Mat_SeqSELL *)Y->data; 1625 1626 PetscFunctionBegin; 1627 if (!Y->preallocated || !y->nz) PetscCall(MatSeqSELLSetPreallocation(Y, 1, NULL)); 1628 PetscCall(MatShift_Basic(Y, a)); 1629 PetscFunctionReturn(PETSC_SUCCESS); 1630 } 1631 1632 PetscErrorCode MatSOR_SeqSELL(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx) 1633 { 1634 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1635 PetscScalar *x, sum, *t; 1636 const MatScalar *idiag = NULL, *mdiag; 1637 const PetscScalar *b, *xb; 1638 PetscInt n, m = A->rmap->n, i, j, shift; 1639 const PetscInt *diag; 1640 1641 PetscFunctionBegin; 1642 its = its * lits; 1643 1644 if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */ 1645 if (!a->idiagvalid) PetscCall(MatInvertDiagonal_SeqSELL(A, omega, fshift)); 1646 a->fshift = fshift; 1647 a->omega = omega; 1648 1649 diag = a->diag; 1650 t = a->ssor_work; 1651 idiag = a->idiag; 1652 mdiag = a->mdiag; 1653 1654 PetscCall(VecGetArray(xx, &x)); 1655 PetscCall(VecGetArrayRead(bb, &b)); 1656 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 1657 PetscCheck(flag != SOR_APPLY_UPPER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_UPPER is not implemented"); 1658 PetscCheck(flag != SOR_APPLY_LOWER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_LOWER is not implemented"); 1659 PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat"); 1660 1661 if (flag & SOR_ZERO_INITIAL_GUESS) { 1662 if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) { 1663 for (i = 0; i < m; i++) { 1664 shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */ 1665 sum = b[i]; 1666 n = (diag[i] - shift) / 8; 1667 for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]]; 1668 t[i] = sum; 1669 x[i] = sum * idiag[i]; 1670 } 1671 xb = t; 1672 PetscCall(PetscLogFlops(a->nz)); 1673 } else xb = b; 1674 if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1675 for (i = m - 1; i >= 0; i--) { 1676 shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */ 1677 sum = xb[i]; 1678 n = a->rlen[i] - (diag[i] - shift) / 8 - 1; 1679 for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]]; 1680 if (xb == b) { 1681 x[i] = sum * idiag[i]; 1682 } else { 1683 x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */ 1684 } 1685 } 1686 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */ 1687 } 1688 its--; 1689 } 1690 while (its--) { 1691 if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) { 1692 for (i = 0; i < m; i++) { 1693 /* lower */ 1694 shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */ 1695 sum = b[i]; 1696 n = (diag[i] - shift) / 8; 1697 for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]]; 1698 t[i] = sum; /* save application of the lower-triangular part */ 1699 /* upper */ 1700 n = a->rlen[i] - (diag[i] - shift) / 8 - 1; 1701 for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]]; 1702 x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */ 1703 } 1704 xb = t; 1705 PetscCall(PetscLogFlops(2.0 * a->nz)); 1706 } else xb = b; 1707 if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1708 for (i = m - 1; i >= 0; i--) { 1709 shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */ 1710 sum = xb[i]; 1711 if (xb == b) { 1712 /* whole matrix (no checkpointing available) */ 1713 n = a->rlen[i]; 1714 for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]]; 1715 x[i] = (1. - omega) * x[i] + (sum + mdiag[i] * x[i]) * idiag[i]; 1716 } else { /* lower-triangular part has been saved, so only apply upper-triangular */ 1717 n = a->rlen[i] - (diag[i] - shift) / 8 - 1; 1718 for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]]; 1719 x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */ 1720 } 1721 } 1722 if (xb == b) { 1723 PetscCall(PetscLogFlops(2.0 * a->nz)); 1724 } else { 1725 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */ 1726 } 1727 } 1728 } 1729 PetscCall(VecRestoreArray(xx, &x)); 1730 PetscCall(VecRestoreArrayRead(bb, &b)); 1731 PetscFunctionReturn(PETSC_SUCCESS); 1732 } 1733 1734 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL, 1735 MatGetRow_SeqSELL, 1736 MatRestoreRow_SeqSELL, 1737 MatMult_SeqSELL, 1738 /* 4*/ MatMultAdd_SeqSELL, 1739 MatMultTranspose_SeqSELL, 1740 MatMultTransposeAdd_SeqSELL, 1741 NULL, 1742 NULL, 1743 NULL, 1744 /* 10*/ NULL, 1745 NULL, 1746 NULL, 1747 MatSOR_SeqSELL, 1748 NULL, 1749 /* 15*/ MatGetInfo_SeqSELL, 1750 MatEqual_SeqSELL, 1751 MatGetDiagonal_SeqSELL, 1752 MatDiagonalScale_SeqSELL, 1753 NULL, 1754 /* 20*/ NULL, 1755 MatAssemblyEnd_SeqSELL, 1756 MatSetOption_SeqSELL, 1757 MatZeroEntries_SeqSELL, 1758 /* 24*/ NULL, 1759 NULL, 1760 NULL, 1761 NULL, 1762 NULL, 1763 /* 29*/ MatSetUp_SeqSELL, 1764 NULL, 1765 NULL, 1766 NULL, 1767 NULL, 1768 /* 34*/ MatDuplicate_SeqSELL, 1769 NULL, 1770 NULL, 1771 NULL, 1772 NULL, 1773 /* 39*/ NULL, 1774 NULL, 1775 NULL, 1776 MatGetValues_SeqSELL, 1777 MatCopy_SeqSELL, 1778 /* 44*/ NULL, 1779 MatScale_SeqSELL, 1780 MatShift_SeqSELL, 1781 NULL, 1782 NULL, 1783 /* 49*/ NULL, 1784 NULL, 1785 NULL, 1786 NULL, 1787 NULL, 1788 /* 54*/ MatFDColoringCreate_SeqXAIJ, 1789 NULL, 1790 NULL, 1791 NULL, 1792 NULL, 1793 /* 59*/ NULL, 1794 MatDestroy_SeqSELL, 1795 MatView_SeqSELL, 1796 NULL, 1797 NULL, 1798 /* 64*/ NULL, 1799 NULL, 1800 NULL, 1801 NULL, 1802 NULL, 1803 /* 69*/ NULL, 1804 NULL, 1805 NULL, 1806 NULL, 1807 NULL, 1808 /* 74*/ NULL, 1809 MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */ 1810 NULL, 1811 NULL, 1812 NULL, 1813 /* 79*/ NULL, 1814 NULL, 1815 NULL, 1816 NULL, 1817 NULL, 1818 /* 84*/ NULL, 1819 NULL, 1820 NULL, 1821 NULL, 1822 NULL, 1823 /* 89*/ NULL, 1824 NULL, 1825 NULL, 1826 NULL, 1827 NULL, 1828 /* 94*/ NULL, 1829 NULL, 1830 NULL, 1831 NULL, 1832 NULL, 1833 /* 99*/ NULL, 1834 NULL, 1835 NULL, 1836 MatConjugate_SeqSELL, 1837 NULL, 1838 /*104*/ NULL, 1839 NULL, 1840 NULL, 1841 NULL, 1842 NULL, 1843 /*109*/ NULL, 1844 NULL, 1845 NULL, 1846 NULL, 1847 MatMissingDiagonal_SeqSELL, 1848 /*114*/ NULL, 1849 NULL, 1850 NULL, 1851 NULL, 1852 NULL, 1853 /*119*/ NULL, 1854 NULL, 1855 NULL, 1856 NULL, 1857 NULL, 1858 /*124*/ NULL, 1859 NULL, 1860 NULL, 1861 NULL, 1862 NULL, 1863 /*129*/ NULL, 1864 NULL, 1865 NULL, 1866 NULL, 1867 NULL, 1868 /*134*/ NULL, 1869 NULL, 1870 NULL, 1871 NULL, 1872 NULL, 1873 /*139*/ NULL, 1874 NULL, 1875 NULL, 1876 MatFDColoringSetUp_SeqXAIJ, 1877 NULL, 1878 /*144*/ NULL, 1879 NULL, 1880 NULL, 1881 NULL, 1882 NULL, 1883 NULL, 1884 /*150*/ NULL, 1885 NULL}; 1886 1887 PetscErrorCode MatStoreValues_SeqSELL(Mat mat) 1888 { 1889 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1890 1891 PetscFunctionBegin; 1892 PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first"); 1893 1894 /* allocate space for values if not already there */ 1895 if (!a->saved_values) PetscCall(PetscMalloc1(a->sliidx[a->totalslices] + 1, &a->saved_values)); 1896 1897 /* copy values over */ 1898 PetscCall(PetscArraycpy(a->saved_values, a->val, a->sliidx[a->totalslices])); 1899 PetscFunctionReturn(PETSC_SUCCESS); 1900 } 1901 1902 PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat) 1903 { 1904 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1905 1906 PetscFunctionBegin; 1907 PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first"); 1908 PetscCheck(a->saved_values, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatStoreValues(A);first"); 1909 PetscCall(PetscArraycpy(a->val, a->saved_values, a->sliidx[a->totalslices])); 1910 PetscFunctionReturn(PETSC_SUCCESS); 1911 } 1912 1913 /*@C 1914 MatSeqSELLRestoreArray - returns access to the array where the data for a `MATSEQSELL` matrix is stored obtained by `MatSeqSELLGetArray()` 1915 1916 Not Collective 1917 1918 Input Parameters: 1919 + mat - a `MATSEQSELL` matrix 1920 - array - pointer to the data 1921 1922 Level: intermediate 1923 1924 .seealso: `Mat`, `MATSEQSELL`, `MatSeqSELLGetArray()`, `MatSeqSELLRestoreArrayF90()` 1925 @*/ 1926 PetscErrorCode MatSeqSELLRestoreArray(Mat A, PetscScalar **array) 1927 { 1928 PetscFunctionBegin; 1929 PetscUseMethod(A, "MatSeqSELLRestoreArray_C", (Mat, PetscScalar **), (A, array)); 1930 PetscFunctionReturn(PETSC_SUCCESS); 1931 } 1932 1933 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B) 1934 { 1935 Mat_SeqSELL *b; 1936 PetscMPIInt size; 1937 1938 PetscFunctionBegin; 1939 PetscCall(PetscCitationsRegister(citation, &cited)); 1940 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size)); 1941 PetscCheck(size <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Comm must be of size 1"); 1942 1943 PetscCall(PetscNew(&b)); 1944 1945 B->data = (void *)b; 1946 1947 PetscCall(PetscMemcpy(B->ops, &MatOps_Values, sizeof(struct _MatOps))); 1948 1949 b->row = NULL; 1950 b->col = NULL; 1951 b->icol = NULL; 1952 b->reallocs = 0; 1953 b->ignorezeroentries = PETSC_FALSE; 1954 b->roworiented = PETSC_TRUE; 1955 b->nonew = 0; 1956 b->diag = NULL; 1957 b->solve_work = NULL; 1958 B->spptr = NULL; 1959 b->saved_values = NULL; 1960 b->idiag = NULL; 1961 b->mdiag = NULL; 1962 b->ssor_work = NULL; 1963 b->omega = 1.0; 1964 b->fshift = 0.0; 1965 b->idiagvalid = PETSC_FALSE; 1966 b->keepnonzeropattern = PETSC_FALSE; 1967 1968 PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELL)); 1969 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetArray_C", MatSeqSELLGetArray_SeqSELL)); 1970 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLRestoreArray_C", MatSeqSELLRestoreArray_SeqSELL)); 1971 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqSELL)); 1972 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqSELL)); 1973 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetPreallocation_C", MatSeqSELLSetPreallocation_SeqSELL)); 1974 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqaij_C", MatConvert_SeqSELL_SeqAIJ)); 1975 PetscFunctionReturn(PETSC_SUCCESS); 1976 } 1977 1978 /* 1979 Given a matrix generated with MatGetFactor() duplicates all the information in A into B 1980 */ 1981 PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace) 1982 { 1983 Mat_SeqSELL *c = (Mat_SeqSELL *)C->data, *a = (Mat_SeqSELL *)A->data; 1984 PetscInt i, m = A->rmap->n; 1985 PetscInt totalslices = a->totalslices; 1986 1987 PetscFunctionBegin; 1988 C->factortype = A->factortype; 1989 c->row = NULL; 1990 c->col = NULL; 1991 c->icol = NULL; 1992 c->reallocs = 0; 1993 C->assembled = PETSC_TRUE; 1994 1995 PetscCall(PetscLayoutReference(A->rmap, &C->rmap)); 1996 PetscCall(PetscLayoutReference(A->cmap, &C->cmap)); 1997 1998 PetscCall(PetscMalloc1(8 * totalslices, &c->rlen)); 1999 PetscCall(PetscMalloc1(totalslices + 1, &c->sliidx)); 2000 2001 for (i = 0; i < m; i++) c->rlen[i] = a->rlen[i]; 2002 for (i = 0; i < totalslices + 1; i++) c->sliidx[i] = a->sliidx[i]; 2003 2004 /* allocate the matrix space */ 2005 if (mallocmatspace) { 2006 PetscCall(PetscMalloc2(a->maxallocmat, &c->val, a->maxallocmat, &c->colidx)); 2007 2008 c->singlemalloc = PETSC_TRUE; 2009 2010 if (m > 0) { 2011 PetscCall(PetscArraycpy(c->colidx, a->colidx, a->maxallocmat)); 2012 if (cpvalues == MAT_COPY_VALUES) { 2013 PetscCall(PetscArraycpy(c->val, a->val, a->maxallocmat)); 2014 } else { 2015 PetscCall(PetscArrayzero(c->val, a->maxallocmat)); 2016 } 2017 } 2018 } 2019 2020 c->ignorezeroentries = a->ignorezeroentries; 2021 c->roworiented = a->roworiented; 2022 c->nonew = a->nonew; 2023 if (a->diag) { 2024 PetscCall(PetscMalloc1(m, &c->diag)); 2025 for (i = 0; i < m; i++) c->diag[i] = a->diag[i]; 2026 } else c->diag = NULL; 2027 2028 c->solve_work = NULL; 2029 c->saved_values = NULL; 2030 c->idiag = NULL; 2031 c->ssor_work = NULL; 2032 c->keepnonzeropattern = a->keepnonzeropattern; 2033 c->free_val = PETSC_TRUE; 2034 c->free_colidx = PETSC_TRUE; 2035 2036 c->maxallocmat = a->maxallocmat; 2037 c->maxallocrow = a->maxallocrow; 2038 c->rlenmax = a->rlenmax; 2039 c->nz = a->nz; 2040 C->preallocated = PETSC_TRUE; 2041 2042 c->nonzerorowcnt = a->nonzerorowcnt; 2043 C->nonzerostate = A->nonzerostate; 2044 2045 PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist)); 2046 PetscFunctionReturn(PETSC_SUCCESS); 2047 } 2048 2049 PetscErrorCode MatDuplicate_SeqSELL(Mat A, MatDuplicateOption cpvalues, Mat *B) 2050 { 2051 PetscFunctionBegin; 2052 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 2053 PetscCall(MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n)); 2054 if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 2055 PetscCall(MatSetType(*B, ((PetscObject)A)->type_name)); 2056 PetscCall(MatDuplicateNoCreate_SeqSELL(*B, A, cpvalues, PETSC_TRUE)); 2057 PetscFunctionReturn(PETSC_SUCCESS); 2058 } 2059 2060 /*MC 2061 MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices, 2062 based on the sliced Ellpack format 2063 2064 Options Database Key: 2065 . -mat_type seqsell - sets the matrix type to "`MATSEQELL` during a call to `MatSetFromOptions()` 2066 2067 Level: beginner 2068 2069 .seealso: `Mat`, `MatCreateSeqSell()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ` 2070 M*/ 2071 2072 /*MC 2073 MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices. 2074 2075 This matrix type is identical to `MATSEQSELL` when constructed with a single process communicator, 2076 and `MATMPISELL` otherwise. As a result, for single process communicators, 2077 `MatSeqSELLSetPreallocation()` is supported, and similarly `MatMPISELLSetPreallocation()` is supported 2078 for communicators controlling multiple processes. It is recommended that you call both of 2079 the above preallocation routines for simplicity. 2080 2081 Options Database Key: 2082 . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions() 2083 2084 Level: beginner 2085 2086 Notes: 2087 This format is only supported for real scalars, double precision, and 32 bit indices (the defaults). 2088 2089 It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of 2090 non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement. 2091 2092 Developer Notes: 2093 On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance. 2094 2095 The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8 2096 .vb 2097 (2 0 3 4) 2098 Consider the matrix A = (5 0 6 0) 2099 (0 0 7 8) 2100 (0 0 9 9) 2101 2102 symbolically the Ellpack format can be written as 2103 2104 (2 3 4 |) (0 2 3 |) 2105 v = (5 6 0 |) colidx = (0 2 2 |) 2106 -------- --------- 2107 (7 8 |) (2 3 |) 2108 (9 9 |) (2 3 |) 2109 2110 The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case). 2111 Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and 2112 zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice. 2113 2114 The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9) and for colidx is (0 0 2 2 3 2 2 2 3 3) 2115 2116 .ve 2117 2118 See MatMult_SeqSELL() for how this format is used with the SIMD operations to achieve high performance. 2119 2120 References: 2121 . * - Hong Zhang, Richard T. Mills, Karl Rupp, and Barry F. Smith, Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}, 2122 Proceedings of the 47th International Conference on Parallel Processing, 2018. 2123 2124 .seealso: `Mat`, `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSell()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ` 2125 M*/ 2126 2127 /*@C 2128 MatCreateSeqSELL - Creates a sparse matrix in `MATSEQSELL` format. 2129 2130 Collective 2131 2132 Input Parameters: 2133 + comm - MPI communicator, set to `PETSC_COMM_SELF` 2134 . m - number of rows 2135 . n - number of columns 2136 . rlenmax - maximum number of nonzeros in a row, ignored if `rlen` is provided 2137 - rlen - array containing the number of nonzeros in the various rows (possibly different for each row) or NULL 2138 2139 Output Parameter: 2140 . A - the matrix 2141 2142 Level: intermediate 2143 2144 Notes: 2145 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 2146 MatXXXXSetPreallocation() paradigm instead of this routine directly. 2147 [MatXXXXSetPreallocation() is, for example, `MatSeqSELLSetPreallocation()`] 2148 2149 Specify the preallocated storage with either `rlenmax` or `rlen` (not both). 2150 Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory 2151 allocation. 2152 2153 .seealso: `Mat`, `MATSEQSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATSEQSELL`, `MATMPISELL` 2154 @*/ 2155 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt rlenmax, const PetscInt rlen[], Mat *A) 2156 { 2157 PetscFunctionBegin; 2158 PetscCall(MatCreate(comm, A)); 2159 PetscCall(MatSetSizes(*A, m, n, m, n)); 2160 PetscCall(MatSetType(*A, MATSEQSELL)); 2161 PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A, rlenmax, rlen)); 2162 PetscFunctionReturn(PETSC_SUCCESS); 2163 } 2164 2165 PetscErrorCode MatEqual_SeqSELL(Mat A, Mat B, PetscBool *flg) 2166 { 2167 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data, *b = (Mat_SeqSELL *)B->data; 2168 PetscInt totalslices = a->totalslices; 2169 2170 PetscFunctionBegin; 2171 /* If the matrix dimensions are not equal,or no of nonzeros */ 2172 if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) || (a->nz != b->nz) || (a->rlenmax != b->rlenmax)) { 2173 *flg = PETSC_FALSE; 2174 PetscFunctionReturn(PETSC_SUCCESS); 2175 } 2176 /* if the a->colidx are the same */ 2177 PetscCall(PetscArraycmp(a->colidx, b->colidx, a->sliidx[totalslices], flg)); 2178 if (!*flg) PetscFunctionReturn(PETSC_SUCCESS); 2179 /* if a->val are the same */ 2180 PetscCall(PetscArraycmp(a->val, b->val, a->sliidx[totalslices], flg)); 2181 PetscFunctionReturn(PETSC_SUCCESS); 2182 } 2183 2184 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A) 2185 { 2186 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 2187 2188 PetscFunctionBegin; 2189 a->idiagvalid = PETSC_FALSE; 2190 PetscFunctionReturn(PETSC_SUCCESS); 2191 } 2192 2193 PetscErrorCode MatConjugate_SeqSELL(Mat A) 2194 { 2195 #if defined(PETSC_USE_COMPLEX) 2196 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 2197 PetscInt i; 2198 PetscScalar *val = a->val; 2199 2200 PetscFunctionBegin; 2201 for (i = 0; i < a->sliidx[a->totalslices]; i++) val[i] = PetscConj(val[i]); 2202 #else 2203 PetscFunctionBegin; 2204 #endif 2205 PetscFunctionReturn(PETSC_SUCCESS); 2206 } 2207