/*
  Defines the basic matrix operations for the SELL (sliced ELLPACK) matrix storage format.
  Rows are grouped into slices of height 8; within a slice the entries are stored
  column by column, so consecutive entries of one row are 8 elements apart in memory
  and a "slice column" of 8 rows is contiguous (SIMD friendly).
*/
#include <../src/mat/impls/sell/seq/sell.h> /*I "petscmat.h" I*/
#include <petscblaslapack.h>
#include <petsc/private/kernels/blocktranspose.h>

/* Citation for the vectorized SELL kernels; registered once elsewhere in this file */
static PetscBool  cited      = PETSC_FALSE;
static const char citation[] = "@inproceedings{ZhangELLPACK2018,\n"
                               " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n"
                               " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n"
                               " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n"
                               " year = 2018\n"
                               "}\n";

#if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)

#include <immintrin.h>

#if !defined(_MM_SCALE_8)
#define _MM_SCALE_8 8 /* gather scale factor: indices address 8-byte (double) elements */
#endif

#if defined(__AVX512F__)
/* these do not work
 vec_idx  = _mm512_loadunpackhi_epi32(vec_idx,acolidx);
 vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval);
*/
/* One y += A*x step on a full slice column (8 rows): load 8 32-bit column indices and
   8 values, gather the matching x entries, and accumulate with a fused multiply-add */
#define AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
  /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \
  vec_idx  = _mm256_loadu_si256((__m256i const *)acolidx); \
  vec_vals = _mm512_loadu_pd(aval); \
  vec_x    = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); \
  vec_y    = _mm512_fmadd_pd(vec_x, vec_vals, vec_y)
#elif defined(__AVX2__) && defined(__FMA__)
/* AVX2 variant of the kernel above: processes 4 rows (one 256-bit subslice) at a time */
#define AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
  vec_vals = _mm256_loadu_pd(aval); \
  vec_idx  = _mm_loadu_si128((__m128i const *)acolidx); /* SSE2 */ \
  vec_x    = _mm256_i32gather_pd(x, vec_idx, _MM_SCALE_8); \
  vec_y    = _mm256_fmadd_pd(vec_x, vec_vals, vec_y)
#endif
#endif /* PETSC_HAVE_IMMINTRIN_H */

/*@C
   MatSeqSELLSetPreallocation - For good matrix assembly performance the user should
   preallocate the matrix storage by setting the parameter nz (or the array nnz).
   By setting these parameters accurately, performance during matrix assembly can be
   increased significantly.

   Collective

   Input Parameters:
+  B - The `MATSEQSELL` matrix
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Notes:
   If nnz is given then nz is ignored.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz = `PETSC_DEFAULT` and nnz = NULL for PETSc to control dynamic memory
   allocation. For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   You can call `MatGetInfo()` to get information on how effective the preallocation was;
   for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
   You can also run with the option -info and look for messages with the string
   malloc in them to see if additional memory allocation was needed.

   Developers: Use nz of `MAT_SKIP_ALLOCATION` to not allocate any space for the matrix
   entries or columns indices.

   The maximum number of nonzeros in any row should be as accurate as possible.
   If it is underestimated, you will get bad performance due to reallocation
   (MatSeqXSELLReallocateSELL).

   Level: intermediate

.seealso: `MATSEQSELL`, `MATSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()`
@*/
PetscErrorCode MatSeqSELLSetPreallocation(Mat B, PetscInt rlenmax, const PetscInt rlen[]) {
  PetscFunctionBegin;
  PetscValidHeaderSpecific(B, MAT_CLASSID, 1);
  PetscValidType(B, 1);
  /* dispatch to the type-specific implementation if it is composed on B */
  PetscTryMethod(B, "MatSeqSELLSetPreallocation_C", (Mat, PetscInt, const PetscInt[]), (B, rlenmax, rlen));
  PetscFunctionReturn(0);
}

/* SEQSELL implementation of preallocation: computes the slice offsets (sliidx) from the
   per-row lengths (or a uniform maxallocrow) and allocates val/colidx/rlen storage. */
PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B, PetscInt maxallocrow, const PetscInt rlen[]) {
  Mat_SeqSELL *b;
  PetscInt     i, j, totalslices;
  PetscBool    skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE;

  PetscFunctionBegin;
  if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE;
  if (maxallocrow == MAT_SKIP_ALLOCATION) {
    skipallocation = PETSC_TRUE;
    maxallocrow    = 0;
  }

  PetscCall(PetscLayoutSetUp(B->rmap));
  PetscCall(PetscLayoutSetUp(B->cmap));

  /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */
  if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5;
  PetscCheck(maxallocrow >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "maxallocrow cannot be less than 0: value %" PetscInt_FMT, maxallocrow);
  if (rlen) {
    /* validate the user-supplied per-row nonzero counts */
    for (i = 0; i < B->rmap->n; i++) {
      PetscCheck(rlen[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, rlen[i]);
      PetscCheck(rlen[i] <= B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT, i, rlen[i], B->cmap->n);
    }
  }

  B->preallocated = PETSC_TRUE;

  b = (Mat_SeqSELL *)B->data;

  totalslices    = PetscCeilInt(B->rmap->n, 8); /* one slice per 8 rows; the last slice may contain padding rows */
  b->totalslices = totalslices;
  if (!skipallocation) {
    if (B->rmap->n & 0x07) PetscCall(PetscInfo(B, "Padding rows to the SEQSELL matrix because the number of rows is not the multiple of 8 (value %" PetscInt_FMT ")\n", B->rmap->n));

    if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */
      PetscCall(PetscMalloc1(totalslices + 1, &b->sliidx));
      PetscCall(PetscLogObjectMemory((PetscObject)B, (totalslices + 1) * sizeof(PetscInt)));
    }
    if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */
      /* NOTE(review): maxallocrow was already normalized to >= 0 above, so these two
         re-normalizations appear unreachable -- confirm before removing */
      if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10;
      else if (maxallocrow < 0) maxallocrow = 1;
      for (i = 0; i <= totalslices; i++) b->sliidx[i] = i * 8 * maxallocrow;
    } else {
      maxallocrow  = 0;
      b->sliidx[0] = 0;
      for (i = 1; i < totalslices; i++) {
        b->sliidx[i] = 0;
        /* slice width = longest row among the 8 rows of slice i-1 */
        for (j = 0; j < 8; j++) b->sliidx[i] = PetscMax(b->sliidx[i], rlen[8 * (i - 1) + j]);
        maxallocrow = PetscMax(b->sliidx[i], maxallocrow);
        /* turn the width into a cumulative offset, checking for integer overflow */
        PetscCall(PetscIntSumError(b->sliidx[i - 1], 8 * b->sliidx[i], &b->sliidx[i]));
      }
      /* last slice */
      b->sliidx[totalslices] = 0;
      for (j = (totalslices - 1) * 8; j < B->rmap->n; j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices], rlen[j]);
      maxallocrow            = PetscMax(b->sliidx[totalslices], maxallocrow);
      b->sliidx[totalslices] = b->sliidx[totalslices - 1] + 8 * b->sliidx[totalslices];
    }

    /* allocate space for val, colidx, rlen */
    /* FIXME: should B's old memory be unlogged? */
    PetscCall(MatSeqXSELLFreeSELL(B, &b->val, &b->colidx));
    /* FIXME: assuming an element of the bit array takes 8 bits */
    PetscCall(PetscMalloc2(b->sliidx[totalslices], &b->val, b->sliidx[totalslices], &b->colidx));
    PetscCall(PetscLogObjectMemory((PetscObject)B, b->sliidx[totalslices] * (sizeof(PetscScalar) + sizeof(PetscInt))));
    /* b->rlen will count nonzeros in each row so far. We don't copy rlen to b->rlen because the matrix has not been set. */
    PetscCall(PetscCalloc1(8 * totalslices, &b->rlen));
    PetscCall(PetscLogObjectMemory((PetscObject)B, 8 * totalslices * sizeof(PetscInt)));

    b->singlemalloc = PETSC_TRUE;
    b->free_val     = PETSC_TRUE;
    b->free_colidx  = PETSC_TRUE;
  } else {
    b->free_val    = PETSC_FALSE;
    b->free_colidx = PETSC_FALSE;
  }

  b->nz               = 0;
  b->maxallocrow      = maxallocrow;
  b->rlenmax          = maxallocrow;
  b->maxallocmat      = b->sliidx[totalslices];
  B->info.nz_unneeded = (double)b->maxallocmat;
  if (realalloc) PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));
  PetscFunctionReturn(0);
}

/* Copy row 'row' out of the strided slice storage into contiguous scratch arrays
   owned by the matrix (a->getrowcols / a->getrowvals). */
PetscErrorCode MatGetRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v) {
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
  PetscInt     shift;

  PetscFunctionBegin;
  PetscCheck(row >= 0 && row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range", row);
  if (nz) *nz = a->rlen[row];
  shift = a->sliidx[row >> 3] + (row & 0x07); /* index of the row's first entry within its slice */
  if (!a->getrowcols) PetscCall(PetscMalloc2(a->rlenmax, &a->getrowcols, a->rlenmax, &a->getrowvals));
  if (idx) {
    PetscInt j;
    /* consecutive entries of one row are 8 apart because slices are stored column-major */
    for (j = 0; j < a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift + 8 * j];
    *idx = a->getrowcols;
  }
  if (v) {
    PetscInt j;
    for (j = 0; j < a->rlen[row]; j++) a->getrowvals[j] = a->val[shift + 8 * j];
    *v = a->getrowvals;
  }
  PetscFunctionReturn(0);
}

/* Nothing to release: MatGetRow_SeqSELL() hands out matrix-owned scratch buffers */
PetscErrorCode MatRestoreRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v) {
  PetscFunctionBegin;
  PetscFunctionReturn(0);
}

/* Convert a SEQSELL matrix to SEQAIJ by copying it row by row */
PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat) {
  Mat          B;
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
  PetscInt     i;

  PetscFunctionBegin;
  if (reuse == MAT_REUSE_MATRIX) {
    B = *newmat;
    PetscCall(MatZeroEntries(B));
  } else {
    /* fresh AIJ matrix, preallocated with the exact SELL row lengths */
    PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
    PetscCall(MatSetSizes(B, A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N));
    PetscCall(MatSetType(B, MATSEQAIJ));
    PetscCall(MatSeqAIJSetPreallocation(B, 0, a->rlen));
  }

  for (i = 0; i < A->rmap->n; i++) {
    PetscInt     nz = 0, *cols = NULL;
    PetscScalar *vals = NULL;

    PetscCall(MatGetRow_SeqSELL(A, i, &nz, &cols, &vals));
    PetscCall(MatSetValues(B, 1, &i, nz, cols, vals, INSERT_VALUES));
    PetscCall(MatRestoreRow_SeqSELL(A, i, &nz, &cols, &vals));
  }

  PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
  B->rmap->bs = A->rmap->bs;

  if (reuse == MAT_INPLACE_MATRIX) {
    PetscCall(MatHeaderReplace(A, &B));
  } else {
    *newmat = B;
  }
  PetscFunctionReturn(0);
}

#include <../src/mat/impls/aij/seq/aij.h>

/* Convert a SEQAIJ matrix to SEQSELL by copying it row by row */
PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A, MatType newtype, MatReuse reuse, Mat *newmat) {
  Mat                B;
  Mat_SeqAIJ        *a  = (Mat_SeqAIJ *)A->data;
  PetscInt          *ai = a->i, m = A->rmap->N, n = A->cmap->N, i, *rowlengths, row, ncols;
  const PetscInt    *cols;
  const PetscScalar *vals;

  PetscFunctionBegin;

  if (reuse == MAT_REUSE_MATRIX) {
    B = *newmat;
  } else {
    /* derive per-row lengths from the AIJ row offsets; in debug builds also
       cross-check them against a->ilen when that array exists */
    if (PetscDefined(USE_DEBUG) || !a->ilen) {
      PetscCall(PetscMalloc1(m, &rowlengths));
      for (i = 0; i < m; i++) rowlengths[i] = ai[i + 1] - ai[i];
    }
    if (PetscDefined(USE_DEBUG) && a->ilen) {
      PetscBool eq;
      PetscCall(PetscMemcmp(rowlengths, a->ilen, m * sizeof(PetscInt), &eq));
      PetscCheck(eq, PETSC_COMM_SELF, PETSC_ERR_PLIB, "SeqAIJ ilen array incorrect");
      PetscCall(PetscFree(rowlengths));
      rowlengths = a->ilen;
    } else if (a->ilen) rowlengths = a->ilen;
    PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
    PetscCall(MatSetSizes(B, m, n, m, n));
    PetscCall(MatSetType(B, MATSEQSELL));
    PetscCall(MatSeqSELLSetPreallocation(B, 0, rowlengths));
    if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths));
  }

  for (row = 0; row < m; row++) {
    PetscCall(MatGetRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
    PetscCall(MatSetValues_SeqSELL(B, 1, &row, ncols, cols, vals, INSERT_VALUES));
    PetscCall(MatRestoreRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
  }
  PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
  B->rmap->bs = A->rmap->bs;

  if (reuse == MAT_INPLACE_MATRIX) {
    PetscCall(MatHeaderReplace(A, &B));
  } else {
    *newmat = B;
  }
  PetscFunctionReturn(0);
}

/* y = A*x; dispatches at compile time to AVX-512, AVX2+FMA, AVX, or scalar kernels */
PetscErrorCode MatMult_SeqSELL(Mat A, Vec xx, Vec yy) {
  Mat_SeqSELL       *a           = (Mat_SeqSELL *)A->data;
  PetscScalar       *y;
  const PetscScalar *x;
  const MatScalar   *aval        = a->val;
  PetscInt           totalslices = a->totalslices;
  const PetscInt    *acolidx     = a->colidx;
  PetscInt           i, j;
#if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
  __m512d  vec_x, vec_y, vec_vals;
  __m256i  vec_idx;
  __mmask8 mask;
  __m512d  vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
  __m256i  vec_idx2, vec_idx3, vec_idx4;
#elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
  __m128i   vec_idx;
  __m256d   vec_x, vec_y, vec_y2, vec_vals;
  MatScalar yval;
  PetscInt  r, rows_left, row, nnz_in_row;
#elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
  __m128d   vec_x_tmp;
  __m256d   vec_x, vec_y, vec_y2, vec_vals;
  MatScalar yval;
  PetscInt  r, rows_left, row, nnz_in_row;
#else
  PetscScalar sum[8];
#endif

#if defined(PETSC_HAVE_PRAGMA_DISJOINT)
#pragma disjoint(*x, *y, *aval)
#endif

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(yy, &y));
#if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
  for (i = 0; i < totalslices; i++) { /* loop over slices */
    PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
    PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);

    /* four independent accumulators to hide FMA latency; summed at the end */
    vec_y  = _mm512_setzero_pd();
    vec_y2 = _mm512_setzero_pd();
    vec_y3 = _mm512_setzero_pd();
    vec_y4 = _mm512_setzero_pd();

    j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */
    /* peel (width mod 4) slice columns so the main loop below can be unrolled by 4 */
    switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
    case 3:
      AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
      acolidx += 8;
      aval += 8;
      AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
      acolidx += 8;
      aval += 8;
      AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
      acolidx += 8;
      aval += 8;
      j += 3;
      break;
    case 2:
      AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
      acolidx += 8;
      aval += 8;
      AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
      acolidx += 8;
      aval += 8;
      j += 2;
      break;
    case 1:
      AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
      acolidx += 8;
      aval += 8;
      j += 1;
      break;
    }
#pragma novector
    for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
      AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
      acolidx += 8;
      aval += 8;
      AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
      acolidx += 8;
      aval += 8;
      AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
      acolidx += 8;
      aval += 8;
      AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
      acolidx += 8;
      aval += 8;
    }

    vec_y = _mm512_add_pd(vec_y, vec_y2);
    vec_y = _mm512_add_pd(vec_y, vec_y3);
    vec_y = _mm512_add_pd(vec_y, vec_y4);
    if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
      /* only the lanes of real rows are written back */
      mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
      _mm512_mask_storeu_pd(&y[8 * i], mask, vec_y);
    } else {
      _mm512_storeu_pd(&y[8 * i], vec_y);
    }
  }
#elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
  for (i = 0; i < totalslices; i++) { /* loop over full slices */
    PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
    PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);

    /* last slice may have padding rows. Don't use vectorization. */
    if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
      rows_left = A->rmap->n - 8 * i;
      for (r = 0; r < rows_left; ++r) {
        yval       = (MatScalar)0;
        row        = 8 * i + r;
        nnz_in_row = a->rlen[row];
        for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
        y[row] = yval;
      }
      break;
    }

    vec_y  = _mm256_setzero_pd();
    vec_y2 = _mm256_setzero_pd();

    /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
#pragma novector
#pragma unroll(2)
    for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
      AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
      aval += 4;
      acolidx += 4;
      AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y2);
      aval += 4;
      acolidx += 4;
    }

    _mm256_storeu_pd(y + i * 8, vec_y);
    _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
  }
#elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
  for (i = 0; i < totalslices; i++) { /* loop over full slices */
    PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
    PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);

    vec_y  = _mm256_setzero_pd();
    vec_y2 = _mm256_setzero_pd();

    /* last slice may have padding rows. Don't use vectorization.
 */
    if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
      rows_left = A->rmap->n - 8 * i;
      for (r = 0; r < rows_left; ++r) {
        yval       = (MatScalar)0;
        row        = 8 * i + r;
        nnz_in_row = a->rlen[row];
        for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
        y[row] = yval;
      }
      break;
    }

    /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
#pragma novector
#pragma unroll(2)
    for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
      /* AVX has no gather: assemble 4 x-values from two 128-bit halves */
      vec_vals  = _mm256_loadu_pd(aval);
      vec_x_tmp = _mm_setzero_pd();
      vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
      vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
      vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
      vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
      vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
      vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
      vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
      aval += 4;

      vec_vals  = _mm256_loadu_pd(aval);
      vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
      vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
      vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
      vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
      vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
      vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
      vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
      aval += 4;
    }

    _mm256_storeu_pd(y + i * 8, vec_y);
    _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
  }
#else
  /* portable scalar kernel: one running sum per row of the slice */
  for (i = 0; i < totalslices; i++) { /* loop over slices */
    for (j = 0; j < 8; j++) sum[j] = 0.0;
    for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
      sum[0] += aval[j] * x[acolidx[j]];
      sum[1] += aval[j + 1] * x[acolidx[j + 1]];
      sum[2] += aval[j + 2] * x[acolidx[j + 2]];
      sum[3] += aval[j + 3] * x[acolidx[j + 3]];
      sum[4] += aval[j + 4] * x[acolidx[j + 4]];
      sum[5] += aval[j + 5] * x[acolidx[j + 5]];
      sum[6] += aval[j + 6] * x[acolidx[j + 6]];
      sum[7] += aval[j + 7] * x[acolidx[j + 7]];
    }
    if (i == totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
      for (j = 0; j < (A->rmap->n & 0x07); j++) y[8 * i + j] = sum[j];
    } else {
      for (j = 0; j < 8; j++) y[8 * i + j] = sum[j];
    }
  }
#endif

  PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); /* theoretical minimal FLOPs */
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(yy, &y));
  PetscFunctionReturn(0);
}

#include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h>
/* z = y + A*x; structured like MatMult_SeqSELL() but seeds the accumulator with y */
PetscErrorCode MatMultAdd_SeqSELL(Mat A, Vec xx, Vec yy, Vec zz) {
  Mat_SeqSELL       *a           = (Mat_SeqSELL *)A->data;
  PetscScalar       *y, *z;
  const PetscScalar *x;
  const MatScalar   *aval        = a->val;
  PetscInt           totalslices = a->totalslices;
  const PetscInt    *acolidx     = a->colidx;
  PetscInt           i, j;
#if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
  __m512d  vec_x, vec_y, vec_vals;
  __m256i  vec_idx;
  __mmask8 mask;
  __m512d  vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
  __m256i  vec_idx2, vec_idx3, vec_idx4;
#elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
  __m128d   vec_x_tmp;
  __m256d   vec_x, vec_y, vec_y2, vec_vals;
  MatScalar yval;
  PetscInt  r, row, nnz_in_row;
#else
  PetscScalar sum[8];
#endif

#if defined(PETSC_HAVE_PRAGMA_DISJOINT)
#pragma disjoint(*x, *y, *aval)
#endif

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArrayPair(yy, zz, &y, &z));
#if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
  for (i = 0; i < totalslices; i++) { /* loop over slices */
    PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
    PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);

    if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
      mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
      /* NOTE(review): unmasked lanes of vec_y are merged from an uninitialized register;
         they appear harmless because the final store below is masked with the same mask --
         confirm no compiler/sanitizer issue */
      vec_y = _mm512_mask_loadu_pd(vec_y, mask, &y[8 * i]);
    } else {
      vec_y = _mm512_loadu_pd(&y[8 * i]);
    }
    vec_y2 = _mm512_setzero_pd();
    vec_y3 = _mm512_setzero_pd();
    vec_y4 = _mm512_setzero_pd();

    j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */
    /* peel (width mod 4) slice columns so the main loop below can be unrolled by 4 */
    switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
    case 3:
      AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
      acolidx += 8;
      aval += 8;
      AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
      acolidx += 8;
      aval += 8;
      AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
      acolidx += 8;
      aval += 8;
      j += 3;
      break;
    case 2:
      AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
      acolidx += 8;
      aval += 8;
      AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
      acolidx += 8;
      aval += 8;
      j += 2;
      break;
    case 1:
      AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
      acolidx += 8;
      aval += 8;
      j += 1;
      break;
    }
#pragma novector
    for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
      AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
      acolidx += 8;
      aval += 8;
      AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
      acolidx += 8;
      aval += 8;
      AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
      acolidx += 8;
      aval += 8;
      AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
      acolidx += 8;
      aval += 8;
    }

    vec_y = _mm512_add_pd(vec_y, vec_y2);
    vec_y = _mm512_add_pd(vec_y, vec_y3);
    vec_y = _mm512_add_pd(vec_y, vec_y4);
    if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
      _mm512_mask_storeu_pd(&z[8 * i], mask, vec_y);
    } else {
      _mm512_storeu_pd(&z[8 * i], vec_y);
    }
  }
#elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
  for (i = 0; i < totalslices; i++) { /* loop over full slices */
    PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
    PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);

    /* last slice may have padding rows. Don't use vectorization. */
    if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
      for (r = 0; r < (A->rmap->n & 0x07); ++r) {
        row        = 8 * i + r;
        yval       = (MatScalar)0.0;
        nnz_in_row = a->rlen[row];
        for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
        z[row] = y[row] + yval;
      }
      break;
    }

    vec_y  = _mm256_loadu_pd(y + 8 * i);
    vec_y2 = _mm256_loadu_pd(y + 8 * i + 4);

    /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
    for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
      vec_vals  = _mm256_loadu_pd(aval);
      vec_x_tmp = _mm_setzero_pd();
      vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
      vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
      vec_x     = _mm256_setzero_pd();
      vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
      vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
      vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
      vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
      vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
      aval += 4;

      vec_vals  = _mm256_loadu_pd(aval);
      vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
      vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
      vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
      vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
      vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
      vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
      vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
      aval += 4;
    }

    _mm256_storeu_pd(z + i * 8, vec_y);
    _mm256_storeu_pd(z + i * 8 + 4, vec_y2);
  }
#else
  /* portable scalar kernel */
  for (i = 0; i < totalslices; i++) { /* loop over slices */
    for (j = 0; j < 8; j++) sum[j] = 0.0;
    for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
      sum[0] += aval[j] * x[acolidx[j]];
      sum[1] += aval[j + 1] * x[acolidx[j + 1]];
      sum[2] += aval[j + 2] * x[acolidx[j + 2]];
      sum[3] += aval[j + 3] * x[acolidx[j + 3]];
      sum[4] += aval[j + 4] * x[acolidx[j + 4]];
      sum[5] += aval[j + 5] * x[acolidx[j + 5]];
      sum[6] += aval[j + 6] * x[acolidx[j + 6]];
      sum[7] += aval[j + 7] * x[acolidx[j + 7]];
    }
    if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
      for (j = 0; j < (A->rmap->n & 0x07); j++) z[8 * i + j] = y[8 * i + j] + sum[j];
    } else {
      for (j = 0; j < 8; j++) z[8 * i + j] = y[8 * i + j] + sum[j];
    }
  }
#endif

  PetscCall(PetscLogFlops(2.0 * a->nz));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArrayPair(yy, zz, &y, &z));
  PetscFunctionReturn(0);
}

/* yy = zz + A^T * xx (scatter-based; no SIMD variant) */
PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A, Vec xx, Vec zz, Vec yy) {
  Mat_SeqSELL       *a       = (Mat_SeqSELL *)A->data;
  PetscScalar       *y;
  const PetscScalar *x;
  const MatScalar   *aval    = a->val;
  const PetscInt    *acolidx = a->colidx;
  PetscInt           i, j, r, row, nnz_in_row, totalslices = a->totalslices;

#if defined(PETSC_HAVE_PRAGMA_DISJOINT)
#pragma disjoint(*x, *y, *aval)
#endif

  PetscFunctionBegin;
  /* for symmetric matrices A^T x == A x, so reuse the (possibly vectorized) forward kernel */
  if (A->symmetric == PETSC_BOOL3_TRUE) {
    PetscCall(MatMultAdd_SeqSELL(A, xx, zz, yy));
    PetscFunctionReturn(0);
  }
  if (zz
!= yy) PetscCall(VecCopy(zz, yy)); 697 PetscCall(VecGetArrayRead(xx, &x)); 698 PetscCall(VecGetArray(yy, &y)); 699 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 700 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 701 for (r = 0; r < (A->rmap->n & 0x07); ++r) { 702 row = 8 * i + r; 703 nnz_in_row = a->rlen[row]; 704 for (j = 0; j < nnz_in_row; ++j) y[acolidx[8 * j + r]] += aval[8 * j + r] * x[row]; 705 } 706 break; 707 } 708 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 709 y[acolidx[j]] += aval[j] * x[8 * i]; 710 y[acolidx[j + 1]] += aval[j + 1] * x[8 * i + 1]; 711 y[acolidx[j + 2]] += aval[j + 2] * x[8 * i + 2]; 712 y[acolidx[j + 3]] += aval[j + 3] * x[8 * i + 3]; 713 y[acolidx[j + 4]] += aval[j + 4] * x[8 * i + 4]; 714 y[acolidx[j + 5]] += aval[j + 5] * x[8 * i + 5]; 715 y[acolidx[j + 6]] += aval[j + 6] * x[8 * i + 6]; 716 y[acolidx[j + 7]] += aval[j + 7] * x[8 * i + 7]; 717 } 718 } 719 PetscCall(PetscLogFlops(2.0 * a->sliidx[a->totalslices])); 720 PetscCall(VecRestoreArrayRead(xx, &x)); 721 PetscCall(VecRestoreArray(yy, &y)); 722 PetscFunctionReturn(0); 723 } 724 725 PetscErrorCode MatMultTranspose_SeqSELL(Mat A, Vec xx, Vec yy) { 726 PetscFunctionBegin; 727 if (A->symmetric == PETSC_BOOL3_TRUE) { 728 PetscCall(MatMult_SeqSELL(A, xx, yy)); 729 } else { 730 PetscCall(VecSet(yy, 0.0)); 731 PetscCall(MatMultTransposeAdd_SeqSELL(A, xx, yy, yy)); 732 } 733 PetscFunctionReturn(0); 734 } 735 736 /* 737 Checks for missing diagonals 738 */ 739 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A, PetscBool *missing, PetscInt *d) { 740 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 741 PetscInt *diag, i; 742 743 PetscFunctionBegin; 744 *missing = PETSC_FALSE; 745 if (A->rmap->n > 0 && !(a->colidx)) { 746 *missing = PETSC_TRUE; 747 if (d) *d = 0; 748 PetscCall(PetscInfo(A, "Matrix has no entries therefore is missing diagonal\n")); 749 } else { 750 diag = a->diag; 751 for (i = 0; i < A->rmap->n; i++) { 752 if (diag[i] == -1) { 753 *missing = PETSC_TRUE; 754 
if (d) *d = i; 755 PetscCall(PetscInfo(A, "Matrix is missing diagonal number %" PetscInt_FMT "\n", i)); 756 break; 757 } 758 } 759 } 760 PetscFunctionReturn(0); 761 } 762 763 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A) { 764 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 765 PetscInt i, j, m = A->rmap->n, shift; 766 767 PetscFunctionBegin; 768 if (!a->diag) { 769 PetscCall(PetscMalloc1(m, &a->diag)); 770 PetscCall(PetscLogObjectMemory((PetscObject)A, m * sizeof(PetscInt))); 771 a->free_diag = PETSC_TRUE; 772 } 773 for (i = 0; i < m; i++) { /* loop over rows */ 774 shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */ 775 a->diag[i] = -1; 776 for (j = 0; j < a->rlen[i]; j++) { 777 if (a->colidx[shift + j * 8] == i) { 778 a->diag[i] = shift + j * 8; 779 break; 780 } 781 } 782 } 783 PetscFunctionReturn(0); 784 } 785 786 /* 787 Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways 788 */ 789 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A, PetscScalar omega, PetscScalar fshift) { 790 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 791 PetscInt i, *diag, m = A->rmap->n; 792 MatScalar *val = a->val; 793 PetscScalar *idiag, *mdiag; 794 795 PetscFunctionBegin; 796 if (a->idiagvalid) PetscFunctionReturn(0); 797 PetscCall(MatMarkDiagonal_SeqSELL(A)); 798 diag = a->diag; 799 if (!a->idiag) { 800 PetscCall(PetscMalloc3(m, &a->idiag, m, &a->mdiag, m, &a->ssor_work)); 801 PetscCall(PetscLogObjectMemory((PetscObject)A, 3 * m * sizeof(PetscScalar))); 802 val = a->val; 803 } 804 mdiag = a->mdiag; 805 idiag = a->idiag; 806 807 if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) { 808 for (i = 0; i < m; i++) { 809 mdiag[i] = val[diag[i]]; 810 if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */ 811 if (PetscRealPart(fshift)) { 812 PetscCall(PetscInfo(A, "Zero diagonal on row %" PetscInt_FMT "\n", i)); 813 A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 814 A->factorerror_zeropivot_value = 0.0; 815 A->factorerror_zeropivot_row = 
i; 816 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Zero diagonal on row %" PetscInt_FMT, i); 817 } 818 idiag[i] = 1.0 / val[diag[i]]; 819 } 820 PetscCall(PetscLogFlops(m)); 821 } else { 822 for (i = 0; i < m; i++) { 823 mdiag[i] = val[diag[i]]; 824 idiag[i] = omega / (fshift + val[diag[i]]); 825 } 826 PetscCall(PetscLogFlops(2.0 * m)); 827 } 828 a->idiagvalid = PETSC_TRUE; 829 PetscFunctionReturn(0); 830 } 831 832 PetscErrorCode MatZeroEntries_SeqSELL(Mat A) { 833 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 834 835 PetscFunctionBegin; 836 PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices])); 837 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 838 PetscFunctionReturn(0); 839 } 840 841 PetscErrorCode MatDestroy_SeqSELL(Mat A) { 842 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 843 844 PetscFunctionBegin; 845 #if defined(PETSC_USE_LOG) 846 PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->n, A->cmap->n, a->nz); 847 #endif 848 PetscCall(MatSeqXSELLFreeSELL(A, &a->val, &a->colidx)); 849 PetscCall(ISDestroy(&a->row)); 850 PetscCall(ISDestroy(&a->col)); 851 PetscCall(PetscFree(a->diag)); 852 PetscCall(PetscFree(a->rlen)); 853 PetscCall(PetscFree(a->sliidx)); 854 PetscCall(PetscFree3(a->idiag, a->mdiag, a->ssor_work)); 855 PetscCall(PetscFree(a->solve_work)); 856 PetscCall(ISDestroy(&a->icol)); 857 PetscCall(PetscFree(a->saved_values)); 858 PetscCall(PetscFree2(a->getrowcols, a->getrowvals)); 859 860 PetscCall(PetscFree(A->data)); 861 862 PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL)); 863 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL)); 864 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL)); 865 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetPreallocation_C", NULL)); 866 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetArray_C", NULL)); 867 
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLRestoreArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqaij_C", NULL));
  PetscFunctionReturn(0);
}

/* Translates MatOption settings into the Mat_SeqSELL flags; options with no SELL meaning are ignored with an info message. */
PetscErrorCode MatSetOption_SeqSELL(Mat A, MatOption op, PetscBool flg) {
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;

  PetscFunctionBegin;
  switch (op) {
  case MAT_ROW_ORIENTED: a->roworiented = flg; break;
  case MAT_KEEP_NONZERO_PATTERN: a->keepnonzeropattern = flg; break;
  /* nonew encodes the new-nonzero policy: 0 allow, 1 silently ignore, -1 error, -2 error on allocation */
  case MAT_NEW_NONZERO_LOCATIONS: a->nonew = (flg ? 0 : 1); break;
  case MAT_NEW_NONZERO_LOCATION_ERR: a->nonew = (flg ? -1 : 0); break;
  case MAT_NEW_NONZERO_ALLOCATION_ERR: a->nonew = (flg ? -2 : 0); break;
  case MAT_UNUSED_NONZERO_LOCATION_ERR: a->nounused = (flg ? -1 : 0); break;
  case MAT_FORCE_DIAGONAL_ENTRIES:
  case MAT_IGNORE_OFF_PROC_ENTRIES:
  case MAT_USE_HASH_TABLE:
  case MAT_SORTED_FULL: PetscCall(PetscInfo(A, "Option %s ignored\n", MatOptions[op])); break;
  case MAT_SPD:
  case MAT_SYMMETRIC:
  case MAT_STRUCTURALLY_SYMMETRIC:
  case MAT_HERMITIAN:
  case MAT_SYMMETRY_ETERNAL:
  case MAT_STRUCTURAL_SYMMETRY_ETERNAL:
  case MAT_SPD_ETERNAL:
    /* These options are handled directly by MatSetOption() */
    break;
  default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unknown option %d", op);
  }
  PetscFunctionReturn(0);
}

/*
  Extracts the diagonal into v. For LU/ILU-factored matrices the stored diagonal is the pivot,
  so its reciprocal is returned; otherwise the stored D_ii (0 if absent) is returned.
*/
PetscErrorCode MatGetDiagonal_SeqSELL(Mat A, Vec v) {
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
  PetscInt i, j, n, shift;
  PetscScalar *x, zero = 0.0;

  PetscFunctionBegin;
  PetscCall(VecGetLocalSize(v, &n));
  PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");

  if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) {
    PetscInt *diag = a->diag;
    PetscCall(VecGetArray(v, &x));
    for (i = 0; i < n; i++) x[i] = 1.0 / a->val[diag[i]]; /* factored storage keeps the pivot; invert it */
    PetscCall(VecRestoreArray(v, &x));
    PetscFunctionReturn(0);
  }

  PetscCall(VecSet(v, zero));
  PetscCall(VecGetArray(v, &x));
  for (i = 0; i < n; i++) {                 /* loop over rows */
    shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
    x[i] = 0;
    for (j = 0; j < a->rlen[i]; j++) {
      if (a->colidx[shift + j * 8] == i) { /* row entries are strided by the slice height 8 */
        x[i] = a->val[shift + j * 8];
        break;
      }
    }
  }
  PetscCall(VecRestoreArray(v, &x));
  PetscFunctionReturn(0);
}

/*
  Computes A = diag(ll) * A * diag(rr), either factor optional. The left scale walks each slice and
  picks l[row] from the in-slice row position; the right scale uses the stored column index directly.
  Padded rows of the final (partial) slice are skipped so no out-of-range l[]/r[] entry is read.
*/
PetscErrorCode MatDiagonalScale_SeqSELL(Mat A, Vec ll, Vec rr) {
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
  const PetscScalar *l, *r;
  PetscInt i, j, m, n, row;

  PetscFunctionBegin;
  if (ll) {
    /* The local size is used so that VecMPI can be passed to this routine
       by MatDiagonalScale_MPISELL */
    PetscCall(VecGetLocalSize(ll, &m));
    PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
    PetscCall(VecGetArrayRead(ll, &l));
    for (i = 0; i < a->totalslices; i++) {                  /* loop over slices */
      if (i == a->totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
        for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) {
          if (row < (A->rmap->n & 0x07)) a->val[j] *= l[8 * i + row]; /* skip padded rows */
        }
      } else {
        for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) a->val[j] *= l[8 * i + row];
      }
    }
    PetscCall(VecRestoreArrayRead(ll, &l));
    PetscCall(PetscLogFlops(a->nz));
  }
  if (rr) {
    PetscCall(VecGetLocalSize(rr, &n));
    PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
    PetscCall(VecGetArrayRead(rr, &r));
    for (i = 0; i < a->totalslices; i++) {                  /* loop over slices */
      if (i == a->totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
        for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) {
          if (row < (A->rmap->n & 0x07)) a->val[j] *= r[a->colidx[j]];
        }
      } else {
        for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j++) a->val[j] *= r[a->colidx[j]];
      }
    }
    PetscCall(VecRestoreArrayRead(rr, &r));
    PetscCall(PetscLogFlops(a->nz));
  }
  PetscCall(MatSeqSELLInvalidateDiagonal(A)); /* cached inverse diagonal is stale after scaling */
  PetscFunctionReturn(0);
}

/*
  Gathers values at (im[k], in[l]) into v (row-major over the requested index sets); a position that
  is not stored yields 0.0. Rows/columns are located via binary search narrowed to a short linear scan,
  mirroring the lookup in MatSetValues_SeqSELL. Negative indices are skipped.
*/
PetscErrorCode MatGetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[]) {
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
  PetscInt *cp, i, k, low, high, t, row, col, l;
  PetscInt shift;
  MatScalar *vp;

  PetscFunctionBegin;
  for (k = 0; k < m; k++) { /* loop over requested rows */
    row = im[k];
    if (row < 0) continue;
    PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
    shift = a->sliidx[row >> 3] + (row & 0x07); /* starting index of the row */
    cp = a->colidx + shift;                     /* pointer to the row */
    vp = a->val + shift;                        /* pointer to the row */
    for (l = 0; l < n; l++) {                   /* loop over requested columns */
      col = in[l];
      if (col < 0) continue;
      PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
      high = a->rlen[row];
      low = 0; /* assume unsorted */
      while (high - low > 5) { /* binary search down to a window of <= 5 entries */
        t = (low + high) / 2;
        if (*(cp + t * 8) > col) high = t;
        else low = t;
      }
      for (i = low; i < high; i++) {
        if (*(cp + 8 * i) > col) break; /* columns within a row are sorted; passed it */
        if (*(cp + 8 * i) == col) {
          *v++ = *(vp + 8 * i);
          goto finished;
        }
      }
      *v++ = 0.0; /* entry not stored */
    finished:;
    }
  }
  PetscFunctionReturn(0);
}

/* ASCII viewer for MATSEQSELL; the output layout is selected by the PetscViewerFormat. */
PetscErrorCode MatView_SeqSELL_ASCII(Mat A, PetscViewer viewer) {
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
  PetscInt i, j, m = A->rmap->n, shift;
  const char *name;
  PetscViewerFormat format;

  PetscFunctionBegin;
  PetscCall(PetscViewerGetFormat(viewer, &format));
  if (format == PETSC_VIEWER_ASCII_MATLAB) {
    /* emit a MATLAB script that rebuilds the matrix via spconvert() */
    PetscInt nofinalvalue = 0;
    /*
    if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) {
      nofinalvalue = 1;
    }
    */
    PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
    PetscCall(PetscViewerASCIIPrintf(viewer, "%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n", m, A->cmap->n));
    PetscCall(PetscViewerASCIIPrintf(viewer, "%% Nonzeros = %" PetscInt_FMT " \n", a->nz));
#if defined(PETSC_USE_COMPLEX)
    PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",4);\n", a->nz + nofinalvalue));
#else
    PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",3);\n", a->nz + nofinalvalue));
#endif
    PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = [\n"));

    for (i = 0; i < m; i++) {
      shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of row i in the slice storage */
      for (j = 0; j < a->rlen[i]; j++) {
        /* 1-based indices for MATLAB */
#if defined(PETSC_USE_COMPLEX)
        PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n", i + 1, a->colidx[shift + 8 * j] + 1, (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j])));
#else
        PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n", i + 1, a->colidx[shift + 8 * j] + 1, (double)a->val[shift + 8 * j]));
#endif
      }
    }
    /*
    if (nofinalvalue) {
#if defined(PETSC_USE_COMPLEX)
      PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n",m,A->cmap->n,0.,0.));
#else
      PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n",m,A->cmap->n,0.0));
#endif
    }
    */
    PetscCall(PetscObjectGetName((PetscObject)A, &name));
    PetscCall(PetscViewerASCIIPrintf(viewer, "];\n %s = spconvert(zzz);\n", name));
PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1064 } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) { 1065 PetscFunctionReturn(0); 1066 } else if (format == PETSC_VIEWER_ASCII_COMMON) { 1067 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1068 for (i = 0; i < m; i++) { 1069 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1070 shift = a->sliidx[i >> 3] + (i & 0x07); 1071 for (j = 0; j < a->rlen[i]; j++) { 1072 #if defined(PETSC_USE_COMPLEX) 1073 if (PetscImaginaryPart(a->val[shift + 8 * j]) > 0.0 && PetscRealPart(a->val[shift + 8 * j]) != 0.0) { 1074 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j]))); 1075 } else if (PetscImaginaryPart(a->val[shift + 8 * j]) < 0.0 && PetscRealPart(a->val[shift + 8 * j]) != 0.0) { 1076 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)-PetscImaginaryPart(a->val[shift + 8 * j]))); 1077 } else if (PetscRealPart(a->val[shift + 8 * j]) != 0.0) { 1078 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]))); 1079 } 1080 #else 1081 if (a->val[shift + 8 * j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)a->val[shift + 8 * j])); 1082 #endif 1083 } 1084 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1085 } 1086 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1087 } else if (format == PETSC_VIEWER_ASCII_DENSE) { 1088 PetscInt cnt = 0, jcnt; 1089 PetscScalar value; 1090 #if defined(PETSC_USE_COMPLEX) 1091 PetscBool realonly = PETSC_TRUE; 1092 for (i = 0; i < a->sliidx[a->totalslices]; i++) { 1093 if (PetscImaginaryPart(a->val[i]) != 0.0) { 1094 realonly 
= PETSC_FALSE; 1095 break; 1096 } 1097 } 1098 #endif 1099 1100 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1101 for (i = 0; i < m; i++) { 1102 jcnt = 0; 1103 shift = a->sliidx[i >> 3] + (i & 0x07); 1104 for (j = 0; j < A->cmap->n; j++) { 1105 if (jcnt < a->rlen[i] && j == a->colidx[shift + 8 * j]) { 1106 value = a->val[cnt++]; 1107 jcnt++; 1108 } else { 1109 value = 0.0; 1110 } 1111 #if defined(PETSC_USE_COMPLEX) 1112 if (realonly) { 1113 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)PetscRealPart(value))); 1114 } else { 1115 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e+%7.5e i ", (double)PetscRealPart(value), (double)PetscImaginaryPart(value))); 1116 } 1117 #else 1118 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)value)); 1119 #endif 1120 } 1121 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1122 } 1123 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1124 } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) { 1125 PetscInt fshift = 1; 1126 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1127 #if defined(PETSC_USE_COMPLEX) 1128 PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate complex general\n")); 1129 #else 1130 PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate real general\n")); 1131 #endif 1132 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz)); 1133 for (i = 0; i < m; i++) { 1134 shift = a->sliidx[i >> 3] + (i & 0x07); 1135 for (j = 0; j < a->rlen[i]; j++) { 1136 #if defined(PETSC_USE_COMPLEX) 1137 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n", i + fshift, a->colidx[shift + 8 * j] + fshift, (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j]))); 1138 #else 1139 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g\n", i + fshift, a->colidx[shift + 8 * j] + fshift, 
(double)a->val[shift + 8 * j])); 1140 #endif 1141 } 1142 } 1143 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1144 } else if (format == PETSC_VIEWER_NATIVE) { 1145 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 1146 PetscInt row; 1147 PetscCall(PetscViewerASCIIPrintf(viewer, "slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n", i, a->sliidx[i], a->sliidx[i + 1])); 1148 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) { 1149 #if defined(PETSC_USE_COMPLEX) 1150 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1151 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1152 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1153 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), -(double)PetscImaginaryPart(a->val[j]))); 1154 } else { 1155 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]))); 1156 } 1157 #else 1158 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", 8 * i + row, a->colidx[j], (double)a->val[j])); 1159 #endif 1160 } 1161 } 1162 } else { 1163 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1164 if (A->factortype) { 1165 for (i = 0; i < m; i++) { 1166 shift = a->sliidx[i >> 3] + (i & 0x07); 1167 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1168 /* L part */ 1169 for (j = shift; j < a->diag[i]; j += 8) { 1170 #if defined(PETSC_USE_COMPLEX) 1171 if (PetscImaginaryPart(a->val[shift + 8 * j]) > 0.0) { 1172 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1173 } else if 
(PetscImaginaryPart(a->val[shift + 8 * j]) < 0.0) { 1174 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j])))); 1175 } else { 1176 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j]))); 1177 } 1178 #else 1179 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j])); 1180 #endif 1181 } 1182 /* diagonal */ 1183 j = a->diag[i]; 1184 #if defined(PETSC_USE_COMPLEX) 1185 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1186 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)PetscImaginaryPart(1.0 / a->val[j]))); 1187 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1188 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)(-PetscImaginaryPart(1.0 / a->val[j])))); 1189 } else { 1190 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]))); 1191 } 1192 #else 1193 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)(1.0 / a->val[j]))); 1194 #endif 1195 1196 /* U part */ 1197 for (j = a->diag[i] + 1; j < shift + 8 * a->rlen[i]; j += 8) { 1198 #if defined(PETSC_USE_COMPLEX) 1199 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1200 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1201 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1202 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j])))); 1203 } else { 1204 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" 
PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j]))); 1205 } 1206 #else 1207 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j])); 1208 #endif 1209 } 1210 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1211 } 1212 } else { 1213 for (i = 0; i < m; i++) { 1214 shift = a->sliidx[i >> 3] + (i & 0x07); 1215 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1216 for (j = 0; j < a->rlen[i]; j++) { 1217 #if defined(PETSC_USE_COMPLEX) 1218 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1219 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j]))); 1220 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1221 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)-PetscImaginaryPart(a->val[shift + 8 * j]))); 1222 } else { 1223 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]))); 1224 } 1225 #else 1226 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)a->val[shift + 8 * j])); 1227 #endif 1228 } 1229 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1230 } 1231 } 1232 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1233 } 1234 PetscCall(PetscViewerFlush(viewer)); 1235 PetscFunctionReturn(0); 1236 } 1237 1238 #include <petscdraw.h> 1239 PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw, void *Aa) { 1240 Mat A = (Mat)Aa; 1241 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1242 PetscInt i, j, m = A->rmap->n, shift; 1243 int color; 1244 PetscReal xl, yl, xr, yr, x_l, x_r, y_l, y_r; 1245 PetscViewer viewer; 1246 PetscViewerFormat format; 1247 1248 PetscFunctionBegin; 1249 
PetscCall(PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer)); 1250 PetscCall(PetscViewerGetFormat(viewer, &format)); 1251 PetscCall(PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr)); 1252 1253 /* loop over matrix elements drawing boxes */ 1254 1255 if (format != PETSC_VIEWER_DRAW_CONTOUR) { 1256 PetscDrawCollectiveBegin(draw); 1257 /* Blue for negative, Cyan for zero and Red for positive */ 1258 color = PETSC_DRAW_BLUE; 1259 for (i = 0; i < m; i++) { 1260 shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */ 1261 y_l = m - i - 1.0; 1262 y_r = y_l + 1.0; 1263 for (j = 0; j < a->rlen[i]; j++) { 1264 x_l = a->colidx[shift + j * 8]; 1265 x_r = x_l + 1.0; 1266 if (PetscRealPart(a->val[shift + 8 * j]) >= 0.) continue; 1267 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1268 } 1269 } 1270 color = PETSC_DRAW_CYAN; 1271 for (i = 0; i < m; i++) { 1272 shift = a->sliidx[i >> 3] + (i & 0x07); 1273 y_l = m - i - 1.0; 1274 y_r = y_l + 1.0; 1275 for (j = 0; j < a->rlen[i]; j++) { 1276 x_l = a->colidx[shift + j * 8]; 1277 x_r = x_l + 1.0; 1278 if (a->val[shift + 8 * j] != 0.) continue; 1279 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1280 } 1281 } 1282 color = PETSC_DRAW_RED; 1283 for (i = 0; i < m; i++) { 1284 shift = a->sliidx[i >> 3] + (i & 0x07); 1285 y_l = m - i - 1.0; 1286 y_r = y_l + 1.0; 1287 for (j = 0; j < a->rlen[i]; j++) { 1288 x_l = a->colidx[shift + j * 8]; 1289 x_r = x_l + 1.0; 1290 if (PetscRealPart(a->val[shift + 8 * j]) <= 0.) 
continue; 1291 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1292 } 1293 } 1294 PetscDrawCollectiveEnd(draw); 1295 } else { 1296 /* use contour shading to indicate magnitude of values */ 1297 /* first determine max of all nonzero values */ 1298 PetscReal minv = 0.0, maxv = 0.0; 1299 PetscInt count = 0; 1300 PetscDraw popup; 1301 for (i = 0; i < a->sliidx[a->totalslices]; i++) { 1302 if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]); 1303 } 1304 if (minv >= maxv) maxv = minv + PETSC_SMALL; 1305 PetscCall(PetscDrawGetPopup(draw, &popup)); 1306 PetscCall(PetscDrawScalePopup(popup, minv, maxv)); 1307 1308 PetscDrawCollectiveBegin(draw); 1309 for (i = 0; i < m; i++) { 1310 shift = a->sliidx[i >> 3] + (i & 0x07); 1311 y_l = m - i - 1.0; 1312 y_r = y_l + 1.0; 1313 for (j = 0; j < a->rlen[i]; j++) { 1314 x_l = a->colidx[shift + j * 8]; 1315 x_r = x_l + 1.0; 1316 color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]), minv, maxv); 1317 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1318 count++; 1319 } 1320 } 1321 PetscDrawCollectiveEnd(draw); 1322 } 1323 PetscFunctionReturn(0); 1324 } 1325 1326 #include <petscdraw.h> 1327 PetscErrorCode MatView_SeqSELL_Draw(Mat A, PetscViewer viewer) { 1328 PetscDraw draw; 1329 PetscReal xr, yr, xl, yl, h, w; 1330 PetscBool isnull; 1331 1332 PetscFunctionBegin; 1333 PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw)); 1334 PetscCall(PetscDrawIsNull(draw, &isnull)); 1335 if (isnull) PetscFunctionReturn(0); 1336 1337 xr = A->cmap->n; 1338 yr = A->rmap->n; 1339 h = yr / 10.0; 1340 w = xr / 10.0; 1341 xr += w; 1342 yr += h; 1343 xl = -w; 1344 yl = -h; 1345 PetscCall(PetscDrawSetCoordinates(draw, xl, yl, xr, yr)); 1346 PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer)); 1347 PetscCall(PetscDrawZoom(draw, MatView_SeqSELL_Draw_Zoom, A)); 1348 PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL)); 1349 
  PetscCall(PetscDrawSave(draw));
  PetscFunctionReturn(0);
}

/* Dispatches MatView to the ASCII, binary (not yet implemented), or draw back end. */
PetscErrorCode MatView_SeqSELL(Mat A, PetscViewer viewer) {
  PetscBool iascii, isbinary, isdraw;

  PetscFunctionBegin;
  PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
  PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
  PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
  if (iascii) {
    PetscCall(MatView_SeqSELL_ASCII(A, viewer));
  } else if (isbinary) {
    /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */
  } else if (isdraw) PetscCall(MatView_SeqSELL_Draw(A, viewer));
  PetscFunctionReturn(0);
}

/*
  Finalizes assembly: marks diagonals, logs storage statistics, and fills the padding slots of every
  slice (column index = nearest valid column so gathers stay in range, value = 0) so the SpMV kernels
  can run unmasked.
*/
PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A, MatAssemblyType mode) {
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
  PetscInt i, shift, row_in_slice, row, nrow, *cp, lastcol, j, k;
  MatScalar *vp;

  PetscFunctionBegin;
  if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
  /* To do: compress out the unused elements */
  PetscCall(MatMarkDiagonal_SeqSELL(A));
  PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n", A->rmap->n, A->cmap->n, a->maxallocmat, a->sliidx[a->totalslices], a->nz, a->sliidx[a->totalslices] - a->nz));
  PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n", a->reallocs));
  PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rlenmax));
  /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero.
     This allows for a use of unmasked intrinsics -> higher performance */
  for (i = 0; i < a->totalslices; ++i) {
    shift = a->sliidx[i];                                    /* starting index of the slice */
    cp = a->colidx + shift;                                  /* pointer to the column indices of the slice */
    vp = a->val + shift;                                     /* pointer to the nonzero values of the slice */
    for (row_in_slice = 0; row_in_slice < 8; ++row_in_slice) { /* loop over rows in the slice */
      row = 8 * i + row_in_slice;
      /* NOTE(review): for a padded last slice, row can exceed rmap->n-1, so this assumes a->rlen has
         8*totalslices entries (padded rows with rlen 0) -- confirm against the preallocation code */
      nrow = a->rlen[row]; /* number of nonzeros in row */
      /*
        Search for the nearest nonzero. Normally setting the index to zero may cause extra communication.
        But if the entire slice are empty, it is fine to use 0 since the index will not be loaded.
      */
      lastcol = 0;
      if (nrow > 0) { /* nonempty row */
        lastcol = cp[8 * (nrow - 1) + row_in_slice]; /* use the index from the last nonzero at current row */
      } else if (!row_in_slice) { /* first row of the current slice is empty */
        for (j = 1; j < 8; j++) {
          if (a->rlen[8 * i + j]) {
            lastcol = cp[j];
            break;
          }
        }
      } else {
        if (a->sliidx[i + 1] != shift) lastcol = cp[row_in_slice - 1]; /* use the index from the previous row */
      }

      for (k = nrow; k < (a->sliidx[i + 1] - shift) / 8; ++k) {
        cp[8 * k + row_in_slice] = lastcol;
        vp[8 * k + row_in_slice] = (MatScalar)0;
      }
    }
  }

  A->info.mallocs += a->reallocs;
  a->reallocs = 0;

  PetscCall(MatSeqSELLInvalidateDiagonal(A));
  PetscFunctionReturn(0);
}

/* Reports storage statistics; nz_used counts stored slots including padding zeros. */
PetscErrorCode MatGetInfo_SeqSELL(Mat A, MatInfoType flag, MatInfo *info) {
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;

  PetscFunctionBegin;
  info->block_size = 1.0;
  info->nz_allocated = a->maxallocmat;
  info->nz_used = a->sliidx[a->totalslices]; /* include padding zeros */
  info->nz_unneeded = (a->maxallocmat - a->sliidx[a->totalslices]);
  info->assemblies = A->num_ass;
  info->mallocs = A->info.mallocs;
  info->memory = ((PetscObject)A)->mem;
  if (A->factortype) {
    info->fill_ratio_given = A->info.fill_ratio_given;
    info->fill_ratio_needed = A->info.fill_ratio_needed;
    info->factor_mallocs = A->info.factor_mallocs;
  } else {
    info->fill_ratio_given = 0;
    info->fill_ratio_needed = 0;
    info->factor_mallocs = 0;
  }
  PetscFunctionReturn(0);
}

/*
  Inserts/adds the m x n logically dense block v at rows im[] and columns in[]. Each row is searched
  (binary search narrowed to a linear scan over the 8-strided row entries); missing entries are inserted
  in sorted column order, growing the slice via MatSeqXSELLReallocateSELL when the row is full.
  Negative row/column indices are skipped. The nonew policy set via MatSetOption is honored.
*/
PetscErrorCode MatSetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is) {
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
  PetscInt shift, i, k, l, low, high, t, ii, row, col, nrow;
  PetscInt *cp, nonew = a->nonew, lastcol = -1;
  MatScalar *vp, value;

  PetscFunctionBegin;
  for (k = 0; k < m; k++) { /* loop over added rows */
    row = im[k];
    if (row < 0) continue;
    PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
    shift = a->sliidx[row >> 3] + (row & 0x07); /* starting index of the row */
    cp = a->colidx + shift;                     /* pointer to the row */
    vp = a->val + shift;                        /* pointer to the row */
    nrow = a->rlen[row];
    low = 0;
    high = nrow;

    for (l = 0; l < n; l++) { /* loop over added columns */
      col = in[l];
      if (col < 0) continue;
      PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
      if (a->roworiented) {
        value = v[l + k * n];
      } else {
        value = v[k + l * m];
      }
      if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue;

      /* search in this row for the specified column, i indicates the column to be set */
      if (col <= lastcol) low = 0; /* columns not monotonically increasing: restart search window */
      else high = nrow;
      lastcol = col;
      while (high - low > 5) {
        t = (low + high) / 2;
        if (*(cp + t * 8) > col) high = t;
        else low = t;
      }
      for (i = low; i < high; i++) {
        if (*(cp + i * 8) > col) break;
        if (*(cp + i * 8) == col) { /* entry already present: update in place */
          if (is == ADD_VALUES) *(vp + i * 8) += value;
          else *(vp + i * 8) = value;
          low = i + 1;
          goto noinsert;
        }
      }
      if (value == 0.0 && a->ignorezeroentries) goto noinsert;
      if (nonew == 1) goto noinsert; /* new nonzeros silently ignored */
      PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col);
      /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */
      MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, row / 8, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar);
      /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */
      for (ii = nrow - 1; ii >= i; ii--) {
        *(cp + (ii + 1) * 8) = *(cp + ii * 8);
        *(vp + (ii + 1) * 8) = *(vp + ii * 8);
      }
      a->rlen[row]++;
      *(cp + i * 8) = col;
      *(vp + i * 8) = value;
      a->nz++;
      A->nonzerostate++;
      low = i + 1;
      high++;
      nrow++;
    noinsert:;
    }
    a->rlen[row] = nrow;
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatCopy_SeqSELL(Mat A, Mat B, MatStructure str) {
  PetscFunctionBegin;
  /* If the two matrices have the same copy implementation, use fast copy. */
  if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
    Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
    Mat_SeqSELL *b = (Mat_SeqSELL *)B->data;

    PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices], PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Number of nonzeros in two matrices are different");
    PetscCall(PetscArraycpy(b->val, a->val, a->sliidx[a->totalslices]));
  } else {
    PetscCall(MatCopy_Basic(A, B, str));
  }
  PetscFunctionReturn(0);
}

/* Default setup: preallocate with PETSc-controlled dynamic sizing. */
PetscErrorCode MatSetUp_SeqSELL(Mat A) {
  PetscFunctionBegin;
  PetscCall(MatSeqSELLSetPreallocation(A, PETSC_DEFAULT, NULL));
  PetscFunctionReturn(0);
}

/* Exposes the internal value array (no copy); pair with MatSeqSELLRestoreArray. */
PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A, PetscScalar *array[]) {
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;

  PetscFunctionBegin;
  *array = a->val;
  PetscFunctionReturn(0);
}

/* Nothing to do: GetArray hands out the live array, so restore is a no-op. */
PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A, PetscScalar *array[]) {
  PetscFunctionBegin;
  PetscFunctionReturn(0);
}

/* Replaces every stored value by its real part (in place, padding slots included). */
PetscErrorCode MatRealPart_SeqSELL(Mat A) {
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
  PetscInt i;
  MatScalar *aval = a->val;

  PetscFunctionBegin;
  for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscRealPart(aval[i]);
  PetscFunctionReturn(0);
}

/* Replaces every stored value by its imaginary part (in place, padding slots included). */
PetscErrorCode MatImaginaryPart_SeqSELL(Mat A) {
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
  PetscInt i;
  MatScalar *aval = a->val;

  PetscFunctionBegin;
  for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]);
  PetscCall(MatSeqSELLInvalidateDiagonal(A));
  PetscFunctionReturn(0);
}

/* Scales all stored values (padding zeros included, harmlessly) by alpha using BLAS scal. */
PetscErrorCode MatScale_SeqSELL(Mat inA, PetscScalar alpha) {
  Mat_SeqSELL *a = (Mat_SeqSELL *)inA->data;
  MatScalar *aval = a->val;
  PetscScalar oalpha = alpha;
  PetscBLASInt one = 1, size;

  PetscFunctionBegin;
  PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices], &size));
  PetscCallBLAS("BLASscal", BLASscal_(&size, &oalpha, aval, &one));
  PetscCall(PetscLogFlops(a->nz));
  PetscCall(MatSeqSELLInvalidateDiagonal(inA));
  PetscFunctionReturn(0);
}

/* Y = Y + a*I; preallocates one nonzero per row first if Y has no storage yet. */
PetscErrorCode MatShift_SeqSELL(Mat Y, PetscScalar a) {
  Mat_SeqSELL *y = (Mat_SeqSELL *)Y->data;

  PetscFunctionBegin;
  if (!Y->preallocated || !y->nz) PetscCall(MatSeqSELLSetPreallocation(Y, 1, NULL));
  PetscCall(MatShift_Basic(Y, a));
  PetscFunctionReturn(0);
}

/*
  (S)SOR relaxation sweeps for the SELL format. idiag[] caches omega/(fshift + D_ii) (recomputed when
  omega or fshift change); t (= a->ssor_work) checkpoints the lower-triangular application so a
  following backward sweep only needs the upper triangle. SOR_APPLY_UPPER/LOWER and Eisenstat are
  not supported. Entries of row i sit at diag/shift offsets strided by 8 (the slice height).
*/
PetscErrorCode MatSOR_SeqSELL(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx) {
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
  PetscScalar *x, sum, *t;
  const MatScalar *idiag = NULL, *mdiag;
  const PetscScalar *b, *xb;
  PetscInt n, m = A->rmap->n, i, j, shift;
  const PetscInt *diag;

  PetscFunctionBegin;
  its = its * lits;

  if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */
  if (!a->idiagvalid) PetscCall(MatInvertDiagonal_SeqSELL(A, omega, fshift));
  a->fshift = fshift;
  a->omega = omega;

  diag = a->diag;
  t = a->ssor_work;
  idiag = a->idiag;
  mdiag = a->mdiag;

  PetscCall(VecGetArray(xx, &x));
  PetscCall(VecGetArrayRead(bb, &b));
  /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
  PetscCheck(flag != SOR_APPLY_UPPER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_UPPER is not implemented");
  PetscCheck(flag != SOR_APPLY_LOWER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_LOWER is not implemented");
  PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat");

  if (flag & SOR_ZERO_INITIAL_GUESS) {
    if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
      for (i = 0; i < m; i++) {
        shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
        sum = b[i];
        n = (diag[i] - shift) / 8; /* number of entries strictly below the diagonal */
        for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]];
        t[i] = sum; /* checkpoint b - L*x for later sweeps */
        x[i] = sum * idiag[i];
      }
      xb = t;
      PetscCall(PetscLogFlops(a->nz));
    } else xb = b;
    if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
      for (i = m - 1; i >= 0; i--) {
        shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
        sum = xb[i];
        n = a->rlen[i] - (diag[i] - shift) / 8 - 1; /* number of entries strictly above the diagonal */
        for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]];
        if (xb == b) {
          x[i] = sum * idiag[i];
        } else {
          x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
        }
      }
      PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
    }
    its--;
  }
  while (its--) {
    if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
      for (i = 0; i < m; i++) {
        /* lower */
        shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
        sum = b[i];
        n = (diag[i] - shift) / 8;
        for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]];
        t[i] = sum; /* save application of the lower-triangular part */
        /* upper */
        n = a->rlen[i] - (diag[i] - shift) / 8 - 1;
        for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]];
        x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
      }
      xb = t;
      PetscCall(PetscLogFlops(2.0 * a->nz));
    } else xb = b;
    if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
      for (i = m - 1; i >= 0; i--) {
        shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
        sum = xb[i];
        if (xb == b) {
          /* whole matrix (no checkpointing available) */
          n = a->rlen[i];
          for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]];
          x[i] = (1. - omega) * x[i] + (sum + mdiag[i] * x[i]) * idiag[i]; /* add back D_ii*x_i since the full row was subtracted */
        } else { /* lower-triangular part has been saved, so only apply upper-triangular */
          n = a->rlen[i] - (diag[i] - shift) / 8 - 1;
          for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]];
          x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
        }
      }
      if (xb == b) {
        PetscCall(PetscLogFlops(2.0 * a->nz));
      } else {
        PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
      }
    }
  }
  PetscCall(VecRestoreArray(xx, &x));
  PetscCall(VecRestoreArrayRead(bb, &b));
  PetscFunctionReturn(0);
}

/* -------------------------------------------------------------------*/
static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL,
                                       MatGetRow_SeqSELL,
                                       MatRestoreRow_SeqSELL,
                                       MatMult_SeqSELL,
                                       /* 4*/ MatMultAdd_SeqSELL,
                                       MatMultTranspose_SeqSELL,
                                       MatMultTransposeAdd_SeqSELL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 10*/ NULL,
                                       NULL,
                                       NULL,
                                       MatSOR_SeqSELL,
                                       NULL,
                                       /* 15*/ MatGetInfo_SeqSELL,
                                       MatEqual_SeqSELL,
                                       MatGetDiagonal_SeqSELL,
                                       MatDiagonalScale_SeqSELL,
                                       NULL,
                                       /* 20*/ NULL,
                                       MatAssemblyEnd_SeqSELL,
                                       MatSetOption_SeqSELL,
                                       MatZeroEntries_SeqSELL,
                                       /* 24*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 29*/ MatSetUp_SeqSELL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
1729 /* 34*/ MatDuplicate_SeqSELL, 1730 NULL, 1731 NULL, 1732 NULL, 1733 NULL, 1734 /* 39*/ NULL, 1735 NULL, 1736 NULL, 1737 MatGetValues_SeqSELL, 1738 MatCopy_SeqSELL, 1739 /* 44*/ NULL, 1740 MatScale_SeqSELL, 1741 MatShift_SeqSELL, 1742 NULL, 1743 NULL, 1744 /* 49*/ NULL, 1745 NULL, 1746 NULL, 1747 NULL, 1748 NULL, 1749 /* 54*/ MatFDColoringCreate_SeqXAIJ, 1750 NULL, 1751 NULL, 1752 NULL, 1753 NULL, 1754 /* 59*/ NULL, 1755 MatDestroy_SeqSELL, 1756 MatView_SeqSELL, 1757 NULL, 1758 NULL, 1759 /* 64*/ NULL, 1760 NULL, 1761 NULL, 1762 NULL, 1763 NULL, 1764 /* 69*/ NULL, 1765 NULL, 1766 NULL, 1767 NULL, 1768 NULL, 1769 /* 74*/ NULL, 1770 MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */ 1771 NULL, 1772 NULL, 1773 NULL, 1774 /* 79*/ NULL, 1775 NULL, 1776 NULL, 1777 NULL, 1778 NULL, 1779 /* 84*/ NULL, 1780 NULL, 1781 NULL, 1782 NULL, 1783 NULL, 1784 /* 89*/ NULL, 1785 NULL, 1786 NULL, 1787 NULL, 1788 NULL, 1789 /* 94*/ NULL, 1790 NULL, 1791 NULL, 1792 NULL, 1793 NULL, 1794 /* 99*/ NULL, 1795 NULL, 1796 NULL, 1797 MatConjugate_SeqSELL, 1798 NULL, 1799 /*104*/ NULL, 1800 NULL, 1801 NULL, 1802 NULL, 1803 NULL, 1804 /*109*/ NULL, 1805 NULL, 1806 NULL, 1807 NULL, 1808 MatMissingDiagonal_SeqSELL, 1809 /*114*/ NULL, 1810 NULL, 1811 NULL, 1812 NULL, 1813 NULL, 1814 /*119*/ NULL, 1815 NULL, 1816 NULL, 1817 NULL, 1818 NULL, 1819 /*124*/ NULL, 1820 NULL, 1821 NULL, 1822 NULL, 1823 NULL, 1824 /*129*/ NULL, 1825 NULL, 1826 NULL, 1827 NULL, 1828 NULL, 1829 /*134*/ NULL, 1830 NULL, 1831 NULL, 1832 NULL, 1833 NULL, 1834 /*139*/ NULL, 1835 NULL, 1836 NULL, 1837 MatFDColoringSetUp_SeqXAIJ, 1838 NULL, 1839 /*144*/ NULL, 1840 NULL, 1841 NULL, 1842 NULL, 1843 NULL, 1844 NULL, 1845 /*150*/ NULL}; 1846 1847 PetscErrorCode MatStoreValues_SeqSELL(Mat mat) { 1848 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1849 1850 PetscFunctionBegin; 1851 PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first"); 1852 1853 
/* allocate space for values if not already there */ 1854 if (!a->saved_values) { 1855 PetscCall(PetscMalloc1(a->sliidx[a->totalslices] + 1, &a->saved_values)); 1856 PetscCall(PetscLogObjectMemory((PetscObject)mat, (a->sliidx[a->totalslices] + 1) * sizeof(PetscScalar))); 1857 } 1858 1859 /* copy values over */ 1860 PetscCall(PetscArraycpy(a->saved_values, a->val, a->sliidx[a->totalslices])); 1861 PetscFunctionReturn(0); 1862 } 1863 1864 PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat) { 1865 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1866 1867 PetscFunctionBegin; 1868 PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first"); 1869 PetscCheck(a->saved_values, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatStoreValues(A);first"); 1870 PetscCall(PetscArraycpy(a->val, a->saved_values, a->sliidx[a->totalslices])); 1871 PetscFunctionReturn(0); 1872 } 1873 1874 /*@C 1875 MatSeqSELLRestoreArray - returns access to the array where the data for a `MATSEQSELL` matrix is stored obtained by `MatSeqSELLGetArray()` 1876 1877 Not Collective 1878 1879 Input Parameters: 1880 . mat - a `MATSEQSELL` matrix 1881 . 
array - pointer to the data 1882 1883 Level: intermediate 1884 1885 .seealso: `MATSEQSELL`, `MatSeqSELLGetArray()`, `MatSeqSELLRestoreArrayF90()` 1886 @*/ 1887 PetscErrorCode MatSeqSELLRestoreArray(Mat A, PetscScalar **array) { 1888 PetscFunctionBegin; 1889 PetscUseMethod(A, "MatSeqSELLRestoreArray_C", (Mat, PetscScalar **), (A, array)); 1890 PetscFunctionReturn(0); 1891 } 1892 1893 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B) { 1894 Mat_SeqSELL *b; 1895 PetscMPIInt size; 1896 1897 PetscFunctionBegin; 1898 PetscCall(PetscCitationsRegister(citation, &cited)); 1899 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size)); 1900 PetscCheck(size <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Comm must be of size 1"); 1901 1902 PetscCall(PetscNewLog(B, &b)); 1903 1904 B->data = (void *)b; 1905 1906 PetscCall(PetscMemcpy(B->ops, &MatOps_Values, sizeof(struct _MatOps))); 1907 1908 b->row = NULL; 1909 b->col = NULL; 1910 b->icol = NULL; 1911 b->reallocs = 0; 1912 b->ignorezeroentries = PETSC_FALSE; 1913 b->roworiented = PETSC_TRUE; 1914 b->nonew = 0; 1915 b->diag = NULL; 1916 b->solve_work = NULL; 1917 B->spptr = NULL; 1918 b->saved_values = NULL; 1919 b->idiag = NULL; 1920 b->mdiag = NULL; 1921 b->ssor_work = NULL; 1922 b->omega = 1.0; 1923 b->fshift = 0.0; 1924 b->idiagvalid = PETSC_FALSE; 1925 b->keepnonzeropattern = PETSC_FALSE; 1926 1927 PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELL)); 1928 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetArray_C", MatSeqSELLGetArray_SeqSELL)); 1929 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLRestoreArray_C", MatSeqSELLRestoreArray_SeqSELL)); 1930 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqSELL)); 1931 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqSELL)); 1932 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetPreallocation_C", 
MatSeqSELLSetPreallocation_SeqSELL)); 1933 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqaij_C", MatConvert_SeqSELL_SeqAIJ)); 1934 PetscFunctionReturn(0); 1935 } 1936 1937 /* 1938 Given a matrix generated with MatGetFactor() duplicates all the information in A into B 1939 */ 1940 PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace) { 1941 Mat_SeqSELL *c = (Mat_SeqSELL *)C->data, *a = (Mat_SeqSELL *)A->data; 1942 PetscInt i, m = A->rmap->n; 1943 PetscInt totalslices = a->totalslices; 1944 1945 PetscFunctionBegin; 1946 C->factortype = A->factortype; 1947 c->row = NULL; 1948 c->col = NULL; 1949 c->icol = NULL; 1950 c->reallocs = 0; 1951 C->assembled = PETSC_TRUE; 1952 1953 PetscCall(PetscLayoutReference(A->rmap, &C->rmap)); 1954 PetscCall(PetscLayoutReference(A->cmap, &C->cmap)); 1955 1956 PetscCall(PetscMalloc1(8 * totalslices, &c->rlen)); 1957 PetscCall(PetscLogObjectMemory((PetscObject)C, m * sizeof(PetscInt))); 1958 PetscCall(PetscMalloc1(totalslices + 1, &c->sliidx)); 1959 PetscCall(PetscLogObjectMemory((PetscObject)C, (totalslices + 1) * sizeof(PetscInt))); 1960 1961 for (i = 0; i < m; i++) c->rlen[i] = a->rlen[i]; 1962 for (i = 0; i < totalslices + 1; i++) c->sliidx[i] = a->sliidx[i]; 1963 1964 /* allocate the matrix space */ 1965 if (mallocmatspace) { 1966 PetscCall(PetscMalloc2(a->maxallocmat, &c->val, a->maxallocmat, &c->colidx)); 1967 PetscCall(PetscLogObjectMemory((PetscObject)C, a->maxallocmat * (sizeof(PetscScalar) + sizeof(PetscInt)))); 1968 1969 c->singlemalloc = PETSC_TRUE; 1970 1971 if (m > 0) { 1972 PetscCall(PetscArraycpy(c->colidx, a->colidx, a->maxallocmat)); 1973 if (cpvalues == MAT_COPY_VALUES) { 1974 PetscCall(PetscArraycpy(c->val, a->val, a->maxallocmat)); 1975 } else { 1976 PetscCall(PetscArrayzero(c->val, a->maxallocmat)); 1977 } 1978 } 1979 } 1980 1981 c->ignorezeroentries = a->ignorezeroentries; 1982 c->roworiented = a->roworiented; 1983 c->nonew = 
a->nonew; 1984 if (a->diag) { 1985 PetscCall(PetscMalloc1(m, &c->diag)); 1986 PetscCall(PetscLogObjectMemory((PetscObject)C, m * sizeof(PetscInt))); 1987 for (i = 0; i < m; i++) c->diag[i] = a->diag[i]; 1988 } else c->diag = NULL; 1989 1990 c->solve_work = NULL; 1991 c->saved_values = NULL; 1992 c->idiag = NULL; 1993 c->ssor_work = NULL; 1994 c->keepnonzeropattern = a->keepnonzeropattern; 1995 c->free_val = PETSC_TRUE; 1996 c->free_colidx = PETSC_TRUE; 1997 1998 c->maxallocmat = a->maxallocmat; 1999 c->maxallocrow = a->maxallocrow; 2000 c->rlenmax = a->rlenmax; 2001 c->nz = a->nz; 2002 C->preallocated = PETSC_TRUE; 2003 2004 c->nonzerorowcnt = a->nonzerorowcnt; 2005 C->nonzerostate = A->nonzerostate; 2006 2007 PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist)); 2008 PetscFunctionReturn(0); 2009 } 2010 2011 PetscErrorCode MatDuplicate_SeqSELL(Mat A, MatDuplicateOption cpvalues, Mat *B) { 2012 PetscFunctionBegin; 2013 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 2014 PetscCall(MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n)); 2015 if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 2016 PetscCall(MatSetType(*B, ((PetscObject)A)->type_name)); 2017 PetscCall(MatDuplicateNoCreate_SeqSELL(*B, A, cpvalues, PETSC_TRUE)); 2018 PetscFunctionReturn(0); 2019 } 2020 2021 /*MC 2022 MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices, 2023 based on the sliced Ellpack format 2024 2025 Options Database Keys: 2026 . -mat_type seqsell - sets the matrix type to "`MATSEQELL` during a call to `MatSetFromOptions()` 2027 2028 Level: beginner 2029 2030 .seealso: `MatCreateSeqSell()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ` 2031 M*/ 2032 2033 /*MC 2034 MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices. 
2035 2036 This matrix type is identical to `MATSEQSELL` when constructed with a single process communicator, 2037 and `MATMPISELL` otherwise. As a result, for single process communicators, 2038 `MatSeqSELLSetPreallocation()` is supported, and similarly `MatMPISELLSetPreallocation()` is supported 2039 for communicators controlling multiple processes. It is recommended that you call both of 2040 the above preallocation routines for simplicity. 2041 2042 Options Database Keys: 2043 . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions() 2044 2045 Level: beginner 2046 2047 Notes: 2048 This format is only supported for real scalars, double precision, and 32 bit indices (the defaults). 2049 2050 It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of 2051 non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement. 2052 2053 Developer Notes: 2054 On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance. 2055 2056 The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8 2057 .vb 2058 (2 0 3 4) 2059 Consider the matrix A = (5 0 6 0) 2060 (0 0 7 8) 2061 (0 0 9 9) 2062 2063 symbolically the Ellpack format can be written as 2064 2065 (2 3 4 |) (0 2 3 |) 2066 v = (5 6 0 |) colidx = (0 2 2 |) 2067 -------- --------- 2068 (7 8 |) (2 3 |) 2069 (9 9 |) (2 3 |) 2070 2071 The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case). 
2072 Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and 2073 zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice. 2074 2075 The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9) and for colidx is (0 0 2 2 3 2 2 2 3 3) 2076 2077 .ve 2078 2079 See MatMult_SeqSELL() for how this format is used with the SIMD operations to achieve high performance. 2080 2081 References: 2082 . * - Hong Zhang, Richard T. Mills, Karl Rupp, and Barry F. Smith, Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}, 2083 Proceedings of the 47th International Conference on Parallel Processing, 2018. 2084 2085 .seealso: `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSell()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ` 2086 M*/ 2087 2088 /*@C 2089 MatCreateSeqSELL - Creates a sparse matrix in `MATSEQSELL` format. 2090 2091 Collective on comm 2092 2093 Input Parameters: 2094 + comm - MPI communicator, set to `PETSC_COMM_SELF` 2095 . m - number of rows 2096 . n - number of columns 2097 . rlenmax - maximum number of nonzeros in a row 2098 - rlen - array containing the number of nonzeros in the various rows 2099 (possibly different for each row) or NULL 2100 2101 Output Parameter: 2102 . A - the matrix 2103 2104 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 2105 MatXXXXSetPreallocation() paradigm instead of this routine directly. 2106 [MatXXXXSetPreallocation() is, for example, `MatSeqSELLSetPreallocation()`] 2107 2108 Notes: 2109 If nnz is given then nz is ignored 2110 2111 Specify the preallocated storage with either rlenmax or rlen (not both). 2112 Set rlenmax = `PETSC_DEFAULT` and rlen = NULL for PETSc to control dynamic memory 2113 allocation. 
For large problems you MUST preallocate memory or you 2114 will get TERRIBLE performance, see the users' manual chapter on matrices. 2115 2116 Level: intermediate 2117 2118 .seealso: `MATSEQSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATSEQSELL`, `MATMPISELL` 2119 @*/ 2120 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt maxallocrow, const PetscInt rlen[], Mat *A) { 2121 PetscFunctionBegin; 2122 PetscCall(MatCreate(comm, A)); 2123 PetscCall(MatSetSizes(*A, m, n, m, n)); 2124 PetscCall(MatSetType(*A, MATSEQSELL)); 2125 PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A, maxallocrow, rlen)); 2126 PetscFunctionReturn(0); 2127 } 2128 2129 PetscErrorCode MatEqual_SeqSELL(Mat A, Mat B, PetscBool *flg) { 2130 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data, *b = (Mat_SeqSELL *)B->data; 2131 PetscInt totalslices = a->totalslices; 2132 2133 PetscFunctionBegin; 2134 /* If the matrix dimensions are not equal,or no of nonzeros */ 2135 if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) || (a->nz != b->nz) || (a->rlenmax != b->rlenmax)) { 2136 *flg = PETSC_FALSE; 2137 PetscFunctionReturn(0); 2138 } 2139 /* if the a->colidx are the same */ 2140 PetscCall(PetscArraycmp(a->colidx, b->colidx, a->sliidx[totalslices], flg)); 2141 if (!*flg) PetscFunctionReturn(0); 2142 /* if a->val are the same */ 2143 PetscCall(PetscArraycmp(a->val, b->val, a->sliidx[totalslices], flg)); 2144 PetscFunctionReturn(0); 2145 } 2146 2147 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A) { 2148 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 2149 2150 PetscFunctionBegin; 2151 a->idiagvalid = PETSC_FALSE; 2152 PetscFunctionReturn(0); 2153 } 2154 2155 PetscErrorCode MatConjugate_SeqSELL(Mat A) { 2156 #if defined(PETSC_USE_COMPLEX) 2157 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 2158 PetscInt i; 2159 PetscScalar *val = a->val; 2160 2161 PetscFunctionBegin; 2162 for (i = 0; i < a->sliidx[a->totalslices]; i++) val[i] = 
PetscConj(val[i]); 2163 #else 2164 PetscFunctionBegin; 2165 #endif 2166 PetscFunctionReturn(0); 2167 } 2168