1 2 /* 3 Defines the basic matrix operations for the SELL matrix storage format. 4 */ 5 #include <../src/mat/impls/sell/seq/sell.h> /*I "petscmat.h" I*/ 6 #include <petscblaslapack.h> 7 #include <petsc/private/kernels/blocktranspose.h> 8 9 static PetscBool cited = PETSC_FALSE; 10 static const char citation[] = "@inproceedings{ZhangELLPACK2018,\n" 11 " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n" 12 " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n" 13 " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n" 14 " year = 2018\n" 15 "}\n"; 16 17 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 18 19 #include <immintrin.h> 20 21 #if !defined(_MM_SCALE_8) 22 #define _MM_SCALE_8 8 23 #endif 24 25 #if defined(__AVX512F__) 26 /* these do not work 27 vec_idx = _mm512_loadunpackhi_epi32(vec_idx,acolidx); 28 vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval); 29 */ 30 #define AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \ 31 /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \ 32 vec_idx = _mm256_loadu_si256((__m256i const *)acolidx); \ 33 vec_vals = _mm512_loadu_pd(aval); \ 34 vec_x = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); \ 35 vec_y = _mm512_fmadd_pd(vec_x, vec_vals, vec_y) 36 #elif defined(__AVX2__) && defined(__FMA__) 37 #define AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \ 38 vec_vals = _mm256_loadu_pd(aval); \ 39 vec_idx = _mm_loadu_si128((__m128i const *)acolidx); /* SSE2 */ \ 40 vec_x = _mm256_i32gather_pd(x, vec_idx, _MM_SCALE_8); \ 41 vec_y = _mm256_fmadd_pd(vec_x, vec_vals, vec_y) 42 #endif 43 #endif /* PETSC_HAVE_IMMINTRIN_H */ 44 45 /*@C 46 MatSeqSELLSetPreallocation - For good matrix assembly performance 47 the user should 
preallocate the matrix storage by setting the parameter nz 48 (or the array nnz). By setting these parameters accurately, performance 49 during matrix assembly can be increased significantly. 50 51 Collective 52 53 Input Parameters: 54 + B - The `MATSEQSELL` matrix 55 . nz - number of nonzeros per row (same for all rows) 56 - nnz - array containing the number of nonzeros in the various rows 57 (possibly different for each row) or NULL 58 59 Notes: 60 If nnz is given then nz is ignored. 61 62 Specify the preallocated storage with either nz or nnz (not both). 63 Set nz = `PETSC_DEFAULT` and nnz = NULL for PETSc to control dynamic memory 64 allocation. For large problems you MUST preallocate memory or you 65 will get TERRIBLE performance, see the users' manual chapter on matrices. 66 67 You can call `MatGetInfo()` to get information on how effective the preallocation was; 68 for example the fields mallocs,nz_allocated,nz_used,nz_unneeded; 69 You can also run with the option -info and look for messages with the string 70 malloc in them to see if additional memory allocation was needed. 71 72 Developers: Use nz of `MAT_SKIP_ALLOCATION` to not allocate any space for the matrix 73 entries or columns indices. 74 75 The maximum number of nonzeos in any row should be as accurate as possible. 76 If it is underestimated, you will get bad performance due to reallocation 77 (MatSeqXSELLReallocateSELL). 
78 79 Level: intermediate 80 81 .seealso: `MATSEQSELL`, `MATSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()` 82 @*/ 83 PetscErrorCode MatSeqSELLSetPreallocation(Mat B, PetscInt rlenmax, const PetscInt rlen[]) 84 { 85 PetscFunctionBegin; 86 PetscValidHeaderSpecific(B, MAT_CLASSID, 1); 87 PetscValidType(B, 1); 88 PetscTryMethod(B, "MatSeqSELLSetPreallocation_C", (Mat, PetscInt, const PetscInt[]), (B, rlenmax, rlen)); 89 PetscFunctionReturn(0); 90 } 91 92 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B, PetscInt maxallocrow, const PetscInt rlen[]) 93 { 94 Mat_SeqSELL *b; 95 PetscInt i, j, totalslices; 96 PetscBool skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE; 97 98 PetscFunctionBegin; 99 if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE; 100 if (maxallocrow == MAT_SKIP_ALLOCATION) { 101 skipallocation = PETSC_TRUE; 102 maxallocrow = 0; 103 } 104 105 PetscCall(PetscLayoutSetUp(B->rmap)); 106 PetscCall(PetscLayoutSetUp(B->cmap)); 107 108 /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */ 109 if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5; 110 PetscCheck(maxallocrow >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "maxallocrow cannot be less than 0: value %" PetscInt_FMT, maxallocrow); 111 if (rlen) { 112 for (i = 0; i < B->rmap->n; i++) { 113 PetscCheck(rlen[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, rlen[i]); 114 PetscCheck(rlen[i] <= B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT, i, rlen[i], B->cmap->n); 115 } 116 } 117 118 B->preallocated = PETSC_TRUE; 119 120 b = (Mat_SeqSELL *)B->data; 121 122 totalslices = PetscCeilInt(B->rmap->n, 8); 123 b->totalslices = totalslices; 124 if 
(!skipallocation) { 125 if (B->rmap->n & 0x07) PetscCall(PetscInfo(B, "Padding rows to the SEQSELL matrix because the number of rows is not the multiple of 8 (value %" PetscInt_FMT ")\n", B->rmap->n)); 126 127 if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */ 128 PetscCall(PetscMalloc1(totalslices + 1, &b->sliidx)); 129 } 130 if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */ 131 if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10; 132 else if (maxallocrow < 0) maxallocrow = 1; 133 for (i = 0; i <= totalslices; i++) b->sliidx[i] = i * 8 * maxallocrow; 134 } else { 135 maxallocrow = 0; 136 b->sliidx[0] = 0; 137 for (i = 1; i < totalslices; i++) { 138 b->sliidx[i] = 0; 139 for (j = 0; j < 8; j++) b->sliidx[i] = PetscMax(b->sliidx[i], rlen[8 * (i - 1) + j]); 140 maxallocrow = PetscMax(b->sliidx[i], maxallocrow); 141 PetscCall(PetscIntSumError(b->sliidx[i - 1], 8 * b->sliidx[i], &b->sliidx[i])); 142 } 143 /* last slice */ 144 b->sliidx[totalslices] = 0; 145 for (j = (totalslices - 1) * 8; j < B->rmap->n; j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices], rlen[j]); 146 maxallocrow = PetscMax(b->sliidx[totalslices], maxallocrow); 147 b->sliidx[totalslices] = b->sliidx[totalslices - 1] + 8 * b->sliidx[totalslices]; 148 } 149 150 /* allocate space for val, colidx, rlen */ 151 /* FIXME: should B's old memory be unlogged? */ 152 PetscCall(MatSeqXSELLFreeSELL(B, &b->val, &b->colidx)); 153 /* FIXME: assuming an element of the bit array takes 8 bits */ 154 PetscCall(PetscMalloc2(b->sliidx[totalslices], &b->val, b->sliidx[totalslices], &b->colidx)); 155 /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. 
*/ 156 PetscCall(PetscCalloc1(8 * totalslices, &b->rlen)); 157 158 b->singlemalloc = PETSC_TRUE; 159 b->free_val = PETSC_TRUE; 160 b->free_colidx = PETSC_TRUE; 161 } else { 162 b->free_val = PETSC_FALSE; 163 b->free_colidx = PETSC_FALSE; 164 } 165 166 b->nz = 0; 167 b->maxallocrow = maxallocrow; 168 b->rlenmax = maxallocrow; 169 b->maxallocmat = b->sliidx[totalslices]; 170 B->info.nz_unneeded = (double)b->maxallocmat; 171 if (realalloc) PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE)); 172 PetscFunctionReturn(0); 173 } 174 175 PetscErrorCode MatGetRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v) 176 { 177 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 178 PetscInt shift; 179 180 PetscFunctionBegin; 181 PetscCheck(row >= 0 && row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range", row); 182 if (nz) *nz = a->rlen[row]; 183 shift = a->sliidx[row >> 3] + (row & 0x07); 184 if (!a->getrowcols) PetscCall(PetscMalloc2(a->rlenmax, &a->getrowcols, a->rlenmax, &a->getrowvals)); 185 if (idx) { 186 PetscInt j; 187 for (j = 0; j < a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift + 8 * j]; 188 *idx = a->getrowcols; 189 } 190 if (v) { 191 PetscInt j; 192 for (j = 0; j < a->rlen[row]; j++) a->getrowvals[j] = a->val[shift + 8 * j]; 193 *v = a->getrowvals; 194 } 195 PetscFunctionReturn(0); 196 } 197 198 PetscErrorCode MatRestoreRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v) 199 { 200 PetscFunctionBegin; 201 PetscFunctionReturn(0); 202 } 203 204 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat) 205 { 206 Mat B; 207 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 208 PetscInt i; 209 210 PetscFunctionBegin; 211 if (reuse == MAT_REUSE_MATRIX) { 212 B = *newmat; 213 PetscCall(MatZeroEntries(B)); 214 } else { 215 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B)); 216 PetscCall(MatSetSizes(B, A->rmap->n, A->cmap->n, 
A->rmap->N, A->cmap->N)); 217 PetscCall(MatSetType(B, MATSEQAIJ)); 218 PetscCall(MatSeqAIJSetPreallocation(B, 0, a->rlen)); 219 } 220 221 for (i = 0; i < A->rmap->n; i++) { 222 PetscInt nz = 0, *cols = NULL; 223 PetscScalar *vals = NULL; 224 225 PetscCall(MatGetRow_SeqSELL(A, i, &nz, &cols, &vals)); 226 PetscCall(MatSetValues(B, 1, &i, nz, cols, vals, INSERT_VALUES)); 227 PetscCall(MatRestoreRow_SeqSELL(A, i, &nz, &cols, &vals)); 228 } 229 230 PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY)); 231 PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY)); 232 B->rmap->bs = A->rmap->bs; 233 234 if (reuse == MAT_INPLACE_MATRIX) { 235 PetscCall(MatHeaderReplace(A, &B)); 236 } else { 237 *newmat = B; 238 } 239 PetscFunctionReturn(0); 240 } 241 242 #include <../src/mat/impls/aij/seq/aij.h> 243 244 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A, MatType newtype, MatReuse reuse, Mat *newmat) 245 { 246 Mat B; 247 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 248 PetscInt *ai = a->i, m = A->rmap->N, n = A->cmap->N, i, *rowlengths, row, ncols; 249 const PetscInt *cols; 250 const PetscScalar *vals; 251 252 PetscFunctionBegin; 253 254 if (reuse == MAT_REUSE_MATRIX) { 255 B = *newmat; 256 } else { 257 if (PetscDefined(USE_DEBUG) || !a->ilen) { 258 PetscCall(PetscMalloc1(m, &rowlengths)); 259 for (i = 0; i < m; i++) rowlengths[i] = ai[i + 1] - ai[i]; 260 } 261 if (PetscDefined(USE_DEBUG) && a->ilen) { 262 PetscBool eq; 263 PetscCall(PetscMemcmp(rowlengths, a->ilen, m * sizeof(PetscInt), &eq)); 264 PetscCheck(eq, PETSC_COMM_SELF, PETSC_ERR_PLIB, "SeqAIJ ilen array incorrect"); 265 PetscCall(PetscFree(rowlengths)); 266 rowlengths = a->ilen; 267 } else if (a->ilen) rowlengths = a->ilen; 268 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B)); 269 PetscCall(MatSetSizes(B, m, n, m, n)); 270 PetscCall(MatSetType(B, MATSEQSELL)); 271 PetscCall(MatSeqSELLSetPreallocation(B, 0, rowlengths)); 272 if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths)); 273 } 274 275 for (row = 0; row < m; 
row++) { 276 PetscCall(MatGetRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals)); 277 PetscCall(MatSetValues_SeqSELL(B, 1, &row, ncols, cols, vals, INSERT_VALUES)); 278 PetscCall(MatRestoreRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals)); 279 } 280 PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY)); 281 PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY)); 282 B->rmap->bs = A->rmap->bs; 283 284 if (reuse == MAT_INPLACE_MATRIX) { 285 PetscCall(MatHeaderReplace(A, &B)); 286 } else { 287 *newmat = B; 288 } 289 PetscFunctionReturn(0); 290 } 291 292 PetscErrorCode MatMult_SeqSELL(Mat A, Vec xx, Vec yy) 293 { 294 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 295 PetscScalar *y; 296 const PetscScalar *x; 297 const MatScalar *aval = a->val; 298 PetscInt totalslices = a->totalslices; 299 const PetscInt *acolidx = a->colidx; 300 PetscInt i, j; 301 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 302 __m512d vec_x, vec_y, vec_vals; 303 __m256i vec_idx; 304 __mmask8 mask; 305 __m512d vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4; 306 __m256i vec_idx2, vec_idx3, vec_idx4; 307 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 308 __m128i vec_idx; 309 __m256d vec_x, vec_y, vec_y2, vec_vals; 310 MatScalar yval; 311 PetscInt r, rows_left, row, nnz_in_row; 312 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 313 __m128d vec_x_tmp; 314 __m256d vec_x, vec_y, vec_y2, vec_vals; 315 MatScalar yval; 316 PetscInt r, rows_left, row, nnz_in_row; 317 #else 318 PetscScalar sum[8]; 319 #endif 320 321 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 322 #pragma disjoint(*x, *y, *aval) 323 
#endif 324 325 PetscFunctionBegin; 326 PetscCall(VecGetArrayRead(xx, &x)); 327 PetscCall(VecGetArray(yy, &y)); 328 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 329 for (i = 0; i < totalslices; i++) { /* loop over slices */ 330 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 331 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 332 333 vec_y = _mm512_setzero_pd(); 334 vec_y2 = _mm512_setzero_pd(); 335 vec_y3 = _mm512_setzero_pd(); 336 vec_y4 = _mm512_setzero_pd(); 337 338 j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */ 339 switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) { 340 case 3: 341 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 342 acolidx += 8; 343 aval += 8; 344 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 345 acolidx += 8; 346 aval += 8; 347 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 348 acolidx += 8; 349 aval += 8; 350 j += 3; 351 break; 352 case 2: 353 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 354 acolidx += 8; 355 aval += 8; 356 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 357 acolidx += 8; 358 aval += 8; 359 j += 2; 360 break; 361 case 1: 362 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 363 acolidx += 8; 364 aval += 8; 365 j += 1; 366 break; 367 } 368 #pragma novector 369 for (; j < (a->sliidx[i + 1] >> 3); j += 4) { 370 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 371 acolidx += 8; 372 aval += 8; 373 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 374 acolidx += 8; 375 aval += 8; 376 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 377 acolidx += 8; 378 aval += 8; 379 AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4); 380 acolidx += 8; 381 aval += 8; 382 } 383 384 vec_y = _mm512_add_pd(vec_y, vec_y2); 385 vec_y = 
_mm512_add_pd(vec_y, vec_y3); 386 vec_y = _mm512_add_pd(vec_y, vec_y4); 387 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 388 mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07))); 389 _mm512_mask_storeu_pd(&y[8 * i], mask, vec_y); 390 } else { 391 _mm512_storeu_pd(&y[8 * i], vec_y); 392 } 393 } 394 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 395 for (i = 0; i < totalslices; i++) { /* loop over full slices */ 396 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 397 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 398 399 /* last slice may have padding rows. Don't use vectorization. */ 400 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 401 rows_left = A->rmap->n - 8 * i; 402 for (r = 0; r < rows_left; ++r) { 403 yval = (MatScalar)0; 404 row = 8 * i + r; 405 nnz_in_row = a->rlen[row]; 406 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]]; 407 y[row] = yval; 408 } 409 break; 410 } 411 412 vec_y = _mm256_setzero_pd(); 413 vec_y2 = _mm256_setzero_pd(); 414 415 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 416 #pragma novector 417 #pragma unroll(2) 418 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 419 AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 420 aval += 4; 421 acolidx += 4; 422 AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y2); 423 aval += 4; 424 acolidx += 4; 425 } 426 427 _mm256_storeu_pd(y + i * 8, vec_y); 428 _mm256_storeu_pd(y + i * 8 + 4, vec_y2); 429 } 430 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 431 for (i = 0; i < totalslices; i++) { /* loop over full slices */ 432 PetscPrefetchBlock(acolidx, 
a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 433 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 434 435 vec_y = _mm256_setzero_pd(); 436 vec_y2 = _mm256_setzero_pd(); 437 438 /* last slice may have padding rows. Don't use vectorization. */ 439 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 440 rows_left = A->rmap->n - 8 * i; 441 for (r = 0; r < rows_left; ++r) { 442 yval = (MatScalar)0; 443 row = 8 * i + r; 444 nnz_in_row = a->rlen[row]; 445 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]]; 446 y[row] = yval; 447 } 448 break; 449 } 450 451 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 452 #pragma novector 453 #pragma unroll(2) 454 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 455 vec_vals = _mm256_loadu_pd(aval); 456 vec_x_tmp = _mm_setzero_pd(); 457 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 458 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 459 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 460 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 461 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 462 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 463 vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y); 464 aval += 4; 465 466 vec_vals = _mm256_loadu_pd(aval); 467 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 468 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 469 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 470 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 471 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 472 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 473 vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2); 474 aval += 4; 475 } 476 477 _mm256_storeu_pd(y + i * 8, vec_y); 478 _mm256_storeu_pd(y + i * 8 + 4, vec_y2); 479 } 480 #else 481 for (i = 0; i < totalslices; i++) { /* loop over slices */ 482 for (j = 0; j < 8; j++) sum[j] = 
0.0; 483 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 484 sum[0] += aval[j] * x[acolidx[j]]; 485 sum[1] += aval[j + 1] * x[acolidx[j + 1]]; 486 sum[2] += aval[j + 2] * x[acolidx[j + 2]]; 487 sum[3] += aval[j + 3] * x[acolidx[j + 3]]; 488 sum[4] += aval[j + 4] * x[acolidx[j + 4]]; 489 sum[5] += aval[j + 5] * x[acolidx[j + 5]]; 490 sum[6] += aval[j + 6] * x[acolidx[j + 6]]; 491 sum[7] += aval[j + 7] * x[acolidx[j + 7]]; 492 } 493 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */ 494 for (j = 0; j < (A->rmap->n & 0x07); j++) y[8 * i + j] = sum[j]; 495 } else { 496 for (j = 0; j < 8; j++) y[8 * i + j] = sum[j]; 497 } 498 } 499 #endif 500 501 PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); /* theoretical minimal FLOPs */ 502 PetscCall(VecRestoreArrayRead(xx, &x)); 503 PetscCall(VecRestoreArray(yy, &y)); 504 PetscFunctionReturn(0); 505 } 506 507 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h> 508 PetscErrorCode MatMultAdd_SeqSELL(Mat A, Vec xx, Vec yy, Vec zz) 509 { 510 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 511 PetscScalar *y, *z; 512 const PetscScalar *x; 513 const MatScalar *aval = a->val; 514 PetscInt totalslices = a->totalslices; 515 const PetscInt *acolidx = a->colidx; 516 PetscInt i, j; 517 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 518 __m512d vec_x, vec_y, vec_vals; 519 __m256i vec_idx; 520 __mmask8 mask; 521 __m512d vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4; 522 __m256i vec_idx2, vec_idx3, vec_idx4; 523 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 524 __m128d vec_x_tmp; 525 __m256d vec_x, vec_y, vec_y2, vec_vals; 526 MatScalar yval; 527 PetscInt r, row, nnz_in_row; 528 #else 529 PetscScalar sum[8]; 530 #endif 531 532 #if 
defined(PETSC_HAVE_PRAGMA_DISJOINT) 533 #pragma disjoint(*x, *y, *aval) 534 #endif 535 536 PetscFunctionBegin; 537 PetscCall(VecGetArrayRead(xx, &x)); 538 PetscCall(VecGetArrayPair(yy, zz, &y, &z)); 539 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 540 for (i = 0; i < totalslices; i++) { /* loop over slices */ 541 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 542 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 543 544 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 545 mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07))); 546 vec_y = _mm512_mask_loadu_pd(vec_y, mask, &y[8 * i]); 547 } else { 548 vec_y = _mm512_loadu_pd(&y[8 * i]); 549 } 550 vec_y2 = _mm512_setzero_pd(); 551 vec_y3 = _mm512_setzero_pd(); 552 vec_y4 = _mm512_setzero_pd(); 553 554 j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */ 555 switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) { 556 case 3: 557 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 558 acolidx += 8; 559 aval += 8; 560 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 561 acolidx += 8; 562 aval += 8; 563 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 564 acolidx += 8; 565 aval += 8; 566 j += 3; 567 break; 568 case 2: 569 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 570 acolidx += 8; 571 aval += 8; 572 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 573 acolidx += 8; 574 aval += 8; 575 j += 2; 576 break; 577 case 1: 578 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 579 acolidx += 8; 580 aval += 8; 581 j += 1; 582 break; 583 } 584 #pragma novector 585 for (; j < (a->sliidx[i + 1] >> 3); j += 4) { 586 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 587 acolidx += 8; 588 aval += 8; 589 
AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 590 acolidx += 8; 591 aval += 8; 592 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 593 acolidx += 8; 594 aval += 8; 595 AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4); 596 acolidx += 8; 597 aval += 8; 598 } 599 600 vec_y = _mm512_add_pd(vec_y, vec_y2); 601 vec_y = _mm512_add_pd(vec_y, vec_y3); 602 vec_y = _mm512_add_pd(vec_y, vec_y4); 603 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 604 _mm512_mask_storeu_pd(&z[8 * i], mask, vec_y); 605 } else { 606 _mm512_storeu_pd(&z[8 * i], vec_y); 607 } 608 } 609 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 610 for (i = 0; i < totalslices; i++) { /* loop over full slices */ 611 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 612 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 613 614 /* last slice may have padding rows. Don't use vectorization. 
*/ 615 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 616 for (r = 0; r < (A->rmap->n & 0x07); ++r) { 617 row = 8 * i + r; 618 yval = (MatScalar)0.0; 619 nnz_in_row = a->rlen[row]; 620 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]]; 621 z[row] = y[row] + yval; 622 } 623 break; 624 } 625 626 vec_y = _mm256_loadu_pd(y + 8 * i); 627 vec_y2 = _mm256_loadu_pd(y + 8 * i + 4); 628 629 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 630 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 631 vec_vals = _mm256_loadu_pd(aval); 632 vec_x_tmp = _mm_setzero_pd(); 633 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 634 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 635 vec_x = _mm256_setzero_pd(); 636 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 637 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 638 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 639 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 640 vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y); 641 aval += 4; 642 643 vec_vals = _mm256_loadu_pd(aval); 644 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 645 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 646 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 647 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 648 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 649 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 650 vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2); 651 aval += 4; 652 } 653 654 _mm256_storeu_pd(z + i * 8, vec_y); 655 _mm256_storeu_pd(z + i * 8 + 4, vec_y2); 656 } 657 #else 658 for (i = 0; i < totalslices; i++) { /* loop over slices */ 659 for (j = 0; j < 8; j++) sum[j] = 0.0; 660 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 661 sum[0] += aval[j] * x[acolidx[j]]; 662 sum[1] += aval[j + 1] * x[acolidx[j + 1]]; 663 sum[2] += aval[j + 2] * x[acolidx[j + 2]]; 664 sum[3] += aval[j + 3] * 
x[acolidx[j + 3]]; 665 sum[4] += aval[j + 4] * x[acolidx[j + 4]]; 666 sum[5] += aval[j + 5] * x[acolidx[j + 5]]; 667 sum[6] += aval[j + 6] * x[acolidx[j + 6]]; 668 sum[7] += aval[j + 7] * x[acolidx[j + 7]]; 669 } 670 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 671 for (j = 0; j < (A->rmap->n & 0x07); j++) z[8 * i + j] = y[8 * i + j] + sum[j]; 672 } else { 673 for (j = 0; j < 8; j++) z[8 * i + j] = y[8 * i + j] + sum[j]; 674 } 675 } 676 #endif 677 678 PetscCall(PetscLogFlops(2.0 * a->nz)); 679 PetscCall(VecRestoreArrayRead(xx, &x)); 680 PetscCall(VecRestoreArrayPair(yy, zz, &y, &z)); 681 PetscFunctionReturn(0); 682 } 683 684 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A, Vec xx, Vec zz, Vec yy) 685 { 686 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 687 PetscScalar *y; 688 const PetscScalar *x; 689 const MatScalar *aval = a->val; 690 const PetscInt *acolidx = a->colidx; 691 PetscInt i, j, r, row, nnz_in_row, totalslices = a->totalslices; 692 693 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 694 #pragma disjoint(*x, *y, *aval) 695 #endif 696 697 PetscFunctionBegin; 698 if (A->symmetric == PETSC_BOOL3_TRUE) { 699 PetscCall(MatMultAdd_SeqSELL(A, xx, zz, yy)); 700 PetscFunctionReturn(0); 701 } 702 if (zz != yy) PetscCall(VecCopy(zz, yy)); 703 PetscCall(VecGetArrayRead(xx, &x)); 704 PetscCall(VecGetArray(yy, &y)); 705 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 706 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 707 for (r = 0; r < (A->rmap->n & 0x07); ++r) { 708 row = 8 * i + r; 709 nnz_in_row = a->rlen[row]; 710 for (j = 0; j < nnz_in_row; ++j) y[acolidx[8 * j + r]] += aval[8 * j + r] * x[row]; 711 } 712 break; 713 } 714 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 715 y[acolidx[j]] += aval[j] * x[8 * i]; 716 y[acolidx[j + 1]] += aval[j + 1] * x[8 * i + 1]; 717 y[acolidx[j + 2]] += aval[j + 2] * x[8 * i + 2]; 718 y[acolidx[j + 3]] += aval[j + 3] * x[8 * i + 3]; 719 y[acolidx[j + 4]] += aval[j + 4] * x[8 * i + 4]; 720 y[acolidx[j + 5]] += 
aval[j + 5] * x[8 * i + 5]; 721 y[acolidx[j + 6]] += aval[j + 6] * x[8 * i + 6]; 722 y[acolidx[j + 7]] += aval[j + 7] * x[8 * i + 7]; 723 } 724 } 725 PetscCall(PetscLogFlops(2.0 * a->sliidx[a->totalslices])); 726 PetscCall(VecRestoreArrayRead(xx, &x)); 727 PetscCall(VecRestoreArray(yy, &y)); 728 PetscFunctionReturn(0); 729 } 730 731 PetscErrorCode MatMultTranspose_SeqSELL(Mat A, Vec xx, Vec yy) 732 { 733 PetscFunctionBegin; 734 if (A->symmetric == PETSC_BOOL3_TRUE) { 735 PetscCall(MatMult_SeqSELL(A, xx, yy)); 736 } else { 737 PetscCall(VecSet(yy, 0.0)); 738 PetscCall(MatMultTransposeAdd_SeqSELL(A, xx, yy, yy)); 739 } 740 PetscFunctionReturn(0); 741 } 742 743 /* 744 Checks for missing diagonals 745 */ 746 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A, PetscBool *missing, PetscInt *d) 747 { 748 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 749 PetscInt *diag, i; 750 751 PetscFunctionBegin; 752 *missing = PETSC_FALSE; 753 if (A->rmap->n > 0 && !(a->colidx)) { 754 *missing = PETSC_TRUE; 755 if (d) *d = 0; 756 PetscCall(PetscInfo(A, "Matrix has no entries therefore is missing diagonal\n")); 757 } else { 758 diag = a->diag; 759 for (i = 0; i < A->rmap->n; i++) { 760 if (diag[i] == -1) { 761 *missing = PETSC_TRUE; 762 if (d) *d = i; 763 PetscCall(PetscInfo(A, "Matrix is missing diagonal number %" PetscInt_FMT "\n", i)); 764 break; 765 } 766 } 767 } 768 PetscFunctionReturn(0); 769 } 770 771 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A) 772 { 773 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 774 PetscInt i, j, m = A->rmap->n, shift; 775 776 PetscFunctionBegin; 777 if (!a->diag) { 778 PetscCall(PetscMalloc1(m, &a->diag)); 779 a->free_diag = PETSC_TRUE; 780 } 781 for (i = 0; i < m; i++) { /* loop over rows */ 782 shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */ 783 a->diag[i] = -1; 784 for (j = 0; j < a->rlen[i]; j++) { 785 if (a->colidx[shift + j * 8] == i) { 786 a->diag[i] = shift + j * 8; 787 break; 788 } 789 } 790 } 791 PetscFunctionReturn(0); 792 } 793 
/*
   Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways.
   Computes a->idiag[i] (inverted, omega-scaled diagonal used by SOR) and caches
   the raw diagonal in a->mdiag[i]; results are cached until invalidated.
*/
PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A, PetscScalar omega, PetscScalar fshift)
{
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
  PetscInt     i, *diag, m = A->rmap->n;
  MatScalar   *val = a->val;
  PetscScalar *idiag, *mdiag;

  PetscFunctionBegin;
  if (a->idiagvalid) PetscFunctionReturn(0); /* cached values still valid */
  PetscCall(MatMarkDiagonal_SeqSELL(A));
  diag = a->diag;
  if (!a->idiag) {
    /* first call: allocate idiag/mdiag plus the SOR work array together */
    PetscCall(PetscMalloc3(m, &a->idiag, m, &a->mdiag, m, &a->ssor_work));
    val = a->val;
  }
  mdiag = a->mdiag;
  idiag = a->idiag;

  if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) {
    /* fast path: no relaxation factor and no positive shift to fold in */
    for (i = 0; i < m; i++) {
      mdiag[i] = val[diag[i]];
      if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */
        if (PetscRealPart(fshift)) {
          /* negative shift: record the zero pivot but keep going (see note above) */
          PetscCall(PetscInfo(A, "Zero diagonal on row %" PetscInt_FMT "\n", i));
          A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
          A->factorerror_zeropivot_value = 0.0;
          A->factorerror_zeropivot_row = i;
        } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Zero diagonal on row %" PetscInt_FMT, i);
      }
      idiag[i] = 1.0 / val[diag[i]];
    }
    PetscCall(PetscLogFlops(m));
  } else {
    for (i = 0; i < m; i++) {
      mdiag[i] = val[diag[i]];
      idiag[i] = omega / (fshift + val[diag[i]]);
    }
    PetscCall(PetscLogFlops(2.0 * m));
  }
  a->idiagvalid = PETSC_TRUE;
  PetscFunctionReturn(0);
}

/*
   Zeros all stored values (including padding) without changing the nonzero pattern.
*/
PetscErrorCode MatZeroEntries_SeqSELL(Mat A)
{
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;

  PetscFunctionBegin;
  PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices]));
  PetscCall(MatSeqSELLInvalidateDiagonal(A)); /* cached idiag/mdiag no longer match */
  PetscFunctionReturn(0);
}

/*
   Frees all storage owned by the SEQSELL implementation and detaches the
   composed function pointers registered at creation time.
*/
PetscErrorCode MatDestroy_SeqSELL(Mat A)
{
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;

  PetscFunctionBegin;
#if defined(PETSC_USE_LOG)
  PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->n, A->cmap->n, a->nz);
#endif
  PetscCall(MatSeqXSELLFreeSELL(A, &a->val, &a->colidx));
  PetscCall(ISDestroy(&a->row));
  PetscCall(ISDestroy(&a->col));
  PetscCall(PetscFree(a->diag));
  PetscCall(PetscFree(a->rlen));
  PetscCall(PetscFree(a->sliidx));
  PetscCall(PetscFree3(a->idiag, a->mdiag, a->ssor_work));
  PetscCall(PetscFree(a->solve_work));
  PetscCall(ISDestroy(&a->icol));
  PetscCall(PetscFree(a->saved_values));
  PetscCall(PetscFree2(a->getrowcols, a->getrowvals));

  PetscCall(PetscFree(A->data));

  PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetPreallocation_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLRestoreArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqaij_C", NULL));
  PetscFunctionReturn(0);
}

/*
   Handles the matrix options relevant to SEQSELL; a->nonew encodes the
   new-nonzero policy (0 allow, 1 ignore, -1 error, -2 error-on-realloc).
*/
PetscErrorCode MatSetOption_SeqSELL(Mat A, MatOption op, PetscBool flg)
{
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;

  PetscFunctionBegin;
  switch (op) {
  case MAT_ROW_ORIENTED:
    a->roworiented = flg;
    break;
  case MAT_KEEP_NONZERO_PATTERN:
    a->keepnonzeropattern = flg;
    break;
  case MAT_NEW_NONZERO_LOCATIONS:
    a->nonew = (flg ? 0 : 1);
    break;
  case MAT_NEW_NONZERO_LOCATION_ERR:
    a->nonew = (flg ? -1 : 0);
    break;
  case MAT_NEW_NONZERO_ALLOCATION_ERR:
    a->nonew = (flg ? -2 : 0);
    break;
  case MAT_UNUSED_NONZERO_LOCATION_ERR:
    a->nounused = (flg ? -1 : 0);
    break;
  case MAT_FORCE_DIAGONAL_ENTRIES:
  case MAT_IGNORE_OFF_PROC_ENTRIES:
  case MAT_USE_HASH_TABLE:
  case MAT_SORTED_FULL:
    PetscCall(PetscInfo(A, "Option %s ignored\n", MatOptions[op]));
    break;
  case MAT_SPD:
  case MAT_SYMMETRIC:
  case MAT_STRUCTURALLY_SYMMETRIC:
  case MAT_HERMITIAN:
  case MAT_SYMMETRY_ETERNAL:
  case MAT_STRUCTURAL_SYMMETRY_ETERNAL:
  case MAT_SPD_ETERNAL:
    /* These options are handled directly by MatSetOption() */
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unknown option %d", op);
  }
  PetscFunctionReturn(0);
}

/*
   Extracts the diagonal into v. For LU/ILU factored matrices the stored
   diagonal holds the pivot, so its inverse is returned (matching the
   factored-storage convention used elsewhere in this file).
*/
PetscErrorCode MatGetDiagonal_SeqSELL(Mat A, Vec v)
{
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
  PetscInt     i, j, n, shift;
  PetscScalar *x, zero = 0.0;

  PetscFunctionBegin;
  PetscCall(VecGetLocalSize(v, &n));
  PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");

  if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) {
    PetscInt *diag = a->diag;
    PetscCall(VecGetArray(v, &x));
    for (i = 0; i < n; i++) x[i] = 1.0 / a->val[diag[i]]; /* factored storage keeps inverted pivots */
    PetscCall(VecRestoreArray(v, &x));
    PetscFunctionReturn(0);
  }

  PetscCall(VecSet(v, zero));
  PetscCall(VecGetArray(v, &x));
  for (i = 0; i < n; i++) {                 /* loop over rows */
    shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
    x[i] = 0;
    for (j = 0; j < a->rlen[i]; j++) {
      if (a->colidx[shift + j * 8] == i) {
        x[i] = a->val[shift + j * 8];
        break;
      }
    }
  }
  PetscCall(VecRestoreArray(v, &x));
  PetscFunctionReturn(0);
}

/*
   Computes A = diag(ll) * A * diag(rr) in place (either vector may be NULL).
   The last slice may contain padding rows when n is not a multiple of 8;
   left scaling must skip them because l[] has no entries for padded lanes.
*/
PetscErrorCode MatDiagonalScale_SeqSELL(Mat A, Vec ll, Vec rr)
{
  Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
  const PetscScalar *l, *r;
  PetscInt           i, j, m, n, row;

  PetscFunctionBegin;
  if (ll) {
    /* The local size is used so that VecMPI can be passed to this routine
       by MatDiagonalScale_MPISELL */
    PetscCall(VecGetLocalSize(ll, &m));
    PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
    PetscCall(VecGetArrayRead(ll, &l));
    for (i = 0; i < a->totalslices; i++) {                  /* loop over slices */
      if (i == a->totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
        for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) {
          if (row < (A->rmap->n & 0x07)) a->val[j] *= l[8 * i + row];
        }
      } else {
        for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) a->val[j] *= l[8 * i + row];
      }
    }
    PetscCall(VecRestoreArrayRead(ll, &l));
    PetscCall(PetscLogFlops(a->nz));
  }
  if (rr) {
    PetscCall(VecGetLocalSize(rr, &n));
    PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
    PetscCall(VecGetArrayRead(rr, &r));
    for (i = 0; i < a->totalslices; i++) {                  /* loop over slices */
      if (i == a->totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
        for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) {
          if (row < (A->rmap->n & 0x07)) a->val[j] *= r[a->colidx[j]];
        }
      } else {
        /* padded entries index a valid (repeated) column, so no lane check is needed here */
        for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j++) a->val[j] *= r[a->colidx[j]];
      }
    }
    PetscCall(VecRestoreArrayRead(rr, &r));
    PetscCall(PetscLogFlops(a->nz));
  }
  PetscCall(MatSeqSELLInvalidateDiagonal(A));
  PetscFunctionReturn(0);
}

/*
   Gathers the m x n block of entries (im[], in[]) into v; absent entries
   are returned as zero. Negative row/column indices are skipped.
*/
PetscErrorCode MatGetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[])
{
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
  PetscInt    *cp, i, k, low, high, t, row, col, l;
  PetscInt     shift;
  MatScalar   *vp;

  PetscFunctionBegin;
  for (k = 0; k < m; k++) { /* loop over requested rows */
    row = im[k];
    if (row < 0) continue;
PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1); 1018 shift = a->sliidx[row >> 3] + (row & 0x07); /* starting index of the row */ 1019 cp = a->colidx + shift; /* pointer to the row */ 1020 vp = a->val + shift; /* pointer to the row */ 1021 for (l = 0; l < n; l++) { /* loop over requested columns */ 1022 col = in[l]; 1023 if (col < 0) continue; 1024 PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1); 1025 high = a->rlen[row]; 1026 low = 0; /* assume unsorted */ 1027 while (high - low > 5) { 1028 t = (low + high) / 2; 1029 if (*(cp + t * 8) > col) high = t; 1030 else low = t; 1031 } 1032 for (i = low; i < high; i++) { 1033 if (*(cp + 8 * i) > col) break; 1034 if (*(cp + 8 * i) == col) { 1035 *v++ = *(vp + 8 * i); 1036 goto finished; 1037 } 1038 } 1039 *v++ = 0.0; 1040 finished:; 1041 } 1042 } 1043 PetscFunctionReturn(0); 1044 } 1045 1046 PetscErrorCode MatView_SeqSELL_ASCII(Mat A, PetscViewer viewer) 1047 { 1048 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1049 PetscInt i, j, m = A->rmap->n, shift; 1050 const char *name; 1051 PetscViewerFormat format; 1052 1053 PetscFunctionBegin; 1054 PetscCall(PetscViewerGetFormat(viewer, &format)); 1055 if (format == PETSC_VIEWER_ASCII_MATLAB) { 1056 PetscInt nofinalvalue = 0; 1057 /* 1058 if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) { 1059 nofinalvalue = 1; 1060 } 1061 */ 1062 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1063 PetscCall(PetscViewerASCIIPrintf(viewer, "%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n", m, A->cmap->n)); 1064 PetscCall(PetscViewerASCIIPrintf(viewer, "%% Nonzeros = %" PetscInt_FMT " \n", a->nz)); 1065 #if defined(PETSC_USE_COMPLEX) 1066 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",4);\n", a->nz + nofinalvalue)); 1067 #else 1068 
PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",3);\n", a->nz + nofinalvalue)); 1069 #endif 1070 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = [\n")); 1071 1072 for (i = 0; i < m; i++) { 1073 shift = a->sliidx[i >> 3] + (i & 0x07); 1074 for (j = 0; j < a->rlen[i]; j++) { 1075 #if defined(PETSC_USE_COMPLEX) 1076 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n", i + 1, a->colidx[shift + 8 * j] + 1, (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j]))); 1077 #else 1078 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n", i + 1, a->colidx[shift + 8 * j] + 1, (double)a->val[shift + 8 * j])); 1079 #endif 1080 } 1081 } 1082 /* 1083 if (nofinalvalue) { 1084 #if defined(PETSC_USE_COMPLEX) 1085 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n",m,A->cmap->n,0.,0.)); 1086 #else 1087 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n",m,A->cmap->n,0.0)); 1088 #endif 1089 } 1090 */ 1091 PetscCall(PetscObjectGetName((PetscObject)A, &name)); 1092 PetscCall(PetscViewerASCIIPrintf(viewer, "];\n %s = spconvert(zzz);\n", name)); 1093 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1094 } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) { 1095 PetscFunctionReturn(0); 1096 } else if (format == PETSC_VIEWER_ASCII_COMMON) { 1097 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1098 for (i = 0; i < m; i++) { 1099 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1100 shift = a->sliidx[i >> 3] + (i & 0x07); 1101 for (j = 0; j < a->rlen[i]; j++) { 1102 #if defined(PETSC_USE_COMPLEX) 1103 if (PetscImaginaryPart(a->val[shift + 8 * j]) > 0.0 && PetscRealPart(a->val[shift + 8 * j]) != 0.0) { 1104 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift 
+ 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j]))); 1105 } else if (PetscImaginaryPart(a->val[shift + 8 * j]) < 0.0 && PetscRealPart(a->val[shift + 8 * j]) != 0.0) { 1106 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)-PetscImaginaryPart(a->val[shift + 8 * j]))); 1107 } else if (PetscRealPart(a->val[shift + 8 * j]) != 0.0) { 1108 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]))); 1109 } 1110 #else 1111 if (a->val[shift + 8 * j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)a->val[shift + 8 * j])); 1112 #endif 1113 } 1114 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1115 } 1116 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1117 } else if (format == PETSC_VIEWER_ASCII_DENSE) { 1118 PetscInt cnt = 0, jcnt; 1119 PetscScalar value; 1120 #if defined(PETSC_USE_COMPLEX) 1121 PetscBool realonly = PETSC_TRUE; 1122 for (i = 0; i < a->sliidx[a->totalslices]; i++) { 1123 if (PetscImaginaryPart(a->val[i]) != 0.0) { 1124 realonly = PETSC_FALSE; 1125 break; 1126 } 1127 } 1128 #endif 1129 1130 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1131 for (i = 0; i < m; i++) { 1132 jcnt = 0; 1133 shift = a->sliidx[i >> 3] + (i & 0x07); 1134 for (j = 0; j < A->cmap->n; j++) { 1135 if (jcnt < a->rlen[i] && j == a->colidx[shift + 8 * j]) { 1136 value = a->val[cnt++]; 1137 jcnt++; 1138 } else { 1139 value = 0.0; 1140 } 1141 #if defined(PETSC_USE_COMPLEX) 1142 if (realonly) { 1143 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)PetscRealPart(value))); 1144 } else { 1145 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e+%7.5e i ", (double)PetscRealPart(value), (double)PetscImaginaryPart(value))); 1146 } 1147 #else 1148 
PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)value)); 1149 #endif 1150 } 1151 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1152 } 1153 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1154 } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) { 1155 PetscInt fshift = 1; 1156 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1157 #if defined(PETSC_USE_COMPLEX) 1158 PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate complex general\n")); 1159 #else 1160 PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate real general\n")); 1161 #endif 1162 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz)); 1163 for (i = 0; i < m; i++) { 1164 shift = a->sliidx[i >> 3] + (i & 0x07); 1165 for (j = 0; j < a->rlen[i]; j++) { 1166 #if defined(PETSC_USE_COMPLEX) 1167 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n", i + fshift, a->colidx[shift + 8 * j] + fshift, (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j]))); 1168 #else 1169 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g\n", i + fshift, a->colidx[shift + 8 * j] + fshift, (double)a->val[shift + 8 * j])); 1170 #endif 1171 } 1172 } 1173 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1174 } else if (format == PETSC_VIEWER_NATIVE) { 1175 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 1176 PetscInt row; 1177 PetscCall(PetscViewerASCIIPrintf(viewer, "slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n", i, a->sliidx[i], a->sliidx[i + 1])); 1178 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) { 1179 #if defined(PETSC_USE_COMPLEX) 1180 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1181 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n", 8 * i + row, a->colidx[j], 
(double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1182 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1183 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), -(double)PetscImaginaryPart(a->val[j]))); 1184 } else { 1185 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]))); 1186 } 1187 #else 1188 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", 8 * i + row, a->colidx[j], (double)a->val[j])); 1189 #endif 1190 } 1191 } 1192 } else { 1193 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1194 if (A->factortype) { 1195 for (i = 0; i < m; i++) { 1196 shift = a->sliidx[i >> 3] + (i & 0x07); 1197 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1198 /* L part */ 1199 for (j = shift; j < a->diag[i]; j += 8) { 1200 #if defined(PETSC_USE_COMPLEX) 1201 if (PetscImaginaryPart(a->val[shift + 8 * j]) > 0.0) { 1202 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1203 } else if (PetscImaginaryPart(a->val[shift + 8 * j]) < 0.0) { 1204 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j])))); 1205 } else { 1206 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j]))); 1207 } 1208 #else 1209 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j])); 1210 #endif 1211 } 1212 /* diagonal */ 1213 j = a->diag[i]; 1214 #if defined(PETSC_USE_COMPLEX) 1215 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1216 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", 
a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)PetscImaginaryPart(1.0 / a->val[j]))); 1217 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1218 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)(-PetscImaginaryPart(1.0 / a->val[j])))); 1219 } else { 1220 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]))); 1221 } 1222 #else 1223 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)(1.0 / a->val[j]))); 1224 #endif 1225 1226 /* U part */ 1227 for (j = a->diag[i] + 1; j < shift + 8 * a->rlen[i]; j += 8) { 1228 #if defined(PETSC_USE_COMPLEX) 1229 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1230 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1231 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1232 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j])))); 1233 } else { 1234 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j]))); 1235 } 1236 #else 1237 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j])); 1238 #endif 1239 } 1240 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1241 } 1242 } else { 1243 for (i = 0; i < m; i++) { 1244 shift = a->sliidx[i >> 3] + (i & 0x07); 1245 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1246 for (j = 0; j < a->rlen[i]; j++) { 1247 #if defined(PETSC_USE_COMPLEX) 1248 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1249 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), 
(double)PetscImaginaryPart(a->val[shift + 8 * j]))); 1250 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1251 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)-PetscImaginaryPart(a->val[shift + 8 * j]))); 1252 } else { 1253 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]))); 1254 } 1255 #else 1256 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)a->val[shift + 8 * j])); 1257 #endif 1258 } 1259 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1260 } 1261 } 1262 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1263 } 1264 PetscCall(PetscViewerFlush(viewer)); 1265 PetscFunctionReturn(0); 1266 } 1267 1268 #include <petscdraw.h> 1269 PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw, void *Aa) 1270 { 1271 Mat A = (Mat)Aa; 1272 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1273 PetscInt i, j, m = A->rmap->n, shift; 1274 int color; 1275 PetscReal xl, yl, xr, yr, x_l, x_r, y_l, y_r; 1276 PetscViewer viewer; 1277 PetscViewerFormat format; 1278 1279 PetscFunctionBegin; 1280 PetscCall(PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer)); 1281 PetscCall(PetscViewerGetFormat(viewer, &format)); 1282 PetscCall(PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr)); 1283 1284 /* loop over matrix elements drawing boxes */ 1285 1286 if (format != PETSC_VIEWER_DRAW_CONTOUR) { 1287 PetscDrawCollectiveBegin(draw); 1288 /* Blue for negative, Cyan for zero and Red for positive */ 1289 color = PETSC_DRAW_BLUE; 1290 for (i = 0; i < m; i++) { 1291 shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */ 1292 y_l = m - i - 1.0; 1293 y_r = y_l + 1.0; 1294 for (j = 0; j < a->rlen[i]; j++) { 1295 x_l = a->colidx[shift + j * 8]; 1296 x_r = x_l + 1.0; 1297 if (PetscRealPart(a->val[shift + 8 * j]) >= 0.) 
continue; 1298 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1299 } 1300 } 1301 color = PETSC_DRAW_CYAN; 1302 for (i = 0; i < m; i++) { 1303 shift = a->sliidx[i >> 3] + (i & 0x07); 1304 y_l = m - i - 1.0; 1305 y_r = y_l + 1.0; 1306 for (j = 0; j < a->rlen[i]; j++) { 1307 x_l = a->colidx[shift + j * 8]; 1308 x_r = x_l + 1.0; 1309 if (a->val[shift + 8 * j] != 0.) continue; 1310 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1311 } 1312 } 1313 color = PETSC_DRAW_RED; 1314 for (i = 0; i < m; i++) { 1315 shift = a->sliidx[i >> 3] + (i & 0x07); 1316 y_l = m - i - 1.0; 1317 y_r = y_l + 1.0; 1318 for (j = 0; j < a->rlen[i]; j++) { 1319 x_l = a->colidx[shift + j * 8]; 1320 x_r = x_l + 1.0; 1321 if (PetscRealPart(a->val[shift + 8 * j]) <= 0.) continue; 1322 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1323 } 1324 } 1325 PetscDrawCollectiveEnd(draw); 1326 } else { 1327 /* use contour shading to indicate magnitude of values */ 1328 /* first determine max of all nonzero values */ 1329 PetscReal minv = 0.0, maxv = 0.0; 1330 PetscInt count = 0; 1331 PetscDraw popup; 1332 for (i = 0; i < a->sliidx[a->totalslices]; i++) { 1333 if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]); 1334 } 1335 if (minv >= maxv) maxv = minv + PETSC_SMALL; 1336 PetscCall(PetscDrawGetPopup(draw, &popup)); 1337 PetscCall(PetscDrawScalePopup(popup, minv, maxv)); 1338 1339 PetscDrawCollectiveBegin(draw); 1340 for (i = 0; i < m; i++) { 1341 shift = a->sliidx[i >> 3] + (i & 0x07); 1342 y_l = m - i - 1.0; 1343 y_r = y_l + 1.0; 1344 for (j = 0; j < a->rlen[i]; j++) { 1345 x_l = a->colidx[shift + j * 8]; 1346 x_r = x_l + 1.0; 1347 color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]), minv, maxv); 1348 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1349 count++; 1350 } 1351 } 1352 PetscDrawCollectiveEnd(draw); 1353 } 1354 
PetscFunctionReturn(0); 1355 } 1356 1357 #include <petscdraw.h> 1358 PetscErrorCode MatView_SeqSELL_Draw(Mat A, PetscViewer viewer) 1359 { 1360 PetscDraw draw; 1361 PetscReal xr, yr, xl, yl, h, w; 1362 PetscBool isnull; 1363 1364 PetscFunctionBegin; 1365 PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw)); 1366 PetscCall(PetscDrawIsNull(draw, &isnull)); 1367 if (isnull) PetscFunctionReturn(0); 1368 1369 xr = A->cmap->n; 1370 yr = A->rmap->n; 1371 h = yr / 10.0; 1372 w = xr / 10.0; 1373 xr += w; 1374 yr += h; 1375 xl = -w; 1376 yl = -h; 1377 PetscCall(PetscDrawSetCoordinates(draw, xl, yl, xr, yr)); 1378 PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer)); 1379 PetscCall(PetscDrawZoom(draw, MatView_SeqSELL_Draw_Zoom, A)); 1380 PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL)); 1381 PetscCall(PetscDrawSave(draw)); 1382 PetscFunctionReturn(0); 1383 } 1384 1385 PetscErrorCode MatView_SeqSELL(Mat A, PetscViewer viewer) 1386 { 1387 PetscBool iascii, isbinary, isdraw; 1388 1389 PetscFunctionBegin; 1390 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii)); 1391 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary)); 1392 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw)); 1393 if (iascii) { 1394 PetscCall(MatView_SeqSELL_ASCII(A, viewer)); 1395 } else if (isbinary) { 1396 /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */ 1397 } else if (isdraw) PetscCall(MatView_SeqSELL_Draw(A, viewer)); 1398 PetscFunctionReturn(0); 1399 } 1400 1401 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A, MatAssemblyType mode) 1402 { 1403 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1404 PetscInt i, shift, row_in_slice, row, nrow, *cp, lastcol, j, k; 1405 MatScalar *vp; 1406 1407 PetscFunctionBegin; 1408 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 1409 /* To do: compress out the unused elements */ 1410 PetscCall(MatMarkDiagonal_SeqSELL(A)); 1411 
PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n", A->rmap->n, A->cmap->n, a->maxallocmat, a->sliidx[a->totalslices], a->nz, a->sliidx[a->totalslices] - a->nz)); 1412 PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n", a->reallocs)); 1413 PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rlenmax)); 1414 /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */ 1415 for (i = 0; i < a->totalslices; ++i) { 1416 shift = a->sliidx[i]; /* starting index of the slice */ 1417 cp = a->colidx + shift; /* pointer to the column indices of the slice */ 1418 vp = a->val + shift; /* pointer to the nonzero values of the slice */ 1419 for (row_in_slice = 0; row_in_slice < 8; ++row_in_slice) { /* loop over rows in the slice */ 1420 row = 8 * i + row_in_slice; 1421 nrow = a->rlen[row]; /* number of nonzeros in row */ 1422 /* 1423 Search for the nearest nonzero. Normally setting the index to zero may cause extra communication. 1424 But if the entire slice are empty, it is fine to use 0 since the index will not be loaded. 
1425 */ 1426 lastcol = 0; 1427 if (nrow > 0) { /* nonempty row */ 1428 lastcol = cp[8 * (nrow - 1) + row_in_slice]; /* use the index from the last nonzero at current row */ 1429 } else if (!row_in_slice) { /* first row of the currect slice is empty */ 1430 for (j = 1; j < 8; j++) { 1431 if (a->rlen[8 * i + j]) { 1432 lastcol = cp[j]; 1433 break; 1434 } 1435 } 1436 } else { 1437 if (a->sliidx[i + 1] != shift) lastcol = cp[row_in_slice - 1]; /* use the index from the previous row */ 1438 } 1439 1440 for (k = nrow; k < (a->sliidx[i + 1] - shift) / 8; ++k) { 1441 cp[8 * k + row_in_slice] = lastcol; 1442 vp[8 * k + row_in_slice] = (MatScalar)0; 1443 } 1444 } 1445 } 1446 1447 A->info.mallocs += a->reallocs; 1448 a->reallocs = 0; 1449 1450 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 1451 PetscFunctionReturn(0); 1452 } 1453 1454 PetscErrorCode MatGetInfo_SeqSELL(Mat A, MatInfoType flag, MatInfo *info) 1455 { 1456 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1457 1458 PetscFunctionBegin; 1459 info->block_size = 1.0; 1460 info->nz_allocated = a->maxallocmat; 1461 info->nz_used = a->sliidx[a->totalslices]; /* include padding zeros */ 1462 info->nz_unneeded = (a->maxallocmat - a->sliidx[a->totalslices]); 1463 info->assemblies = A->num_ass; 1464 info->mallocs = A->info.mallocs; 1465 info->memory = 0; /* REVIEW ME */ 1466 if (A->factortype) { 1467 info->fill_ratio_given = A->info.fill_ratio_given; 1468 info->fill_ratio_needed = A->info.fill_ratio_needed; 1469 info->factor_mallocs = A->info.factor_mallocs; 1470 } else { 1471 info->fill_ratio_given = 0; 1472 info->fill_ratio_needed = 0; 1473 info->factor_mallocs = 0; 1474 } 1475 PetscFunctionReturn(0); 1476 } 1477 1478 PetscErrorCode MatSetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is) 1479 { 1480 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1481 PetscInt shift, i, k, l, low, high, t, ii, row, col, nrow; 1482 PetscInt *cp, nonew = a->nonew, lastcol = -1; 1483 
MatScalar   *vp, value;

  PetscFunctionBegin;
  for (k = 0; k < m; k++) { /* loop over added rows */
    row = im[k];
    if (row < 0) continue;
    PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
    shift = a->sliidx[row >> 3] + (row & 0x07); /* starting index of the row */
    cp    = a->colidx + shift;                  /* pointer to the row */
    vp    = a->val + shift;                     /* pointer to the row */
    nrow  = a->rlen[row];
    low   = 0;
    high  = nrow;

    for (l = 0; l < n; l++) { /* loop over added columns */
      col = in[l];
      if (col < 0) continue;
      PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
      /* v is laid out row-major or column-major depending on MAT_ROW_ORIENTED */
      if (a->roworiented) {
        value = v[l + k * n];
      } else {
        value = v[k + l * m];
      }
      if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue;

      /* search in this row for the specified column, i indicates the column to be set */
      /* reuse the previous search window while the columns arrive in increasing order */
      if (col <= lastcol) low = 0;
      else high = nrow;
      lastcol = col;
      while (high - low > 5) {
        t = (low + high) / 2;
        if (*(cp + t * 8) > col) high = t;
        else low = t;
      }
      for (i = low; i < high; i++) {
        if (*(cp + i * 8) > col) break;
        if (*(cp + i * 8) == col) {
          /* existing entry: update in place */
          if (is == ADD_VALUES) *(vp + i * 8) += value;
          else *(vp + i * 8) = value;
          low = i + 1;
          goto noinsert;
        }
      }
      if (value == 0.0 && a->ignorezeroentries) goto noinsert;
      if (nonew == 1) goto noinsert; /* new nonzeros are silently ignored */
      PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col);
      /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */
      MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, row / 8, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar);
      /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */
      for (ii = nrow - 1; ii >= i; ii--) {
        *(cp + (ii + 1) * 8) = *(cp + ii * 8);
        *(vp + (ii + 1) * 8) = *(vp + ii * 8);
      }
      a->rlen[row]++;
      *(cp + i * 8) = col;
      *(vp + i * 8) = value;
      a->nz++;
      A->nonzerostate++;
      low = i + 1;
      high++;
      nrow++;
    noinsert:;
    }
    a->rlen[row] = nrow;
  }
  PetscFunctionReturn(0);
}

/*
   Copies A into B; when the nonzero patterns match and both matrices use this
   implementation, only the value array is copied.
*/
PetscErrorCode MatCopy_SeqSELL(Mat A, Mat B, MatStructure str)
{
  PetscFunctionBegin;
  /* If the two matrices have the same copy implementation, use fast copy. */
  if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
    Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
    Mat_SeqSELL *b = (Mat_SeqSELL *)B->data;

    PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices], PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Number of nonzeros in two matrices are different");
    PetscCall(PetscArraycpy(b->val, a->val, a->sliidx[a->totalslices]));
  } else {
    PetscCall(MatCopy_Basic(A, B, str));
  }
  PetscFunctionReturn(0);
}

/* Default setup: preallocate with PETSc-controlled dynamic sizing. */
PetscErrorCode MatSetUp_SeqSELL(Mat A)
{
  PetscFunctionBegin;
  PetscCall(MatSeqSELLSetPreallocation(A, PETSC_DEFAULT, NULL));
  PetscFunctionReturn(0);
}

/* Exposes the raw (slice-ordered, padded) value array. */
PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A, PetscScalar *array[])
{
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;

  PetscFunctionBegin;
  *array = a->val;
  PetscFunctionReturn(0);
}

/* Companion to MatSeqSELLGetArray_SeqSELL; nothing to release. */
PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  PetscFunctionReturn(0);
}

PetscErrorCode
MatRealPart_SeqSELL(Mat A)
{
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
  PetscInt     i;
  MatScalar   *aval = a->val;

  PetscFunctionBegin;
  /* replace every stored value (padding included) with its real part */
  for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscRealPart(aval[i]);
  PetscFunctionReturn(0);
}

PetscErrorCode MatImaginaryPart_SeqSELL(Mat A)
{
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
  PetscInt     i;
  MatScalar   *aval = a->val;

  PetscFunctionBegin;
  /* replace every stored value (padding included) with its imaginary part */
  for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]);
  PetscCall(MatSeqSELLInvalidateDiagonal(A)); /* diagonal values changed */
  PetscFunctionReturn(0);
}

/*
   Scales all stored values (padding included) by alpha using BLAS scal;
   scaling the padding zeros is harmless since alpha*0 == 0.
*/
PetscErrorCode MatScale_SeqSELL(Mat inA, PetscScalar alpha)
{
  Mat_SeqSELL *a = (Mat_SeqSELL *)inA->data;
  MatScalar   *aval = a->val;
  PetscScalar  oalpha = alpha;
  PetscBLASInt one = 1, size;

  PetscFunctionBegin;
  PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices], &size));
  PetscCallBLAS("BLASscal", BLASscal_(&size, &oalpha, aval, &one));
  PetscCall(PetscLogFlops(a->nz));
  PetscCall(MatSeqSELLInvalidateDiagonal(inA));
  PetscFunctionReturn(0);
}

/*
   Y = Y + a*I; ensures minimal preallocation exists first so the basic
   implementation can insert the diagonal entries.
*/
PetscErrorCode MatShift_SeqSELL(Mat Y, PetscScalar a)
{
  Mat_SeqSELL *y = (Mat_SeqSELL *)Y->data;

  PetscFunctionBegin;
  if (!Y->preallocated || !y->nz) PetscCall(MatSeqSELLSetPreallocation(Y, 1, NULL));
  PetscCall(MatShift_Basic(Y, a));
  PetscFunctionReturn(0);
}

/*
   SSOR/SOR relaxation sweeps. Uses the cached inverted diagonal from
   MatInvertDiagonal_SeqSELL (recomputed when omega or fshift changes) and
   the a->ssor_work array to checkpoint the lower-triangular application.
*/
PetscErrorCode MatSOR_SeqSELL(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
{
  Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
  PetscScalar       *x, sum, *t;
  const MatScalar   *idiag = NULL, *mdiag;
  const PetscScalar *b, *xb;
  PetscInt           n, m = A->rmap->n, i, j, shift;
  const PetscInt    *diag;

  PetscFunctionBegin;
  its = its * lits;

  if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must
  recompute idiag[] */
  if (!a->idiagvalid) PetscCall(MatInvertDiagonal_SeqSELL(A, omega, fshift));
  a->fshift = fshift;
  a->omega  = omega;

  diag  = a->diag;
  t     = a->ssor_work;
  idiag = a->idiag;
  mdiag = a->mdiag;

  PetscCall(VecGetArray(xx, &x));
  PetscCall(VecGetArrayRead(bb, &b));
  /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
  PetscCheck(flag != SOR_APPLY_UPPER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_UPPER is not implemented");
  PetscCheck(flag != SOR_APPLY_LOWER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_LOWER is not implemented");
  PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat");

  /* Throughout: slice height is 8, so row i lives in slice i>>3 at lane i&0x07, and
     consecutive entries of a row are 8 apart; (diag[i]-shift)/8 counts the strictly
     lower-triangular entries of row i (columns are stored sorted). */
  if (flag & SOR_ZERO_INITIAL_GUESS) {
    if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
      for (i = 0; i < m; i++) {
        shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
        sum   = b[i];
        n     = (diag[i] - shift) / 8;
        for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]];
        t[i] = sum; /* b - L*x, saved for a possible backward sweep */
        x[i] = sum * idiag[i];
      }
      xb = t;
      PetscCall(PetscLogFlops(a->nz));
    } else xb = b;
    if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
      for (i = m - 1; i >= 0; i--) {
        shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
        sum   = xb[i];
        n     = a->rlen[i] - (diag[i] - shift) / 8 - 1; /* number of strictly upper-triangular entries */
        for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]];
        if (xb == b) {
          x[i] = sum * idiag[i];
        } else {
          x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
        }
      }
      PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
    }
    its--;
  }
  while (its--) {
    if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
      for (i = 0; i < m; i++) {
        /* lower */
        shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
        sum   = b[i];
        n     = (diag[i] - shift) / 8;
        for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]];
        t[i] = sum; /* save application of the lower-triangular part */
        /* upper */
        n = a->rlen[i] - (diag[i] - shift) / 8 - 1;
        for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]];
        x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
      }
      xb = t;
      PetscCall(PetscLogFlops(2.0 * a->nz));
    } else xb = b;
    if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
      for (i = m - 1; i >= 0; i--) {
        shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
        sum   = xb[i];
        if (xb == b) {
          /* whole matrix (no checkpointing available) */
          n = a->rlen[i];
          for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]];
          /* the full-row sum above subtracted the diagonal term too; add it back */
          x[i] = (1. - omega) * x[i] + (sum + mdiag[i] * x[i]) * idiag[i];
        } else { /* lower-triangular part has been saved, so only apply upper-triangular */
          n = a->rlen[i] - (diag[i] - shift) / 8 - 1;
          for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]];
          x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
        }
      }
      if (xb == b) {
        PetscCall(PetscLogFlops(2.0 * a->nz));
      } else {
        PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
      }
    }
  }
  PetscCall(VecRestoreArray(xx, &x));
  PetscCall(VecRestoreArrayRead(bb, &b));
  PetscFunctionReturn(0);
}

/* -------------------------------------------------------------------*/
/* Function table for MATSEQSELL; the numbered comments are the MatOps slot indices */
static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL,
                                       MatGetRow_SeqSELL,
                                       MatRestoreRow_SeqSELL,
                                       MatMult_SeqSELL,
                                       /* 4*/ MatMultAdd_SeqSELL,
                                       MatMultTranspose_SeqSELL,
                                       MatMultTransposeAdd_SeqSELL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 10*/ NULL,
                                       NULL,
                                       NULL,
                                       MatSOR_SeqSELL,
                                       NULL,
                                       /* 15*/ MatGetInfo_SeqSELL,
                                       MatEqual_SeqSELL,
                                       MatGetDiagonal_SeqSELL,
                                       MatDiagonalScale_SeqSELL,
                                       NULL,
                                       /* 20*/ NULL,
                                       MatAssemblyEnd_SeqSELL,
                                       MatSetOption_SeqSELL,
                                       MatZeroEntries_SeqSELL,
                                       /* 24*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 29*/ MatSetUp_SeqSELL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 34*/ MatDuplicate_SeqSELL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 39*/ NULL,
                                       NULL,
                                       NULL,
                                       MatGetValues_SeqSELL,
                                       MatCopy_SeqSELL,
                                       /* 44*/ NULL,
                                       MatScale_SeqSELL,
                                       MatShift_SeqSELL,
                                       NULL,
                                       NULL,
                                       /* 49*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 54*/ MatFDColoringCreate_SeqXAIJ,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 59*/ NULL,
                                       MatDestroy_SeqSELL,
                                       MatView_SeqSELL,
                                       NULL,
                                       NULL,
                                       /* 64*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 69*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 74*/ NULL,
                                       MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 79*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 84*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 89*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 94*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 99*/ NULL,
                                       NULL,
                                       NULL,
                                       MatConjugate_SeqSELL,
                                       NULL,
                                       /*104*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /*109*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       MatMissingDiagonal_SeqSELL,
                                       /*114*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /*119*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /*124*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /*129*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /*134*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /*139*/ NULL,
                                       NULL,
                                       NULL,
                                       MatFDColoringSetUp_SeqXAIJ,
                                       NULL,
                                       /*144*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /*150*/ NULL};

/* Snapshot the current matrix values so MatRetrieveValues() can restore them later.
   Requires MAT_NEW_NONZERO_LOCATIONS == PETSC_FALSE so the nonzero pattern (and hence
   the storage length sliidx[totalslices]) cannot change between store and retrieve. */
PetscErrorCode MatStoreValues_SeqSELL(Mat mat)
{
  Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;

  PetscFunctionBegin;
  PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");

  /* allocate space for values if not already there */
  if (!a->saved_values) { PetscCall(PetscMalloc1(a->sliidx[a->totalslices] + 1, &a->saved_values)); }

  /* copy values over */
  PetscCall(PetscArraycpy(a->saved_values, a->val, a->sliidx[a->totalslices]));
  PetscFunctionReturn(0);
}

/* Restore the values snapshotted by MatStoreValues_SeqSELL() */
PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat)
{
  Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;

  PetscFunctionBegin;
  PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
  PetscCheck(a->saved_values, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatStoreValues(A);first");
  PetscCall(PetscArraycpy(a->val, a->saved_values, a->sliidx[a->totalslices]));
  PetscFunctionReturn(0);
}

/*@C
   MatSeqSELLRestoreArray - returns access to the array where the data for a `MATSEQSELL` matrix is stored obtained by `MatSeqSELLGetArray()`

   Not Collective

   Input Parameters:
+  mat - a `MATSEQSELL` matrix
-  array - pointer to the data

   Level: intermediate

.seealso: `MATSEQSELL`, `MatSeqSELLGetArray()`, `MatSeqSELLRestoreArrayF90()`
@*/
PetscErrorCode MatSeqSELLRestoreArray(Mat A, PetscScalar **array)
{
  PetscFunctionBegin;
  PetscUseMethod(A, "MatSeqSELLRestoreArray_C", (Mat, PetscScalar **), (A, array));
  PetscFunctionReturn(0);
}

/* Type constructor: installs the MATSEQSELL function table and composed methods on B */
PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B)
{
  Mat_SeqSELL *b;
  PetscMPIInt  size;

  PetscFunctionBegin;
  PetscCall(PetscCitationsRegister(citation, &cited));
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
  /* the sequential format is only valid on one MPI process */
  PetscCheck(size <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Comm must be of size 1");

  PetscCall(PetscNew(&b));

  B->data = (void *)b;

  PetscCall(PetscMemcpy(B->ops, &MatOps_Values, sizeof(struct _MatOps)));

  b->row                = NULL;
  b->col                = NULL;
  b->icol               = NULL;
  b->reallocs           = 0;
  b->ignorezeroentries  = PETSC_FALSE;
  b->roworiented        = PETSC_TRUE;
  b->nonew              = 0;
  b->diag               = NULL;
  b->solve_work         = NULL;
  B->spptr              = NULL;
  b->saved_values       = NULL;
  b->idiag              = NULL;
  b->mdiag              = NULL;
  b->ssor_work          = NULL;
  b->omega              = 1.0;
  b->fshift             = 0.0;
  b->idiagvalid         = PETSC_FALSE;
  b->keepnonzeropattern = PETSC_FALSE;

  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELL));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetArray_C", MatSeqSELLGetArray_SeqSELL));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLRestoreArray_C", MatSeqSELLRestoreArray_SeqSELL));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C",
                                       MatStoreValues_SeqSELL));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqSELL));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetPreallocation_C", MatSeqSELLSetPreallocation_SeqSELL));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqaij_C", MatConvert_SeqSELL_SeqAIJ));
  PetscFunctionReturn(0);
}

/*
   Given a matrix generated with MatGetFactor() duplicates all the information in A into B
*/
PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace)
{
  Mat_SeqSELL *c = (Mat_SeqSELL *)C->data, *a = (Mat_SeqSELL *)A->data;
  PetscInt     i, m = A->rmap->n;
  PetscInt     totalslices = a->totalslices;

  PetscFunctionBegin;
  C->factortype = A->factortype;
  c->row        = NULL;
  c->col        = NULL;
  c->icol       = NULL;
  c->reallocs   = 0;
  C->assembled  = PETSC_TRUE;

  PetscCall(PetscLayoutReference(A->rmap, &C->rmap));
  PetscCall(PetscLayoutReference(A->cmap, &C->cmap));

  /* 8 rows per slice; rlen is allocated per (padded) row, sliidx has one extra end entry */
  PetscCall(PetscMalloc1(8 * totalslices, &c->rlen));
  PetscCall(PetscMalloc1(totalslices + 1, &c->sliidx));

  for (i = 0; i < m; i++) c->rlen[i] = a->rlen[i];
  for (i = 0; i < totalslices + 1; i++) c->sliidx[i] = a->sliidx[i];

  /* allocate the matrix space */
  if (mallocmatspace) {
    PetscCall(PetscMalloc2(a->maxallocmat, &c->val, a->maxallocmat, &c->colidx));

    c->singlemalloc = PETSC_TRUE;

    /* the column indices are always copied; the values only for MAT_COPY_VALUES */
    if (m > 0) {
      PetscCall(PetscArraycpy(c->colidx, a->colidx, a->maxallocmat));
      if (cpvalues == MAT_COPY_VALUES) {
        PetscCall(PetscArraycpy(c->val, a->val, a->maxallocmat));
      } else {
        PetscCall(PetscArrayzero(c->val, a->maxallocmat));
      }
    }
  }

  c->ignorezeroentries = a->ignorezeroentries;
  c->roworiented       = a->roworiented;
  c->nonew             = a->nonew;
  if (a->diag) {
    PetscCall(PetscMalloc1(m, &c->diag));
    for (i = 0; i < m; i++) c->diag[i] = a->diag[i];
  } else c->diag = NULL;

  /* cached/work data is rebuilt on demand, never copied */
  c->solve_work         = NULL;
  c->saved_values       = NULL;
  c->idiag              = NULL;
  c->ssor_work          = NULL;
  c->keepnonzeropattern = a->keepnonzeropattern;
  c->free_val           = PETSC_TRUE;
  c->free_colidx        = PETSC_TRUE;

  c->maxallocmat  = a->maxallocmat;
  c->maxallocrow  = a->maxallocrow;
  c->rlenmax      = a->rlenmax;
  c->nz           = a->nz;
  C->preallocated = PETSC_TRUE;

  c->nonzerorowcnt = a->nonzerorowcnt;
  C->nonzerostate  = A->nonzerostate;

  PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist));
  PetscFunctionReturn(0);
}

PetscErrorCode MatDuplicate_SeqSELL(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  PetscCall(MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n));
  /* only propagate block sizes when they evenly divide the dimensions */
  if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) PetscCall(MatSetBlockSizesFromMats(*B, A, A));
  PetscCall(MatSetType(*B, ((PetscObject)A)->type_name));
  PetscCall(MatDuplicateNoCreate_SeqSELL(*B, A, cpvalues, PETSC_TRUE));
  PetscFunctionReturn(0);
}

/*MC
   MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices,
   based on the sliced Ellpack format

   Options Database Keys:
. -mat_type seqsell - sets the matrix type to "seqsell" during a call to `MatSetFromOptions()`

   Level: beginner

.seealso: `MatCreateSeqSELL()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ`
M*/

/*MC
   MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices.

   This matrix type is identical to `MATSEQSELL` when constructed with a single process communicator,
   and `MATMPISELL` otherwise.
   As a result, for single process communicators,
   `MatSeqSELLSetPreallocation()` is supported, and similarly `MatMPISELLSetPreallocation()` is supported
   for communicators controlling multiple processes. It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
. -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions()

   Level: beginner

   Notes:
   This format is only supported for real scalars, double precision, and 32 bit indices (the defaults).

   It can provide better performance on Intel and AMD processors with AVX2 or AVX512 support for matrices that have a similar number of
   non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement.

   Developer Notes:
   On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance.

   The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8
.vb
                            (2 0  3 4)
   Consider the matrix A =  (5 0  6 0)
                            (0 0  7 8)
                            (0 0  9 9)

   symbolically the Ellpack format can be written as

        (2 3 4 |)           (0 2 3 |)
   v =  (5 6 0 |)  colidx = (0 2 2 |)
        --------            ---------
        (7 8   |)           (2 3   |)
        (9 9   |)           (2 3   |)

   The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case).
   Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and
   zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice.

   The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9)  and for colidx is (0 0 2 2 3 2 2 2 3 3)

.ve

   See MatMult_SeqSELL() for how this format is used with the SIMD operations to achieve high performance.

   References:
.  * - Hong Zhang, Richard T. Mills, Karl Rupp, and Barry F. Smith, Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512},
   Proceedings of the 47th International Conference on Parallel Processing, 2018.

.seealso: `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSELL()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ`
M*/

/*@C
   MatCreateSeqSELL - Creates a sparse matrix in `MATSEQSELL` format.

   Collective on comm

   Input Parameters:
+  comm - MPI communicator, set to `PETSC_COMM_SELF`
.  m - number of rows
.  n - number of columns
.  maxallocrow - maximum number of nonzeros in a row
-  rlen - array containing the number of nonzeros in the various rows
          (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, `MatSeqSELLSetPreallocation()`]

   Notes:
   If rlen is given then maxallocrow is ignored

   Specify the preallocated storage with either maxallocrow or rlen (not both).
   Set maxallocrow = `PETSC_DEFAULT` and rlen = NULL for PETSc to control dynamic memory
   allocation. For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   Level: intermediate

.seealso: `MATSEQSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATSEQSELL`, `MATMPISELL`
@*/
PetscErrorCode MatCreateSeqSELL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt maxallocrow, const PetscInt rlen[], Mat *A)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, m, n));
  PetscCall(MatSetType(*A, MATSEQSELL));
  /* the type was just set above, so it is safe to call the type-specific routine directly */
  PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A, maxallocrow, rlen));
  PetscFunctionReturn(0);
}

/* Compare two SELL matrices entry-wise: dimensions/nonzero counts first, then the
   column-index and value arrays of the slice storage */
PetscErrorCode MatEqual_SeqSELL(Mat A, Mat B, PetscBool *flg)
{
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data, *b = (Mat_SeqSELL *)B->data;
  PetscInt     totalslices = a->totalslices;

  PetscFunctionBegin;
  /* If the matrix dimensions are not equal, or no of nonzeros */
  if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) || (a->nz != b->nz) || (a->rlenmax != b->rlenmax)) {
    *flg = PETSC_FALSE;
    PetscFunctionReturn(0);
  }
  /* if the a->colidx are the same */
  /* NOTE(review): this compares a->sliidx[totalslices] entries of both matrices; it
     presumably relies on equal nz/rlenmax implying equal padded storage lengths —
     confirm b->sliidx[b->totalslices] cannot differ here */
  PetscCall(PetscArraycmp(a->colidx, b->colidx, a->sliidx[totalslices], flg));
  if (!*flg) PetscFunctionReturn(0);
  /* if a->val are the same */
  PetscCall(PetscArraycmp(a->val, b->val, a->sliidx[totalslices], flg));
  PetscFunctionReturn(0);
}

/* Mark the cached inverted diagonal (used by MatSOR) as stale; it will be rebuilt on next use */
PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A)
{
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;

  PetscFunctionBegin;
  a->idiagvalid = PETSC_FALSE;
  PetscFunctionReturn(0);
}

/* Complex-conjugate every stored entry; a no-op in real builds */
PetscErrorCode MatConjugate_SeqSELL(Mat A)
{
#if defined(PETSC_USE_COMPLEX)
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
  PetscInt     i;
  PetscScalar *val = a->val;

  PetscFunctionBegin;
  for (i = 0; i < a->sliidx[a->totalslices]; i++) val[i] = PetscConj(val[i]);
#else
  PetscFunctionBegin;
#endif
  PetscFunctionReturn(0);
}