/*
  Defines the basic matrix operations for the SELL matrix storage format.
*/
#include <../src/mat/impls/sell/seq/sell.h> /*I "petscmat.h" I*/
#include <petscblaslapack.h>
#include <petsc/private/kernels/blocktranspose.h>

/* One-time citation registration for the paper describing these vectorized SELL kernels */
static PetscBool  cited    = PETSC_FALSE;
static const char citation[] = "@inproceedings{ZhangELLPACK2018,\n"
                               " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n"
                               " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n"
                               " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n"
                               " year = 2018\n"
                               "}\n";

#if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)

#include <immintrin.h>

#if !defined(_MM_SCALE_8)
#define _MM_SCALE_8 8
#endif

#if defined(__AVX512F__)
/* these do not work
   vec_idx  = _mm512_loadunpackhi_epi32(vec_idx,acolidx);
   vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval);
*/
/* Multiply-accumulate one SELL slice column of 8 rows: load 8 32-bit column
   indices, gather the corresponding x entries, then vec_y += aval[0..7]*x.
   The macro reads the file-local cursors acolidx/aval; the CALLER advances
   them by 8 after each invocation. */
#define AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
  /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \
  vec_idx = _mm256_loadu_si256((__m256i const *)acolidx); \
  vec_vals = _mm512_loadu_pd(aval); \
  vec_x = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); \
  vec_y = _mm512_fmadd_pd(vec_x, vec_vals, vec_y)
#elif defined(__AVX2__) && defined(__FMA__)
/* AVX2+FMA variant: processes 4 rows (256 bits) per invocation; caller advances
   acolidx/aval by 4 */
#define AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
  vec_vals = _mm256_loadu_pd(aval); \
  vec_idx = _mm_loadu_si128((__m128i const *)acolidx); /* SSE2 */ \
  vec_x = _mm256_i32gather_pd(x, vec_idx, _MM_SCALE_8); \
  vec_y = _mm256_fmadd_pd(vec_x, vec_vals, vec_y)
#endif
#endif /* PETSC_HAVE_IMMINTRIN_H */

/*@C
   MatSeqSELLSetPreallocation - For good matrix assembly performance
   the user should preallocate the matrix storage by setting the parameter nz
   (or the array nnz).  By setting these parameters accurately, performance
   during matrix assembly can be increased significantly.

   Collective

   Input Parameters:
+  B - The matrix
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Notes:
   If nnz is given then nz is ignored.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   You can call MatGetInfo() to get information on how effective the preallocation was;
   for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
   You can also run with the option -info and look for messages with the string
   malloc in them to see if additional memory allocation was needed.

   Developers: Use nz of MAT_SKIP_ALLOCATION to not allocate any space for the matrix
   entries or columns indices.

   The maximum number of nonzeros in any row should be as accurate as possible.
   If it is underestimated, you will get bad performance due to reallocation
   (MatSeqXSELLReallocateSELL).

   Level: intermediate

.seealso: `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()`

@*/
PetscErrorCode MatSeqSELLSetPreallocation(Mat B, PetscInt rlenmax, const PetscInt rlen[]) {
  PetscFunctionBegin;
  PetscValidHeaderSpecific(B, MAT_CLASSID, 1);
  PetscValidType(B, 1);
  /* Dispatch to the type-specific implementation if B is a SeqSELL matrix */
  PetscTryMethod(B, "MatSeqSELLSetPreallocation_C", (Mat, PetscInt, const PetscInt[]), (B, rlenmax, rlen));
  PetscFunctionReturn(0);
}

/* Type-specific preallocation: computes the slice index array (sliidx) for
   slices of height 8, then allocates val/colidx/rlen storage accordingly.
   maxallocrow is the max nonzeros per row; rlen[] (optional) gives per-row counts. */
PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B, PetscInt maxallocrow, const PetscInt rlen[]) {
  Mat_SeqSELL *b;
  PetscInt     i, j, totalslices;
  PetscBool    skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE;

  PetscFunctionBegin;
  if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE;
  if (maxallocrow == MAT_SKIP_ALLOCATION) {
    skipallocation = PETSC_TRUE;
    maxallocrow    = 0;
  }

  PetscCall(PetscLayoutSetUp(B->rmap));
  PetscCall(PetscLayoutSetUp(B->cmap));

  /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */
  if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5;
  PetscCheck(maxallocrow >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "maxallocrow cannot be less than 0: value %" PetscInt_FMT, maxallocrow);
  if (rlen) {
    /* Validate per-row nonzero counts against [0, ncols] */
    for (i = 0; i < B->rmap->n; i++) {
      PetscCheck(rlen[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, rlen[i]);
      PetscCheck(rlen[i] <= B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT, i, rlen[i], B->cmap->n);
    }
  }

  B->preallocated = PETSC_TRUE;

  b = (Mat_SeqSELL *)B->data;

  /* Rows are grouped in slices of height 8; a short final slice is padded */
  totalslices    = PetscCeilInt(B->rmap->n, 8);
  b->totalslices = totalslices;
  if (!skipallocation) {
    if (B->rmap->n & 0x07) PetscCall(PetscInfo(B, "Padding rows to the SEQSELL matrix because the number of rows is not the multiple of 8 (value %" PetscInt_FMT ")\n", B->rmap->n));

    if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */
      PetscCall(PetscMalloc1(totalslices + 1, &b->sliidx));
      PetscCall(PetscLogObjectMemory((PetscObject)B, (totalslices + 1) * sizeof(PetscInt)));
    }
    if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */
      if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10;
      else if (maxallocrow < 0) maxallocrow = 1;
      for (i = 0; i <= totalslices; i++) b->sliidx[i] = i * 8 * maxallocrow;
    } else {
      /* Slice width = max rlen over the 8 rows of the slice; sliidx is the running prefix sum of 8*width */
      maxallocrow  = 0;
      b->sliidx[0] = 0;
      for (i = 1; i < totalslices; i++) {
        b->sliidx[i] = 0;
        for (j = 0; j < 8; j++) { b->sliidx[i] = PetscMax(b->sliidx[i], rlen[8 * (i - 1) + j]); }
        maxallocrow = PetscMax(b->sliidx[i], maxallocrow);
        PetscCall(PetscIntSumError(b->sliidx[i - 1], 8 * b->sliidx[i], &b->sliidx[i]));
      }
      /* last slice */
      b->sliidx[totalslices] = 0;
      for (j = (totalslices - 1) * 8; j < B->rmap->n; j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices], rlen[j]);
      maxallocrow            = PetscMax(b->sliidx[totalslices], maxallocrow);
      b->sliidx[totalslices] = b->sliidx[totalslices - 1] + 8 * b->sliidx[totalslices];
    }

    /* allocate space for val, colidx, rlen */
    /* FIXME: should B's old memory be unlogged? */
    PetscCall(MatSeqXSELLFreeSELL(B, &b->val, &b->colidx));
    /* FIXME: assuming an element of the bit array takes 8 bits */
    PetscCall(PetscMalloc2(b->sliidx[totalslices], &b->val, b->sliidx[totalslices], &b->colidx));
    PetscCall(PetscLogObjectMemory((PetscObject)B, b->sliidx[totalslices] * (sizeof(PetscScalar) + sizeof(PetscInt))));
    /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */
    PetscCall(PetscCalloc1(8 * totalslices, &b->rlen));
    PetscCall(PetscLogObjectMemory((PetscObject)B, 8 * totalslices * sizeof(PetscInt)));

    b->singlemalloc = PETSC_TRUE;
    b->free_val     = PETSC_TRUE;
    b->free_colidx  = PETSC_TRUE;
  } else {
    b->free_val    = PETSC_FALSE;
    b->free_colidx = PETSC_FALSE;
  }

  b->nz               = 0;
  b->maxallocrow      = maxallocrow;
  b->rlenmax          = maxallocrow;
  b->maxallocmat      = b->sliidx[totalslices];
  B->info.nz_unneeded = (double)b->maxallocmat;
  if (realalloc) PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));
  PetscFunctionReturn(0);
}

/* Extract row `row` into contiguous scratch arrays (getrowcols/getrowvals);
   SELL stores a row strided by 8 within its slice, so entries are gathered. */
PetscErrorCode MatGetRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v) {
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
  PetscInt     shift;

  PetscFunctionBegin;
  PetscCheck(row >= 0 && row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range", row);
  if (nz) *nz = a->rlen[row];
  /* starting offset of this row: slice base + position within the slice of 8 */
  shift = a->sliidx[row >> 3] + (row & 0x07);
  if (!a->getrowcols) { PetscCall(PetscMalloc2(a->rlenmax, &a->getrowcols, a->rlenmax, &a->getrowvals)); }
  if (idx) {
    PetscInt j;
    for (j = 0; j < a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift + 8 * j];
    *idx = a->getrowcols;
  }
  if (v) {
    PetscInt j;
    for (j = 0; j < a->rlen[row]; j++) a->getrowvals[j] = a->val[shift + 8 * j];
    *v = a->getrowvals;
  }
  PetscFunctionReturn(0);
}

/* Nothing to release: MatGetRow_SeqSELL hands out persistent scratch arrays */
PetscErrorCode MatRestoreRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v) {
  PetscFunctionBegin;
  PetscFunctionReturn(0);
}

/* Convert a SeqSELL matrix to SeqAIJ by copying row by row */
PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat) {
  Mat          B;
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
  PetscInt     i;

  PetscFunctionBegin;
  if (reuse == MAT_REUSE_MATRIX) {
    B = *newmat;
    PetscCall(MatZeroEntries(B));
  } else {
    PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
    PetscCall(MatSetSizes(B, A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N));
    PetscCall(MatSetType(B, MATSEQAIJ));
    /* a->rlen has the exact per-row nonzero counts, so preallocation is exact */
    PetscCall(MatSeqAIJSetPreallocation(B, 0, a->rlen));
  }

  for (i = 0; i < A->rmap->n; i++) {
    PetscInt     nz = 0, *cols = NULL;
    PetscScalar *vals = NULL;

    PetscCall(MatGetRow_SeqSELL(A, i, &nz, &cols, &vals));
    PetscCall(MatSetValues(B, 1, &i, nz, cols, vals, INSERT_VALUES));
    PetscCall(MatRestoreRow_SeqSELL(A, i, &nz, &cols, &vals));
  }

  PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
  B->rmap->bs = A->rmap->bs;

  if (reuse == MAT_INPLACE_MATRIX) {
    PetscCall(MatHeaderReplace(A, &B));
  } else {
    *newmat = B;
  }
  PetscFunctionReturn(0);
}

#include <../src/mat/impls/aij/seq/aij.h>

/* Convert a SeqAIJ matrix to SeqSELL.  Row lengths are taken from a->ilen when
   available (and cross-checked in debug builds against the row pointer array). */
PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A, MatType newtype, MatReuse reuse, Mat *newmat) {
  Mat                B;
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
  PetscInt          *ai = a->i, m = A->rmap->N, n = A->cmap->N, i, *rowlengths, row, ncols;
  const PetscInt    *cols;
  const PetscScalar *vals;

  PetscFunctionBegin;

  if (reuse == MAT_REUSE_MATRIX) {
    B = *newmat;
  } else {
    if (PetscDefined(USE_DEBUG) || !a->ilen) {
      /* derive row lengths from the AIJ row-pointer array */
      PetscCall(PetscMalloc1(m, &rowlengths));
      for (i = 0; i < m; i++) { rowlengths[i] = ai[i + 1] - ai[i]; }
    }
    if (PetscDefined(USE_DEBUG) && a->ilen) {
      PetscBool eq;
      PetscCall(PetscMemcmp(rowlengths, a->ilen, m * sizeof(PetscInt), &eq));
      PetscCheck(eq, PETSC_COMM_SELF, PETSC_ERR_PLIB, "SeqAIJ ilen array incorrect");
      PetscCall(PetscFree(rowlengths));
      rowlengths = a->ilen;
    } else if (a->ilen) rowlengths = a->ilen;
    PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
    PetscCall(MatSetSizes(B, m, n, m, n));
    PetscCall(MatSetType(B, MATSEQSELL));
    PetscCall(MatSeqSELLSetPreallocation(B, 0, rowlengths));
    if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths));
  }

  for (row = 0; row < m; row++) {
    PetscCall(MatGetRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
    PetscCall(MatSetValues_SeqSELL(B, 1, &row, ncols, cols, vals, INSERT_VALUES));
    PetscCall(MatRestoreRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
  }
  PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
  B->rmap->bs = A->rmap->bs;

  if (reuse == MAT_INPLACE_MATRIX) {
    PetscCall(MatHeaderReplace(A, &B));
  } else {
    *newmat = B;
  }
  PetscFunctionReturn(0);
}

/* y = A*x for SeqSELL.  Compile-time selection among AVX-512, AVX2+FMA, AVX,
   and a portable scalar kernel; all walk the matrix slice by slice (8 rows). */
PetscErrorCode MatMult_SeqSELL(Mat A, Vec xx, Vec yy) {
  Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
  PetscScalar       *y;
  const PetscScalar *x;
  const MatScalar   *aval        = a->val;
  PetscInt           totalslices = a->totalslices;
  const PetscInt    *acolidx     = a->colidx;
  PetscInt           i, j;
#if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
  __m512d  vec_x, vec_y, vec_vals;
  __m256i  vec_idx;
  __mmask8 mask;
  __m512d  vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
  __m256i  vec_idx2, vec_idx3, vec_idx4;
#elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
  __m128i   vec_idx;
  __m256d   vec_x, vec_y, vec_y2, vec_vals;
  MatScalar yval;
  PetscInt  r, rows_left, row, nnz_in_row;
#elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
  __m128d   vec_x_tmp;
  __m256d   vec_x, vec_y, vec_y2, vec_vals;
  MatScalar yval;
  PetscInt  r, rows_left, row, nnz_in_row;
#else
  PetscScalar sum[8];
#endif

#if defined(PETSC_HAVE_PRAGMA_DISJOINT)
#pragma disjoint(*x, *y, *aval)
#endif

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(yy, &y));
#if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
  for (i = 0; i < totalslices; i++) { /* loop over slices */
    PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
    PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);

    /* four independent accumulators hide FMA latency; summed after the loop */
    vec_y  = _mm512_setzero_pd();
    vec_y2 = _mm512_setzero_pd();
    vec_y3 = _mm512_setzero_pd();
    vec_y4 = _mm512_setzero_pd();

    j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */
    /* peel (width mod 4) slice columns so the main loop can be unrolled by 4 */
    switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
    case 3:
      AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
      acolidx += 8;
      aval += 8;
      AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
      acolidx += 8;
      aval += 8;
      AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
      acolidx += 8;
      aval += 8;
      j += 3;
      break;
    case 2:
      AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
      acolidx += 8;
      aval += 8;
      AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
      acolidx += 8;
      aval += 8;
      j += 2;
      break;
    case 1:
      AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
      acolidx += 8;
      aval += 8;
      j += 1;
      break;
    }
#pragma novector
    for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
      AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
      acolidx += 8;
      aval += 8;
      AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
      acolidx += 8;
      aval += 8;
      AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
      acolidx += 8;
      aval += 8;
      AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
      acolidx += 8;
      aval += 8;
    }

    vec_y = _mm512_add_pd(vec_y, vec_y2);
    vec_y = _mm512_add_pd(vec_y, vec_y3);
    vec_y = _mm512_add_pd(vec_y, vec_y4);
    if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
      mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
      _mm512_mask_storeu_pd(&y[8 * i], mask, vec_y);
    } else {
      _mm512_storeu_pd(&y[8 * i], vec_y);
    }
  }
#elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
  for (i = 0; i < totalslices; i++) { /* loop over full slices */
    PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
    PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);

    /* last slice may have padding rows. Don't use vectorization. */
    if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
      rows_left = A->rmap->n - 8 * i;
      for (r = 0; r < rows_left; ++r) {
        yval       = (MatScalar)0;
        row        = 8 * i + r;
        nnz_in_row = a->rlen[row];
        for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
        y[row] = yval;
      }
      break;
    }

    vec_y  = _mm256_setzero_pd();
    vec_y2 = _mm256_setzero_pd();

    /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
#pragma novector
#pragma unroll(2)
    for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
      AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
      aval += 4;
      acolidx += 4;
      AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y2);
      aval += 4;
      acolidx += 4;
    }

    _mm256_storeu_pd(y + i * 8, vec_y);
    _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
  }
#elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
  for (i = 0; i < totalslices; i++) { /* loop over full slices */
    PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
    PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);

    vec_y  = _mm256_setzero_pd();
    vec_y2 = _mm256_setzero_pd();

    /* last slice may have padding rows. Don't use vectorization. */
    if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
      rows_left = A->rmap->n - 8 * i;
      for (r = 0; r < rows_left; ++r) {
        yval       = (MatScalar)0;
        row        = 8 * i + r;
        nnz_in_row = a->rlen[row];
        for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
        y[row] = yval;
      }
      break;
    }

    /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX.
       No gather instruction in plain AVX: x entries are loaded pairwise into an
       SSE register and inserted into the 256-bit lane. */
#pragma novector
#pragma unroll(2)
    for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
      vec_vals  = _mm256_loadu_pd(aval);
      vec_x_tmp = _mm_setzero_pd();
      vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
      vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
      vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
      vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
      vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
      vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
      vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
      aval += 4;

      vec_vals  = _mm256_loadu_pd(aval);
      vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
      vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
      vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
      vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
      vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
      vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
      vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
      aval += 4;
    }

    _mm256_storeu_pd(y + i * 8, vec_y);
    _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
  }
#else
  /* portable scalar kernel: one partial sum per row of the slice */
  for (i = 0; i < totalslices; i++) { /* loop over slices */
    for (j = 0; j < 8; j++) sum[j] = 0.0;
    for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
      sum[0] += aval[j] * x[acolidx[j]];
      sum[1] += aval[j + 1] * x[acolidx[j + 1]];
      sum[2] += aval[j + 2] * x[acolidx[j + 2]];
      sum[3] += aval[j + 3] * x[acolidx[j + 3]];
      sum[4] += aval[j + 4] * x[acolidx[j + 4]];
      sum[5] += aval[j + 5] * x[acolidx[j + 5]];
      sum[6] += aval[j + 6] * x[acolidx[j + 6]];
      sum[7] += aval[j + 7] * x[acolidx[j + 7]];
    }
    if (i == totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
      for (j = 0; j < (A->rmap->n & 0x07); j++) y[8 * i + j] = sum[j];
    } else {
      for (j = 0; j < 8; j++) y[8 * i + j] = sum[j];
    }
  }
#endif

  PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); /* theoretical minimal FLOPs */
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(yy, &y));
  PetscFunctionReturn(0);
}

#include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h>
/* z = y + A*x for SeqSELL; same kernel structure as MatMult_SeqSELL but the
   first accumulator is seeded with y instead of zero. */
PetscErrorCode MatMultAdd_SeqSELL(Mat A, Vec xx, Vec yy, Vec zz) {
  Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
  PetscScalar       *y, *z;
  const PetscScalar *x;
  const MatScalar   *aval        = a->val;
  PetscInt           totalslices = a->totalslices;
  const PetscInt    *acolidx     = a->colidx;
  PetscInt           i, j;
#if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
  __m512d  vec_x, vec_y, vec_vals;
  __m256i  vec_idx;
  __mmask8 mask;
  __m512d  vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
  __m256i  vec_idx2, vec_idx3, vec_idx4;
#elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
  __m128d   vec_x_tmp;
  __m256d   vec_x, vec_y, vec_y2, vec_vals;
  MatScalar yval;
  PetscInt  r, row, nnz_in_row;
#else
  PetscScalar sum[8];
#endif

#if defined(PETSC_HAVE_PRAGMA_DISJOINT)
#pragma disjoint(*x, *y, *aval)
#endif

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArrayPair(yy, zz, &y, &z));
#if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
  for (i = 0; i < totalslices; i++) { /* loop over slices */
    PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
    PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);

    if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
      mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
      /* NOTE(review): vec_y is used as the masked-load source while still
         uninitialized; the masked-off lanes are never stored, so this appears
         benign — confirm against compiler warnings. */
      vec_y = _mm512_mask_loadu_pd(vec_y, mask, &y[8 * i]);
    } else {
      vec_y = _mm512_loadu_pd(&y[8 * i]);
    }
    vec_y2 = _mm512_setzero_pd();
    vec_y3 = _mm512_setzero_pd();
    vec_y4 = _mm512_setzero_pd();

    j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */
    switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
    case 3:
      AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
      acolidx += 8;
      aval += 8;
      AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
      acolidx += 8;
      aval += 8;
      AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
      acolidx += 8;
      aval += 8;
      j += 3;
      break;
    case 2:
      AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
      acolidx += 8;
      aval += 8;
      AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
      acolidx += 8;
      aval += 8;
      j += 2;
      break;
    case 1:
      AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
      acolidx += 8;
      aval += 8;
      j += 1;
      break;
    }
#pragma novector
    for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
      AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
      acolidx += 8;
      aval += 8;
      AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
      acolidx += 8;
      aval += 8;
      AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
      acolidx += 8;
      aval += 8;
      AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
      acolidx += 8;
      aval += 8;
    }

    vec_y = _mm512_add_pd(vec_y, vec_y2);
    vec_y = _mm512_add_pd(vec_y, vec_y3);
    vec_y = _mm512_add_pd(vec_y, vec_y4);
    if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
      _mm512_mask_storeu_pd(&z[8 * i], mask, vec_y);
    } else {
      _mm512_storeu_pd(&z[8 * i], vec_y);
    }
  }
#elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
  for (i = 0; i < totalslices; i++) { /* loop over full slices */
    PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
    PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);

    /* last slice may have padding rows. Don't use vectorization. */
    if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
      for (r = 0; r < (A->rmap->n & 0x07); ++r) {
        row        = 8 * i + r;
        yval       = (MatScalar)0.0;
        nnz_in_row = a->rlen[row];
        for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
        z[row] = y[row] + yval;
      }
      break;
    }

    vec_y  = _mm256_loadu_pd(y + 8 * i);
    vec_y2 = _mm256_loadu_pd(y + 8 * i + 4);

    /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
    for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
      vec_vals  = _mm256_loadu_pd(aval);
      vec_x_tmp = _mm_setzero_pd();
      vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
      vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
      vec_x     = _mm256_setzero_pd();
      vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
      vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
      vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
      vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
      vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
      aval += 4;

      vec_vals  = _mm256_loadu_pd(aval);
      vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
      vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
      vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
      vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
      vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
      vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
      vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
      aval += 4;
    }

    _mm256_storeu_pd(z + i * 8, vec_y);
    _mm256_storeu_pd(z + i * 8 + 4, vec_y2);
  }
#else
  for (i = 0; i < totalslices; i++) { /* loop over slices */
    for (j = 0; j < 8; j++) sum[j] = 0.0;
    for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
      sum[0] += aval[j] * x[acolidx[j]];
      sum[1] += aval[j + 1] * x[acolidx[j + 1]];
      sum[2] += aval[j + 2] * x[acolidx[j + 2]];
      sum[3] += aval[j + 3] * x[acolidx[j + 3]];
      sum[4] += aval[j + 4] * x[acolidx[j + 4]];
      sum[5] += aval[j + 5] * x[acolidx[j + 5]];
      sum[6] += aval[j + 6] * x[acolidx[j + 6]];
      sum[7] += aval[j + 7] * x[acolidx[j + 7]];
    }
    if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
      for (j = 0; j < (A->rmap->n & 0x07); j++) z[8 * i + j] = y[8 * i + j] + sum[j];
    } else {
      for (j = 0; j < 8; j++) z[8 * i + j] = y[8 * i + j] + sum[j];
    }
  }
#endif

  PetscCall(PetscLogFlops(2.0 * a->nz));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArrayPair(yy, zz, &y, &z));
  PetscFunctionReturn(0);
}

/* yy = zz + A^T * xx.  Scatter-style kernel: for each stored entry, its value
   times the source row's x is added into y at the entry's column index. */
PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A, Vec xx, Vec zz, Vec yy) {
  Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
  PetscScalar       *y;
  const PetscScalar *x;
  const MatScalar   *aval    = a->val;
  const PetscInt    *acolidx = a->colidx;
  PetscInt           i, j, r, row, nnz_in_row, totalslices = a->totalslices;

#if defined(PETSC_HAVE_PRAGMA_DISJOINT)
#pragma disjoint(*x, *y, *aval)
#endif

  PetscFunctionBegin;
  if (A->symmetric == PETSC_BOOL3_TRUE) {
    /* A == A^T: reuse the faster forward kernel */
    PetscCall(MatMultAdd_SeqSELL(A, xx, zz, yy));
    PetscFunctionReturn(0);
  }
  if (zz != yy) PetscCall(VecCopy(zz, yy));
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(yy, &y));
  for (i = 0; i < a->totalslices; i++) { /* loop over slices */
    if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
      /* padded last slice: only iterate over real rows */
      for (r = 0; r < (A->rmap->n & 0x07); ++r) {
        row        = 8 * i + r;
        nnz_in_row = a->rlen[row];
        for (j = 0; j < nnz_in_row; ++j) y[acolidx[8 * j + r]] += aval[8 * j + r] * x[row];
      }
      break;
    }
    for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
      y[acolidx[j]] += aval[j] * x[8 * i];
      y[acolidx[j + 1]] += aval[j + 1] * x[8 * i + 1];
      y[acolidx[j + 2]] += aval[j + 2] * x[8 * i + 2];
      y[acolidx[j + 3]] += aval[j + 3] * x[8 * i + 3];
      y[acolidx[j + 4]] += aval[j + 4] * x[8 * i + 4];
      y[acolidx[j + 5]] += aval[j + 5] * x[8 * i + 5];
      y[acolidx[j + 6]] += aval[j + 6] * x[8 * i + 6];
      y[acolidx[j + 7]] += aval[j + 7] * x[8 * i + 7];
    }
  }
  PetscCall(PetscLogFlops(2.0 * a->sliidx[a->totalslices]));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(yy, &y));
  PetscFunctionReturn(0);
}

/* yy = A^T * xx, implemented as zero + transpose-add */
PetscErrorCode MatMultTranspose_SeqSELL(Mat A, Vec xx, Vec yy) {
  PetscFunctionBegin;
  if (A->symmetric == PETSC_BOOL3_TRUE) {
    PetscCall(MatMult_SeqSELL(A, xx, yy));
  } else {
    PetscCall(VecSet(yy, 0.0));
    PetscCall(MatMultTransposeAdd_SeqSELL(A, xx, yy, yy));
  }
  PetscFunctionReturn(0);
}

/*
     Checks for missing diagonals
*/
PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A, PetscBool *missing, PetscInt *d) {
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
  PetscInt    *diag, i;

  PetscFunctionBegin;
  *missing = PETSC_FALSE;
  if (A->rmap->n > 0 && !(a->colidx)) {
    *missing = PETSC_TRUE;
    if (d) *d = 0;
    PetscCall(PetscInfo(A, "Matrix has no entries therefore is missing diagonal\n"));
  } else {
    diag = a->diag;
    for (i = 0; i < A->rmap->n; i++) {
      if (diag[i] == -1) { /* -1 marks a row with no stored diagonal entry */
        *missing = PETSC_TRUE;
        if (d) *d = i;
        PetscCall(PetscInfo(A, "Matrix is missing diagonal number %" PetscInt_FMT "\n", i));
        break;
      }
    }
  }
  PetscFunctionReturn(0);
}

/* Record in a->diag[] the storage offset of each row's diagonal entry (-1 if absent) */
PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A) {
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
  PetscInt     i, j, m = A->rmap->n, shift;

  PetscFunctionBegin;
  if (!a->diag) {
    PetscCall(PetscMalloc1(m, &a->diag));
    PetscCall(PetscLogObjectMemory((PetscObject)A, m * sizeof(PetscInt)));
    a->free_diag = PETSC_TRUE;
  }
  for (i = 0; i < m; i++) {                   /* loop over rows */
    shift      = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
    a->diag[i] = -1;
    for (j = 0; j < a->rlen[i]; j++) {
      if (a->colidx[shift + j * 8] == i) {
        a->diag[i] = shift + j * 8;
        break;
      }
    }
  }
  PetscFunctionReturn(0);
}

/*
     Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways
*/
/* Cache mdiag[] (diagonal values) and idiag[] (inverted/relaxed diagonal) for
   SOR-type kernels; no-op if the cache is already valid. */
PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A, PetscScalar omega, PetscScalar fshift) {
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
  PetscInt     i, *diag, m = A->rmap->n;
  MatScalar   *val = a->val;
  PetscScalar *idiag, *mdiag;

  PetscFunctionBegin;
  if (a->idiagvalid) PetscFunctionReturn(0);
  PetscCall(MatMarkDiagonal_SeqSELL(A));
  diag = a->diag;
  if (!a->idiag) {
    PetscCall(PetscMalloc3(m, &a->idiag, m, &a->mdiag, m, &a->ssor_work));
    PetscCall(PetscLogObjectMemory((PetscObject)A, 3 * m * sizeof(PetscScalar)));
    val = a->val;
  }
  mdiag = a->mdiag;
  idiag = a->idiag;

  if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) {
    /* fast path: plain reciprocal of the diagonal */
    for (i = 0; i < m; i++) {
      mdiag[i] = val[diag[i]];
      if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */
        if (PetscRealPart(fshift)) {
          PetscCall(PetscInfo(A, "Zero diagonal on row %" PetscInt_FMT "\n", i));
          A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
          A->factorerror_zeropivot_value = 0.0;
          A->factorerror_zeropivot_row   = i;
        } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Zero diagonal on row %" PetscInt_FMT, i);
      }
      idiag[i] = 1.0 / val[diag[i]];
    }
    PetscCall(PetscLogFlops(m));
  } else {
    for (i = 0; i < m; i++) {
      mdiag[i] = val[diag[i]];
      idiag[i] = omega / (fshift + val[diag[i]]);
    }
    PetscCall(PetscLogFlops(2.0 * m));
  }
  a->idiagvalid = PETSC_TRUE;
  PetscFunctionReturn(0);
}

/* Zero all stored values (sparsity pattern is kept); invalidates the diagonal cache */
PetscErrorCode MatZeroEntries_SeqSELL(Mat A) {
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;

  PetscFunctionBegin;
  PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices]));
  PetscCall(MatSeqSELLInvalidateDiagonal(A));
  PetscFunctionReturn(0);
}

/* Release all SeqSELL storage and composed methods */
PetscErrorCode MatDestroy_SeqSELL(Mat A) {
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;

  PetscFunctionBegin;
#if defined(PETSC_USE_LOG)
  PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->n, A->cmap->n, a->nz);
#endif
  PetscCall(MatSeqXSELLFreeSELL(A, &a->val, &a->colidx));
  PetscCall(ISDestroy(&a->row));
  PetscCall(ISDestroy(&a->col));
  PetscCall(PetscFree(a->diag));
  PetscCall(PetscFree(a->rlen));
  PetscCall(PetscFree(a->sliidx));
  PetscCall(PetscFree3(a->idiag, a->mdiag, a->ssor_work));
  PetscCall(PetscFree(a->solve_work));
  PetscCall(ISDestroy(&a->icol));
  PetscCall(PetscFree(a->saved_values));
  PetscCall(PetscFree2(a->getrowcols, a->getrowvals));

  PetscCall(PetscFree(A->data));

  PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetPreallocation_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLRestoreArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqaij_C", NULL));
  PetscFunctionReturn(0);
}

PetscErrorCode MatSetOption_SeqSELL(Mat A, MatOption op, PetscBool flg) {
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;

  PetscFunctionBegin;
  switch (op) {
  case MAT_ROW_ORIENTED: a->roworiented = flg; break;
  case MAT_KEEP_NONZERO_PATTERN: a->keepnonzeropattern = flg; break;
  /* a->nonew encodes the new-nonzero policy: 0 allow, 1 silently ignore,
     -1 error on new location, -2 error on new allocation */
  case MAT_NEW_NONZERO_LOCATIONS: a->nonew = (flg ? 0 : 1); break;
  case MAT_NEW_NONZERO_LOCATION_ERR: a->nonew = (flg ? -1 : 0); break;
  case MAT_NEW_NONZERO_ALLOCATION_ERR: a->nonew = (flg ? -2 : 0); break;
  case MAT_UNUSED_NONZERO_LOCATION_ERR: a->nounused = (flg ? -1 : 0); break;
  case MAT_FORCE_DIAGONAL_ENTRIES:
  case MAT_IGNORE_OFF_PROC_ENTRIES:
  case MAT_USE_HASH_TABLE:
  case MAT_SORTED_FULL: PetscCall(PetscInfo(A, "Option %s ignored\n", MatOptions[op])); break;
  case MAT_SPD:
  case MAT_SYMMETRIC:
  case MAT_STRUCTURALLY_SYMMETRIC:
  case MAT_HERMITIAN:
  case MAT_SYMMETRY_ETERNAL:
  case MAT_STRUCTURAL_SYMMETRY_ETERNAL:
  case MAT_SPD_ETERNAL:
    /* These options are handled directly by MatSetOption() */
    break;
  default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unknown option %d", op);
  }
  PetscFunctionReturn(0);
}

/* Extract the diagonal of A into v.  For factored matrices the stored values
   at diag[] are reciprocals, so 1/val is returned instead. */
PetscErrorCode MatGetDiagonal_SeqSELL(Mat A, Vec v) {
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
  PetscInt     i, j, n, shift;
  PetscScalar *x, zero = 0.0;

  PetscFunctionBegin;
  PetscCall(VecGetLocalSize(v, &n));
  PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");

  if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) {
    PetscInt *diag = a->diag;
    PetscCall(VecGetArray(v, &x));
    for (i = 0; i < n; i++) x[i] = 1.0 / a->val[diag[i]];
    PetscCall(VecRestoreArray(v, &x));
    PetscFunctionReturn(0);
  }

  PetscCall(VecSet(v, zero));
  PetscCall(VecGetArray(v, &x));
  for (i = 0; i < n; i++) {                   /* loop over rows */
    shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
    x[i]  = 0;
    for (j = 0; j < a->rlen[i]; j++) {
      if (a->colidx[shift + j * 8] == i) {
        x[i] = a->val[shift + j * 8];
        break;
      }
    }
  }
  PetscCall(VecRestoreArray(v, &x));
  PetscFunctionReturn(0);
}

/* Scale A as diag(ll) * A * diag(rr); either vector may be NULL */
PetscErrorCode MatDiagonalScale_SeqSELL(Mat A, Vec ll, Vec rr) {
  Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
  const PetscScalar *l, *r;
  PetscInt           i, j, m, n, row;

  PetscFunctionBegin;
  if (ll) {
    /* The local size is used so that VecMPI can be passed to this routine
       by MatDiagonalScale_MPISELL */
PetscCall(VecGetLocalSize(ll, &m)); 945 PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 946 PetscCall(VecGetArrayRead(ll, &l)); 947 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 948 if (i == a->totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */ 949 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) { 950 if (row < (A->rmap->n & 0x07)) a->val[j] *= l[8 * i + row]; 951 } 952 } else { 953 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) { a->val[j] *= l[8 * i + row]; } 954 } 955 } 956 PetscCall(VecRestoreArrayRead(ll, &l)); 957 PetscCall(PetscLogFlops(a->nz)); 958 } 959 if (rr) { 960 PetscCall(VecGetLocalSize(rr, &n)); 961 PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 962 PetscCall(VecGetArrayRead(rr, &r)); 963 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 964 if (i == a->totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */ 965 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) { 966 if (row < (A->rmap->n & 0x07)) a->val[j] *= r[a->colidx[j]]; 967 } 968 } else { 969 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j++) { a->val[j] *= r[a->colidx[j]]; } 970 } 971 } 972 PetscCall(VecRestoreArrayRead(rr, &r)); 973 PetscCall(PetscLogFlops(a->nz)); 974 } 975 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 976 PetscFunctionReturn(0); 977 } 978 979 PetscErrorCode MatGetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[]) { 980 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 981 PetscInt *cp, i, k, low, high, t, row, col, l; 982 PetscInt shift; 983 MatScalar *vp; 984 985 PetscFunctionBegin; 986 for (k = 0; k < m; k++) { /* loop over requested rows */ 987 row = im[k]; 988 if (row < 0) continue; 989 PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, 
PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1); 990 shift = a->sliidx[row >> 3] + (row & 0x07); /* starting index of the row */ 991 cp = a->colidx + shift; /* pointer to the row */ 992 vp = a->val + shift; /* pointer to the row */ 993 for (l = 0; l < n; l++) { /* loop over requested columns */ 994 col = in[l]; 995 if (col < 0) continue; 996 PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1); 997 high = a->rlen[row]; 998 low = 0; /* assume unsorted */ 999 while (high - low > 5) { 1000 t = (low + high) / 2; 1001 if (*(cp + t * 8) > col) high = t; 1002 else low = t; 1003 } 1004 for (i = low; i < high; i++) { 1005 if (*(cp + 8 * i) > col) break; 1006 if (*(cp + 8 * i) == col) { 1007 *v++ = *(vp + 8 * i); 1008 goto finished; 1009 } 1010 } 1011 *v++ = 0.0; 1012 finished:; 1013 } 1014 } 1015 PetscFunctionReturn(0); 1016 } 1017 1018 PetscErrorCode MatView_SeqSELL_ASCII(Mat A, PetscViewer viewer) { 1019 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1020 PetscInt i, j, m = A->rmap->n, shift; 1021 const char *name; 1022 PetscViewerFormat format; 1023 1024 PetscFunctionBegin; 1025 PetscCall(PetscViewerGetFormat(viewer, &format)); 1026 if (format == PETSC_VIEWER_ASCII_MATLAB) { 1027 PetscInt nofinalvalue = 0; 1028 /* 1029 if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) { 1030 nofinalvalue = 1; 1031 } 1032 */ 1033 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1034 PetscCall(PetscViewerASCIIPrintf(viewer, "%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n", m, A->cmap->n)); 1035 PetscCall(PetscViewerASCIIPrintf(viewer, "%% Nonzeros = %" PetscInt_FMT " \n", a->nz)); 1036 #if defined(PETSC_USE_COMPLEX) 1037 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",4);\n", a->nz + nofinalvalue)); 1038 #else 1039 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT 
",3);\n", a->nz + nofinalvalue)); 1040 #endif 1041 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = [\n")); 1042 1043 for (i = 0; i < m; i++) { 1044 shift = a->sliidx[i >> 3] + (i & 0x07); 1045 for (j = 0; j < a->rlen[i]; j++) { 1046 #if defined(PETSC_USE_COMPLEX) 1047 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n", i + 1, a->colidx[shift + 8 * j] + 1, (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j]))); 1048 #else 1049 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n", i + 1, a->colidx[shift + 8 * j] + 1, (double)a->val[shift + 8 * j])); 1050 #endif 1051 } 1052 } 1053 /* 1054 if (nofinalvalue) { 1055 #if defined(PETSC_USE_COMPLEX) 1056 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n",m,A->cmap->n,0.,0.)); 1057 #else 1058 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n",m,A->cmap->n,0.0)); 1059 #endif 1060 } 1061 */ 1062 PetscCall(PetscObjectGetName((PetscObject)A, &name)); 1063 PetscCall(PetscViewerASCIIPrintf(viewer, "];\n %s = spconvert(zzz);\n", name)); 1064 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1065 } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) { 1066 PetscFunctionReturn(0); 1067 } else if (format == PETSC_VIEWER_ASCII_COMMON) { 1068 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1069 for (i = 0; i < m; i++) { 1070 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1071 shift = a->sliidx[i >> 3] + (i & 0x07); 1072 for (j = 0; j < a->rlen[i]; j++) { 1073 #if defined(PETSC_USE_COMPLEX) 1074 if (PetscImaginaryPart(a->val[shift + 8 * j]) > 0.0 && PetscRealPart(a->val[shift + 8 * j]) != 0.0) { 1075 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), 
(double)PetscImaginaryPart(a->val[shift + 8 * j]))); 1076 } else if (PetscImaginaryPart(a->val[shift + 8 * j]) < 0.0 && PetscRealPart(a->val[shift + 8 * j]) != 0.0) { 1077 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)-PetscImaginaryPart(a->val[shift + 8 * j]))); 1078 } else if (PetscRealPart(a->val[shift + 8 * j]) != 0.0) { 1079 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]))); 1080 } 1081 #else 1082 if (a->val[shift + 8 * j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)a->val[shift + 8 * j])); 1083 #endif 1084 } 1085 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1086 } 1087 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1088 } else if (format == PETSC_VIEWER_ASCII_DENSE) { 1089 PetscInt cnt = 0, jcnt; 1090 PetscScalar value; 1091 #if defined(PETSC_USE_COMPLEX) 1092 PetscBool realonly = PETSC_TRUE; 1093 for (i = 0; i < a->sliidx[a->totalslices]; i++) { 1094 if (PetscImaginaryPart(a->val[i]) != 0.0) { 1095 realonly = PETSC_FALSE; 1096 break; 1097 } 1098 } 1099 #endif 1100 1101 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1102 for (i = 0; i < m; i++) { 1103 jcnt = 0; 1104 shift = a->sliidx[i >> 3] + (i & 0x07); 1105 for (j = 0; j < A->cmap->n; j++) { 1106 if (jcnt < a->rlen[i] && j == a->colidx[shift + 8 * j]) { 1107 value = a->val[cnt++]; 1108 jcnt++; 1109 } else { 1110 value = 0.0; 1111 } 1112 #if defined(PETSC_USE_COMPLEX) 1113 if (realonly) { 1114 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)PetscRealPart(value))); 1115 } else { 1116 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e+%7.5e i ", (double)PetscRealPart(value), (double)PetscImaginaryPart(value))); 1117 } 1118 #else 1119 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)value)); 1120 
#endif 1121 } 1122 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1123 } 1124 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1125 } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) { 1126 PetscInt fshift = 1; 1127 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1128 #if defined(PETSC_USE_COMPLEX) 1129 PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate complex general\n")); 1130 #else 1131 PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate real general\n")); 1132 #endif 1133 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz)); 1134 for (i = 0; i < m; i++) { 1135 shift = a->sliidx[i >> 3] + (i & 0x07); 1136 for (j = 0; j < a->rlen[i]; j++) { 1137 #if defined(PETSC_USE_COMPLEX) 1138 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n", i + fshift, a->colidx[shift + 8 * j] + fshift, (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j]))); 1139 #else 1140 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g\n", i + fshift, a->colidx[shift + 8 * j] + fshift, (double)a->val[shift + 8 * j])); 1141 #endif 1142 } 1143 } 1144 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1145 } else if (format == PETSC_VIEWER_NATIVE) { 1146 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 1147 PetscInt row; 1148 PetscCall(PetscViewerASCIIPrintf(viewer, "slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n", i, a->sliidx[i], a->sliidx[i + 1])); 1149 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) { 1150 #if defined(PETSC_USE_COMPLEX) 1151 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1152 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 
1153 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1154 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), -(double)PetscImaginaryPart(a->val[j]))); 1155 } else { 1156 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]))); 1157 } 1158 #else 1159 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", 8 * i + row, a->colidx[j], (double)a->val[j])); 1160 #endif 1161 } 1162 } 1163 } else { 1164 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1165 if (A->factortype) { 1166 for (i = 0; i < m; i++) { 1167 shift = a->sliidx[i >> 3] + (i & 0x07); 1168 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1169 /* L part */ 1170 for (j = shift; j < a->diag[i]; j += 8) { 1171 #if defined(PETSC_USE_COMPLEX) 1172 if (PetscImaginaryPart(a->val[shift + 8 * j]) > 0.0) { 1173 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1174 } else if (PetscImaginaryPart(a->val[shift + 8 * j]) < 0.0) { 1175 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j])))); 1176 } else { 1177 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j]))); 1178 } 1179 #else 1180 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j])); 1181 #endif 1182 } 1183 /* diagonal */ 1184 j = a->diag[i]; 1185 #if defined(PETSC_USE_COMPLEX) 1186 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1187 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), 
(double)PetscImaginaryPart(1.0 / a->val[j]))); 1188 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1189 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)(-PetscImaginaryPart(1.0 / a->val[j])))); 1190 } else { 1191 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]))); 1192 } 1193 #else 1194 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)(1.0 / a->val[j]))); 1195 #endif 1196 1197 /* U part */ 1198 for (j = a->diag[i] + 1; j < shift + 8 * a->rlen[i]; j += 8) { 1199 #if defined(PETSC_USE_COMPLEX) 1200 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1201 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1202 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1203 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j])))); 1204 } else { 1205 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j]))); 1206 } 1207 #else 1208 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j])); 1209 #endif 1210 } 1211 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1212 } 1213 } else { 1214 for (i = 0; i < m; i++) { 1215 shift = a->sliidx[i >> 3] + (i & 0x07); 1216 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1217 for (j = 0; j < a->rlen[i]; j++) { 1218 #if defined(PETSC_USE_COMPLEX) 1219 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1220 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j]))); 
1221 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1222 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)-PetscImaginaryPart(a->val[shift + 8 * j]))); 1223 } else { 1224 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]))); 1225 } 1226 #else 1227 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)a->val[shift + 8 * j])); 1228 #endif 1229 } 1230 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1231 } 1232 } 1233 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1234 } 1235 PetscCall(PetscViewerFlush(viewer)); 1236 PetscFunctionReturn(0); 1237 } 1238 1239 #include <petscdraw.h> 1240 PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw, void *Aa) { 1241 Mat A = (Mat)Aa; 1242 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1243 PetscInt i, j, m = A->rmap->n, shift; 1244 int color; 1245 PetscReal xl, yl, xr, yr, x_l, x_r, y_l, y_r; 1246 PetscViewer viewer; 1247 PetscViewerFormat format; 1248 1249 PetscFunctionBegin; 1250 PetscCall(PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer)); 1251 PetscCall(PetscViewerGetFormat(viewer, &format)); 1252 PetscCall(PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr)); 1253 1254 /* loop over matrix elements drawing boxes */ 1255 1256 if (format != PETSC_VIEWER_DRAW_CONTOUR) { 1257 PetscDrawCollectiveBegin(draw); 1258 /* Blue for negative, Cyan for zero and Red for positive */ 1259 color = PETSC_DRAW_BLUE; 1260 for (i = 0; i < m; i++) { 1261 shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */ 1262 y_l = m - i - 1.0; 1263 y_r = y_l + 1.0; 1264 for (j = 0; j < a->rlen[i]; j++) { 1265 x_l = a->colidx[shift + j * 8]; 1266 x_r = x_l + 1.0; 1267 if (PetscRealPart(a->val[shift + 8 * j]) >= 0.) 
continue; 1268 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1269 } 1270 } 1271 color = PETSC_DRAW_CYAN; 1272 for (i = 0; i < m; i++) { 1273 shift = a->sliidx[i >> 3] + (i & 0x07); 1274 y_l = m - i - 1.0; 1275 y_r = y_l + 1.0; 1276 for (j = 0; j < a->rlen[i]; j++) { 1277 x_l = a->colidx[shift + j * 8]; 1278 x_r = x_l + 1.0; 1279 if (a->val[shift + 8 * j] != 0.) continue; 1280 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1281 } 1282 } 1283 color = PETSC_DRAW_RED; 1284 for (i = 0; i < m; i++) { 1285 shift = a->sliidx[i >> 3] + (i & 0x07); 1286 y_l = m - i - 1.0; 1287 y_r = y_l + 1.0; 1288 for (j = 0; j < a->rlen[i]; j++) { 1289 x_l = a->colidx[shift + j * 8]; 1290 x_r = x_l + 1.0; 1291 if (PetscRealPart(a->val[shift + 8 * j]) <= 0.) continue; 1292 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1293 } 1294 } 1295 PetscDrawCollectiveEnd(draw); 1296 } else { 1297 /* use contour shading to indicate magnitude of values */ 1298 /* first determine max of all nonzero values */ 1299 PetscReal minv = 0.0, maxv = 0.0; 1300 PetscInt count = 0; 1301 PetscDraw popup; 1302 for (i = 0; i < a->sliidx[a->totalslices]; i++) { 1303 if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]); 1304 } 1305 if (minv >= maxv) maxv = minv + PETSC_SMALL; 1306 PetscCall(PetscDrawGetPopup(draw, &popup)); 1307 PetscCall(PetscDrawScalePopup(popup, minv, maxv)); 1308 1309 PetscDrawCollectiveBegin(draw); 1310 for (i = 0; i < m; i++) { 1311 shift = a->sliidx[i >> 3] + (i & 0x07); 1312 y_l = m - i - 1.0; 1313 y_r = y_l + 1.0; 1314 for (j = 0; j < a->rlen[i]; j++) { 1315 x_l = a->colidx[shift + j * 8]; 1316 x_r = x_l + 1.0; 1317 color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]), minv, maxv); 1318 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1319 count++; 1320 } 1321 } 1322 PetscDrawCollectiveEnd(draw); 1323 } 1324 
PetscFunctionReturn(0); 1325 } 1326 1327 #include <petscdraw.h> 1328 PetscErrorCode MatView_SeqSELL_Draw(Mat A, PetscViewer viewer) { 1329 PetscDraw draw; 1330 PetscReal xr, yr, xl, yl, h, w; 1331 PetscBool isnull; 1332 1333 PetscFunctionBegin; 1334 PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw)); 1335 PetscCall(PetscDrawIsNull(draw, &isnull)); 1336 if (isnull) PetscFunctionReturn(0); 1337 1338 xr = A->cmap->n; 1339 yr = A->rmap->n; 1340 h = yr / 10.0; 1341 w = xr / 10.0; 1342 xr += w; 1343 yr += h; 1344 xl = -w; 1345 yl = -h; 1346 PetscCall(PetscDrawSetCoordinates(draw, xl, yl, xr, yr)); 1347 PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer)); 1348 PetscCall(PetscDrawZoom(draw, MatView_SeqSELL_Draw_Zoom, A)); 1349 PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL)); 1350 PetscCall(PetscDrawSave(draw)); 1351 PetscFunctionReturn(0); 1352 } 1353 1354 PetscErrorCode MatView_SeqSELL(Mat A, PetscViewer viewer) { 1355 PetscBool iascii, isbinary, isdraw; 1356 1357 PetscFunctionBegin; 1358 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii)); 1359 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary)); 1360 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw)); 1361 if (iascii) { 1362 PetscCall(MatView_SeqSELL_ASCII(A, viewer)); 1363 } else if (isbinary) { 1364 /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */ 1365 } else if (isdraw) PetscCall(MatView_SeqSELL_Draw(A, viewer)); 1366 PetscFunctionReturn(0); 1367 } 1368 1369 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A, MatAssemblyType mode) { 1370 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1371 PetscInt i, shift, row_in_slice, row, nrow, *cp, lastcol, j, k; 1372 MatScalar *vp; 1373 1374 PetscFunctionBegin; 1375 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 1376 /* To do: compress out the unused elements */ 1377 PetscCall(MatMarkDiagonal_SeqSELL(A)); 1378 PetscCall(PetscInfo(A, "Matrix 
size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n", A->rmap->n, A->cmap->n, a->maxallocmat, a->sliidx[a->totalslices], a->nz, a->sliidx[a->totalslices] - a->nz)); 1379 PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n", a->reallocs)); 1380 PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rlenmax)); 1381 /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */ 1382 for (i = 0; i < a->totalslices; ++i) { 1383 shift = a->sliidx[i]; /* starting index of the slice */ 1384 cp = a->colidx + shift; /* pointer to the column indices of the slice */ 1385 vp = a->val + shift; /* pointer to the nonzero values of the slice */ 1386 for (row_in_slice = 0; row_in_slice < 8; ++row_in_slice) { /* loop over rows in the slice */ 1387 row = 8 * i + row_in_slice; 1388 nrow = a->rlen[row]; /* number of nonzeros in row */ 1389 /* 1390 Search for the nearest nonzero. Normally setting the index to zero may cause extra communication. 1391 But if the entire slice are empty, it is fine to use 0 since the index will not be loaded. 
1392 */ 1393 lastcol = 0; 1394 if (nrow > 0) { /* nonempty row */ 1395 lastcol = cp[8 * (nrow - 1) + row_in_slice]; /* use the index from the last nonzero at current row */ 1396 } else if (!row_in_slice) { /* first row of the currect slice is empty */ 1397 for (j = 1; j < 8; j++) { 1398 if (a->rlen[8 * i + j]) { 1399 lastcol = cp[j]; 1400 break; 1401 } 1402 } 1403 } else { 1404 if (a->sliidx[i + 1] != shift) lastcol = cp[row_in_slice - 1]; /* use the index from the previous row */ 1405 } 1406 1407 for (k = nrow; k < (a->sliidx[i + 1] - shift) / 8; ++k) { 1408 cp[8 * k + row_in_slice] = lastcol; 1409 vp[8 * k + row_in_slice] = (MatScalar)0; 1410 } 1411 } 1412 } 1413 1414 A->info.mallocs += a->reallocs; 1415 a->reallocs = 0; 1416 1417 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 1418 PetscFunctionReturn(0); 1419 } 1420 1421 PetscErrorCode MatGetInfo_SeqSELL(Mat A, MatInfoType flag, MatInfo *info) { 1422 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1423 1424 PetscFunctionBegin; 1425 info->block_size = 1.0; 1426 info->nz_allocated = a->maxallocmat; 1427 info->nz_used = a->sliidx[a->totalslices]; /* include padding zeros */ 1428 info->nz_unneeded = (a->maxallocmat - a->sliidx[a->totalslices]); 1429 info->assemblies = A->num_ass; 1430 info->mallocs = A->info.mallocs; 1431 info->memory = ((PetscObject)A)->mem; 1432 if (A->factortype) { 1433 info->fill_ratio_given = A->info.fill_ratio_given; 1434 info->fill_ratio_needed = A->info.fill_ratio_needed; 1435 info->factor_mallocs = A->info.factor_mallocs; 1436 } else { 1437 info->fill_ratio_given = 0; 1438 info->fill_ratio_needed = 0; 1439 info->factor_mallocs = 0; 1440 } 1441 PetscFunctionReturn(0); 1442 } 1443 1444 PetscErrorCode MatSetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is) { 1445 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1446 PetscInt shift, i, k, l, low, high, t, ii, row, col, nrow; 1447 PetscInt *cp, nonew = a->nonew, lastcol = -1; 1448 
MatScalar *vp, value; 1449 1450 PetscFunctionBegin; 1451 for (k = 0; k < m; k++) { /* loop over added rows */ 1452 row = im[k]; 1453 if (row < 0) continue; 1454 PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1); 1455 shift = a->sliidx[row >> 3] + (row & 0x07); /* starting index of the row */ 1456 cp = a->colidx + shift; /* pointer to the row */ 1457 vp = a->val + shift; /* pointer to the row */ 1458 nrow = a->rlen[row]; 1459 low = 0; 1460 high = nrow; 1461 1462 for (l = 0; l < n; l++) { /* loop over added columns */ 1463 col = in[l]; 1464 if (col < 0) continue; 1465 PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1); 1466 if (a->roworiented) { 1467 value = v[l + k * n]; 1468 } else { 1469 value = v[k + l * m]; 1470 } 1471 if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue; 1472 1473 /* search in this row for the specified column, i indicates the column to be set */ 1474 if (col <= lastcol) low = 0; 1475 else high = nrow; 1476 lastcol = col; 1477 while (high - low > 5) { 1478 t = (low + high) / 2; 1479 if (*(cp + t * 8) > col) high = t; 1480 else low = t; 1481 } 1482 for (i = low; i < high; i++) { 1483 if (*(cp + i * 8) > col) break; 1484 if (*(cp + i * 8) == col) { 1485 if (is == ADD_VALUES) *(vp + i * 8) += value; 1486 else *(vp + i * 8) = value; 1487 low = i + 1; 1488 goto noinsert; 1489 } 1490 } 1491 if (value == 0.0 && a->ignorezeroentries) goto noinsert; 1492 if (nonew == 1) goto noinsert; 1493 PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col); 1494 /* If the current row length exceeds the slice width (e.g. 
nrow==slice_width), allocate a new space, otherwise do nothing */ 1495 MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, row / 8, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar); 1496 /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */ 1497 for (ii = nrow - 1; ii >= i; ii--) { 1498 *(cp + (ii + 1) * 8) = *(cp + ii * 8); 1499 *(vp + (ii + 1) * 8) = *(vp + ii * 8); 1500 } 1501 a->rlen[row]++; 1502 *(cp + i * 8) = col; 1503 *(vp + i * 8) = value; 1504 a->nz++; 1505 A->nonzerostate++; 1506 low = i + 1; 1507 high++; 1508 nrow++; 1509 noinsert:; 1510 } 1511 a->rlen[row] = nrow; 1512 } 1513 PetscFunctionReturn(0); 1514 } 1515 1516 PetscErrorCode MatCopy_SeqSELL(Mat A, Mat B, MatStructure str) { 1517 PetscFunctionBegin; 1518 /* If the two matrices have the same copy implementation, use fast copy. */ 1519 if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) { 1520 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1521 Mat_SeqSELL *b = (Mat_SeqSELL *)B->data; 1522 1523 PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices], PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Number of nonzeros in two matrices are different"); 1524 PetscCall(PetscArraycpy(b->val, a->val, a->sliidx[a->totalslices])); 1525 } else { 1526 PetscCall(MatCopy_Basic(A, B, str)); 1527 } 1528 PetscFunctionReturn(0); 1529 } 1530 1531 PetscErrorCode MatSetUp_SeqSELL(Mat A) { 1532 PetscFunctionBegin; 1533 PetscCall(MatSeqSELLSetPreallocation(A, PETSC_DEFAULT, NULL)); 1534 PetscFunctionReturn(0); 1535 } 1536 1537 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A, PetscScalar *array[]) { 1538 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1539 1540 PetscFunctionBegin; 1541 *array = a->val; 1542 PetscFunctionReturn(0); 1543 } 1544 1545 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A, PetscScalar *array[]) { 1546 PetscFunctionBegin; 1547 PetscFunctionReturn(0); 1548 } 1549 1550 PetscErrorCode MatRealPart_SeqSELL(Mat A) { 1551 
Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1552 PetscInt i; 1553 MatScalar *aval = a->val; 1554 1555 PetscFunctionBegin; 1556 for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscRealPart(aval[i]); 1557 PetscFunctionReturn(0); 1558 } 1559 1560 PetscErrorCode MatImaginaryPart_SeqSELL(Mat A) { 1561 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1562 PetscInt i; 1563 MatScalar *aval = a->val; 1564 1565 PetscFunctionBegin; 1566 for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]); 1567 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 1568 PetscFunctionReturn(0); 1569 } 1570 1571 PetscErrorCode MatScale_SeqSELL(Mat inA, PetscScalar alpha) { 1572 Mat_SeqSELL *a = (Mat_SeqSELL *)inA->data; 1573 MatScalar *aval = a->val; 1574 PetscScalar oalpha = alpha; 1575 PetscBLASInt one = 1, size; 1576 1577 PetscFunctionBegin; 1578 PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices], &size)); 1579 PetscCallBLAS("BLASscal", BLASscal_(&size, &oalpha, aval, &one)); 1580 PetscCall(PetscLogFlops(a->nz)); 1581 PetscCall(MatSeqSELLInvalidateDiagonal(inA)); 1582 PetscFunctionReturn(0); 1583 } 1584 1585 PetscErrorCode MatShift_SeqSELL(Mat Y, PetscScalar a) { 1586 Mat_SeqSELL *y = (Mat_SeqSELL *)Y->data; 1587 1588 PetscFunctionBegin; 1589 if (!Y->preallocated || !y->nz) { PetscCall(MatSeqSELLSetPreallocation(Y, 1, NULL)); } 1590 PetscCall(MatShift_Basic(Y, a)); 1591 PetscFunctionReturn(0); 1592 } 1593 1594 PetscErrorCode MatSOR_SeqSELL(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx) { 1595 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1596 PetscScalar *x, sum, *t; 1597 const MatScalar *idiag = NULL, *mdiag; 1598 const PetscScalar *b, *xb; 1599 PetscInt n, m = A->rmap->n, i, j, shift; 1600 const PetscInt *diag; 1601 1602 PetscFunctionBegin; 1603 its = its * lits; 1604 1605 if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */ 1606 if (!a->idiagvalid) 
PetscCall(MatInvertDiagonal_SeqSELL(A, omega, fshift));
  a->fshift = fshift;
  a->omega  = omega;

  diag  = a->diag;      /* diag[i]: storage position of row i's diagonal entry */
  t     = a->ssor_work; /* scratch holding the lower-triangular application between sweeps */
  idiag = a->idiag;     /* inverted (omega- and fshift-adjusted) diagonal */
  mdiag = a->mdiag;     /* copy of the true matrix diagonal */

  PetscCall(VecGetArray(xx, &x));
  PetscCall(VecGetArrayRead(bb, &b));
  /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
  PetscCheck(flag != SOR_APPLY_UPPER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_UPPER is not implemented");
  PetscCheck(flag != SOR_APPLY_LOWER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_LOWER is not implemented");
  PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat");

  if (flag & SOR_ZERO_INITIAL_GUESS) {
    if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
      for (i = 0; i < m; i++) {
        shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
        sum   = b[i];
        n     = (diag[i] - shift) / 8; /* number of entries strictly below the diagonal */
        for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]];
        t[i] = sum; /* save application of the lower-triangular part for a following backward sweep */
        x[i] = sum * idiag[i];
      }
      xb = t;
      PetscCall(PetscLogFlops(a->nz));
    } else xb = b;
    if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
      for (i = m - 1; i >= 0; i--) {
        shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
        sum   = xb[i];
        n     = a->rlen[i] - (diag[i] - shift) / 8 - 1; /* number of entries strictly above the diagonal */
        for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]];
        if (xb == b) {
          x[i] = sum * idiag[i];
        } else {
          x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
        }
      }
      PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
    }
    its--;
  }
  while (its--) {
    if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
      for (i = 0; i < m; i++) {
        /* lower */
        shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
        sum   = b[i];
        n     = (diag[i] - shift) / 8;
        for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]];
        t[i] = sum; /* save application of the lower-triangular part */
        /* upper */
        n = a->rlen[i] - (diag[i] - shift) / 8 - 1;
        for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]];
        x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
      }
      xb = t;
      PetscCall(PetscLogFlops(2.0 * a->nz));
    } else xb = b;
    if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
      for (i = m - 1; i >= 0; i--) {
        shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
        sum   = xb[i];
        if (xb == b) {
          /* whole matrix (no checkpointing available) */
          n = a->rlen[i];
          for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]];
          /* sum included the diagonal term, so add it back before applying the inverted diagonal */
          x[i] = (1. - omega) * x[i] + (sum + mdiag[i] * x[i]) * idiag[i];
        } else { /* lower-triangular part has been saved, so only apply upper-triangular */
          n = a->rlen[i] - (diag[i] - shift) / 8 - 1;
          for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]];
          x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
        }
      }
      if (xb == b) {
        PetscCall(PetscLogFlops(2.0 * a->nz));
      } else {
        PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
      }
    }
  }
  PetscCall(VecRestoreArray(xx, &x));
  PetscCall(VecRestoreArrayRead(bb, &b));
  PetscFunctionReturn(0);
}

/* -------------------------------------------------------------------*/
/* Function table for MATSEQSELL; the slot-number comments follow the _MatOps struct layout */
static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL,
                                       MatGetRow_SeqSELL,
                                       MatRestoreRow_SeqSELL,
                                       MatMult_SeqSELL,
                                       /* 4*/ MatMultAdd_SeqSELL,
                                       MatMultTranspose_SeqSELL,
                                       MatMultTransposeAdd_SeqSELL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 10*/ NULL,
                                       NULL,
                                       NULL,
                                       MatSOR_SeqSELL,
                                       NULL,
                                       /* 15*/ MatGetInfo_SeqSELL,
                                       MatEqual_SeqSELL,
                                       MatGetDiagonal_SeqSELL,
                                       MatDiagonalScale_SeqSELL,
                                       NULL,
                                       /* 20*/ NULL,
                                       MatAssemblyEnd_SeqSELL,
                                       MatSetOption_SeqSELL,
                                       MatZeroEntries_SeqSELL,
                                       /* 24*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 29*/ MatSetUp_SeqSELL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 34*/ MatDuplicate_SeqSELL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 39*/ NULL,
                                       NULL,
                                       NULL,
                                       MatGetValues_SeqSELL,
                                       MatCopy_SeqSELL,
                                       /* 44*/ NULL,
                                       MatScale_SeqSELL,
                                       MatShift_SeqSELL,
                                       NULL,
                                       NULL,
                                       /* 49*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 54*/ MatFDColoringCreate_SeqXAIJ,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 59*/ NULL,
                                       MatDestroy_SeqSELL,
                                       MatView_SeqSELL,
                                       NULL,
                                       NULL,
                                       /* 64*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 69*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 74*/ NULL,
                                       MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 79*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 84*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 89*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 94*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 99*/ NULL,
                                       NULL,
                                       NULL,
                                       MatConjugate_SeqSELL,
                                       NULL,
                                       /*104*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /*109*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       MatMissingDiagonal_SeqSELL,
                                       /*114*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /*119*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /*124*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /*129*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /*134*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /*139*/ NULL,
                                       NULL,
                                       NULL,
                                       MatFDColoringSetUp_SeqXAIJ,
                                       NULL,
                                       /*144*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /*150*/ NULL};

/* Saves a copy of the current values so they can later be restored with MatRetrieveValues_SeqSELL();
   requires MAT_NEW_NONZERO_LOCATIONS to be disabled so the pattern cannot change in between */
PetscErrorCode MatStoreValues_SeqSELL(Mat mat) {
  Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;

  PetscFunctionBegin;
  PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");

  /* allocate space for values if not already there */
  if (!a->saved_values) {
    PetscCall(PetscMalloc1(a->sliidx[a->totalslices] + 1, &a->saved_values));
    PetscCall(PetscLogObjectMemory((PetscObject)mat, (a->sliidx[a->totalslices] + 1) * sizeof(PetscScalar)));
  }

  /* copy values over */
  PetscCall(PetscArraycpy(a->saved_values, a->val, a->sliidx[a->totalslices]));
  PetscFunctionReturn(0);
}

/* Restores the matrix values previously saved with MatStoreValues_SeqSELL() */
PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat) {
  Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;

  PetscFunctionBegin;
  PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
  PetscCheck(a->saved_values, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatStoreValues(A);first");
  PetscCall(PetscArraycpy(a->val,
a->saved_values, a->sliidx[a->totalslices]));
  PetscFunctionReturn(0);
}

/*@C
   MatSeqSELLRestoreArray - returns access to the array where the data for a MATSEQSELL matrix is stored obtained by MatSeqSELLGetArray()

   Not Collective

   Input Parameters:
+  mat - a MATSEQSELL matrix
-  array - pointer to the data

   Level: intermediate

.seealso: `MatSeqSELLGetArray()`, `MatSeqSELLRestoreArrayF90()`
@*/
PetscErrorCode MatSeqSELLRestoreArray(Mat A, PetscScalar **array) {
  PetscFunctionBegin;
  PetscUseMethod(A, "MatSeqSELLRestoreArray_C", (Mat, PetscScalar **), (A, array));
  PetscFunctionReturn(0);
}

/* Type constructor for MATSEQSELL: installs the function table, initializes the
   implementation struct, and composes the type-specific methods looked up by name */
PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B) {
  Mat_SeqSELL *b;
  PetscMPIInt  size;

  PetscFunctionBegin;
  PetscCall(PetscCitationsRegister(citation, &cited));
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
  /* the sequential format only exists on single-process communicators */
  PetscCheck(size <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Comm must be of size 1");

  PetscCall(PetscNewLog(B, &b));

  B->data = (void *)b;

  PetscCall(PetscMemcpy(B->ops, &MatOps_Values, sizeof(struct _MatOps)));

  b->row                = NULL;
  b->col                = NULL;
  b->icol               = NULL;
  b->reallocs           = 0;
  b->ignorezeroentries  = PETSC_FALSE;
  b->roworiented        = PETSC_TRUE;
  b->nonew              = 0;
  b->diag               = NULL;
  b->solve_work         = NULL;
  B->spptr              = NULL;
  b->saved_values       = NULL;
  b->idiag              = NULL;
  b->mdiag              = NULL;
  b->ssor_work          = NULL;
  b->omega              = 1.0;
  b->fshift             = 0.0;
  b->idiagvalid         = PETSC_FALSE;
  b->keepnonzeropattern = PETSC_FALSE;

  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELL));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetArray_C", MatSeqSELLGetArray_SeqSELL));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLRestoreArray_C", MatSeqSELLRestoreArray_SeqSELL));
PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqSELL));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqSELL));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetPreallocation_C", MatSeqSELLSetPreallocation_SeqSELL));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqaij_C", MatConvert_SeqSELL_SeqAIJ));
  PetscFunctionReturn(0);
}

/*
   Given a matrix generated with MatGetFactor() duplicates all the information in A into B
*/
PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace) {
  Mat_SeqSELL *c = (Mat_SeqSELL *)C->data, *a = (Mat_SeqSELL *)A->data;
  PetscInt     i, m = A->rmap->n;
  PetscInt     totalslices = a->totalslices;

  PetscFunctionBegin;
  C->factortype = A->factortype;
  c->row        = NULL;
  c->col        = NULL;
  c->icol       = NULL;
  c->reallocs   = 0;
  C->assembled  = PETSC_TRUE;

  PetscCall(PetscLayoutReference(A->rmap, &C->rmap));
  PetscCall(PetscLayoutReference(A->cmap, &C->cmap));

  /* rlen gets 8*totalslices slots (slices are padded to 8 rows) although only the first m are copied below;
     NOTE(review): the memory log uses m*sizeof(PetscInt), not the padded size — presumably intentional, verify */
  PetscCall(PetscMalloc1(8 * totalslices, &c->rlen));
  PetscCall(PetscLogObjectMemory((PetscObject)C, m * sizeof(PetscInt)));
  PetscCall(PetscMalloc1(totalslices + 1, &c->sliidx));
  PetscCall(PetscLogObjectMemory((PetscObject)C, (totalslices + 1) * sizeof(PetscInt)));

  for (i = 0; i < m; i++) c->rlen[i] = a->rlen[i];
  for (i = 0; i < totalslices + 1; i++) c->sliidx[i] = a->sliidx[i];

  /* allocate the matrix space */
  if (mallocmatspace) {
    /* single allocation for both values and column indices */
    PetscCall(PetscMalloc2(a->maxallocmat, &c->val, a->maxallocmat, &c->colidx));
    PetscCall(PetscLogObjectMemory((PetscObject)C, a->maxallocmat * (sizeof(PetscScalar) + sizeof(PetscInt))));

    c->singlemalloc = PETSC_TRUE;

    if (m > 0) {
      PetscCall(PetscArraycpy(c->colidx, a->colidx, a->maxallocmat));
if (cpvalues == MAT_COPY_VALUES) {
        PetscCall(PetscArraycpy(c->val, a->val, a->maxallocmat));
      } else {
        /* pattern only: start from zero values */
        PetscCall(PetscArrayzero(c->val, a->maxallocmat));
      }
    }
  }

  c->ignorezeroentries = a->ignorezeroentries;
  c->roworiented       = a->roworiented;
  c->nonew             = a->nonew;
  if (a->diag) {
    PetscCall(PetscMalloc1(m, &c->diag));
    PetscCall(PetscLogObjectMemory((PetscObject)C, m * sizeof(PetscInt)));
    for (i = 0; i < m; i++) { c->diag[i] = a->diag[i]; }
  } else c->diag = NULL;

  /* cached/scratch data is not duplicated; it is rebuilt on demand */
  c->solve_work         = NULL;
  c->saved_values       = NULL;
  c->idiag              = NULL;
  c->ssor_work          = NULL;
  c->keepnonzeropattern = a->keepnonzeropattern;
  c->free_val           = PETSC_TRUE;
  c->free_colidx        = PETSC_TRUE;

  c->maxallocmat  = a->maxallocmat;
  c->maxallocrow  = a->maxallocrow;
  c->rlenmax      = a->rlenmax;
  c->nz           = a->nz;
  C->preallocated = PETSC_TRUE;

  c->nonzerorowcnt = a->nonzerorowcnt;
  C->nonzerostate  = A->nonzerostate;

  PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist));
  PetscFunctionReturn(0);
}

/* Duplicates A (optionally copying values) by creating a fresh matrix of the same type and sizes */
PetscErrorCode MatDuplicate_SeqSELL(Mat A, MatDuplicateOption cpvalues, Mat *B) {
  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  PetscCall(MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n));
  /* only propagate block sizes when the dimensions are divisible by them */
  if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) { PetscCall(MatSetBlockSizesFromMats(*B, A, A)); }
  PetscCall(MatSetType(*B, ((PetscObject)A)->type_name));
  PetscCall(MatDuplicateNoCreate_SeqSELL(*B, A, cpvalues, PETSC_TRUE));
  PetscFunctionReturn(0);
}

/*MC
   MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices,
   based on the sliced Ellpack format

   Options Database Keys:
.
-mat_type seqsell - sets the matrix type to "seqsell" during a call to MatSetFromOptions()

   Level: beginner

.seealso: `MatCreateSeqSELL()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ`
M*/

/*MC
   MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices.

   This matrix type is identical to MATSEQSELL when constructed with a single process communicator,
   and MATMPISELL otherwise.  As a result, for single process communicators,
   MatSeqSELLSetPreallocation() is supported, and similarly MatMPISELLSetPreallocation() is supported
   for communicators controlling multiple processes.  It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
. -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions()

   Level: beginner

   Notes:
   This format is only supported for real scalars, double precision, and 32 bit indices (the defaults).

   It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of
   non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement.

   Developer Notes:
   On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance.

   The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8
.vb
                            (2 0  3 4)
   Consider the matrix A =  (5 0  6 0)
                            (0 0  7 8)
                            (0 0  9 9)

   symbolically the Ellpack format can be written as

        (2 3 4 |)           (0 2 3 |)
   v =  (5 6 0 |)  colidx = (0 2 2 |)
        --------            ---------
        (7 8 |)             (2 3 |)
        (9 9 |)             (2 3 |)

    The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case).
    Any of the rows in a slice with fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and
    zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice.

    The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9)  and for colidx is (0 0 2 2 3 2 2 2 3 3)

.ve

   See MatMult_SeqSELL() for how this format is used with the SIMD operations to achieve high performance.

   References:
.  * - Hong Zhang, Richard T. Mills, Karl Rupp, and Barry F. Smith, Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512},
   Proceedings of the 47th International Conference on Parallel Processing, 2018.

.seealso: `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSELL()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ`
M*/

/*@C
   MatCreateSeqSELL - Creates a sparse matrix in SELL format.

   Collective on comm

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  maxallocrow - maximum number of nonzeros in a row
-  rlen - array containing the number of nonzeros in the various rows
          (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqSELLSetPreallocation]

   Notes:
   If rlen is given then maxallocrow is ignored.

   Specify the preallocated storage with either maxallocrow or rlen (not both).
   Set maxallocrow=PETSC_DEFAULT and rlen=NULL for PETSc to control dynamic memory
   allocation. For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   Level: intermediate

.seealso: `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATSEQSELL`, `MATMPISELL`

@*/
PetscErrorCode MatCreateSeqSELL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt maxallocrow, const PetscInt rlen[], Mat *A) {
  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, m, n));
  PetscCall(MatSetType(*A, MATSEQSELL));
  PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A, maxallocrow, rlen));
  PetscFunctionReturn(0);
}

/* Tests A == B by comparing dimensions and nonzero counts, then column indices, then values */
PetscErrorCode MatEqual_SeqSELL(Mat A, Mat B, PetscBool *flg) {
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data, *b = (Mat_SeqSELL *)B->data;
  PetscInt     totalslices = a->totalslices;

  PetscFunctionBegin;
  /* If the matrix dimensions are not equal, or the numbers of nonzeros differ */
  if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) || (a->nz != b->nz) || (a->rlenmax != b->rlenmax)) {
    *flg = PETSC_FALSE;
    PetscFunctionReturn(0);
  }
  /* if the a->colidx are the same */
  PetscCall(PetscArraycmp(a->colidx, b->colidx, a->sliidx[totalslices], flg));
  if (!*flg) PetscFunctionReturn(0);
  /* if a->val are the same */
  PetscCall(PetscArraycmp(a->val, b->val, a->sliidx[totalslices], flg));
  PetscFunctionReturn(0);
}
2149 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A) { 2150 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 2151 2152 PetscFunctionBegin; 2153 a->idiagvalid = PETSC_FALSE; 2154 PetscFunctionReturn(0); 2155 } 2156 2157 PetscErrorCode MatConjugate_SeqSELL(Mat A) { 2158 #if defined(PETSC_USE_COMPLEX) 2159 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 2160 PetscInt i; 2161 PetscScalar *val = a->val; 2162 2163 PetscFunctionBegin; 2164 for (i = 0; i < a->sliidx[a->totalslices]; i++) { val[i] = PetscConj(val[i]); } 2165 #else 2166 PetscFunctionBegin; 2167 #endif 2168 PetscFunctionReturn(0); 2169 } 2170