1 2 /* 3 Defines the basic matrix operations for the SELL matrix storage format. 4 */ 5 #include <../src/mat/impls/sell/seq/sell.h> /*I "petscmat.h" I*/ 6 #include <petscblaslapack.h> 7 #include <petsc/private/kernels/blocktranspose.h> 8 9 static PetscBool cited = PETSC_FALSE; 10 static const char citation[] = "@inproceedings{ZhangELLPACK2018,\n" 11 " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n" 12 " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n" 13 " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n" 14 " year = 2018\n" 15 "}\n"; 16 17 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 18 19 #include <immintrin.h> 20 21 #if !defined(_MM_SCALE_8) 22 #define _MM_SCALE_8 8 23 #endif 24 25 #if defined(__AVX512F__) 26 /* these do not work 27 vec_idx = _mm512_loadunpackhi_epi32(vec_idx,acolidx); 28 vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval); 29 */ 30 #define AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \ 31 /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \ 32 vec_idx = _mm256_loadu_si256((__m256i const *)acolidx); \ 33 vec_vals = _mm512_loadu_pd(aval); \ 34 vec_x = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); \ 35 vec_y = _mm512_fmadd_pd(vec_x, vec_vals, vec_y) 36 #elif defined(__AVX2__) && defined(__FMA__) 37 #define AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \ 38 vec_vals = _mm256_loadu_pd(aval); \ 39 vec_idx = _mm_loadu_si128((__m128i const *)acolidx); /* SSE2 */ \ 40 vec_x = _mm256_i32gather_pd(x, vec_idx, _MM_SCALE_8); \ 41 vec_y = _mm256_fmadd_pd(vec_x, vec_vals, vec_y) 42 #endif 43 #endif /* PETSC_HAVE_IMMINTRIN_H */ 44 45 /*@C 46 MatSeqSELLSetPreallocation - For good matrix assembly performance 47 the user should preallocate the matrix storage by setting the parameter `nz` 48 (or the array `nnz`). 49 50 Collective 51 52 Input Parameters: 53 + B - The `MATSEQSELL` matrix 54 . rlenmax - number of nonzeros per row (same for all rows), ignored if `rlen` is provided 55 - rlen - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 56 57 Level: intermediate 58 59 Notes: 60 Specify the preallocated storage with either `rlenmax` or `rlen` (not both). 61 Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory 62 allocation. 63 64 You can call `MatGetInfo()` to get information on how effective the preallocation was; 65 for example the fields mallocs,nz_allocated,nz_used,nz_unneeded; 66 You can also run with the option `-info` and look for messages with the string 67 malloc in them to see if additional memory allocation was needed. 68 69 Developer Notes: 70 Use `rlenmax` of `MAT_SKIP_ALLOCATION` to not allocate any space for the matrix 71 entries or columns indices. 72 73 The maximum number of nonzeos in any row should be as accurate as possible. 74 If it is underestimated, you will get bad performance due to reallocation 75 (`MatSeqXSELLReallocateSELL()`). 76 77 .seealso: `Mat`, `MATSEQSELL`, `MATSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()` 78 @*/ 79 PetscErrorCode MatSeqSELLSetPreallocation(Mat B, PetscInt rlenmax, const PetscInt rlen[]) 80 { 81 PetscFunctionBegin; 82 PetscValidHeaderSpecific(B, MAT_CLASSID, 1); 83 PetscValidType(B, 1); 84 PetscTryMethod(B, "MatSeqSELLSetPreallocation_C", (Mat, PetscInt, const PetscInt[]), (B, rlenmax, rlen)); 85 PetscFunctionReturn(PETSC_SUCCESS); 86 } 87 88 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B, PetscInt maxallocrow, const PetscInt rlen[]) 89 { 90 Mat_SeqSELL *b; 91 PetscInt i, j, totalslices; 92 #if defined(PETSC_HAVE_CUDA) 93 PetscInt rlenmax = 0; 94 #endif 95 PetscBool skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE; 96 97 PetscFunctionBegin; 98 if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE; 99 if (maxallocrow == MAT_SKIP_ALLOCATION) { 100 skipallocation = PETSC_TRUE; 101 maxallocrow = 0; 102 } 103 104 PetscCall(PetscLayoutSetUp(B->rmap)); 105 PetscCall(PetscLayoutSetUp(B->cmap)); 106 107 /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */ 108 if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5; 109 PetscCheck(maxallocrow >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "maxallocrow cannot be less than 0: value %" PetscInt_FMT, maxallocrow); 110 if (rlen) { 111 for (i = 0; i < B->rmap->n; i++) { 112 PetscCheck(rlen[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, rlen[i]); 113 PetscCheck(rlen[i] <= B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT, i, rlen[i], B->cmap->n); 114 } 115 } 116 117 B->preallocated = PETSC_TRUE; 118 119 b = (Mat_SeqSELL *)B->data; 120 121 if (!b->sliceheight) { /* not set yet */ 122 #if defined(PETSC_HAVE_CUDA) 123 b->sliceheight = 16; 124 #else 125 b->sliceheight = 8; 126 #endif 127 } 128 totalslices = PetscCeilInt(B->rmap->n, b->sliceheight); 129 b->totalslices = totalslices; 130 if (!skipallocation) { 131 if (B->rmap->n % b->sliceheight) PetscCall(PetscInfo(B, "Padding rows to the SEQSELL matrix because the number of rows is not the multiple of the slice height (value %" PetscInt_FMT ")\n", B->rmap->n)); 132 133 if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */ 134 PetscCall(PetscMalloc1(totalslices + 1, &b->sliidx)); 135 } 136 if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */ 137 if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10; 138 else if (maxallocrow < 0) maxallocrow = 1; 139 #if defined(PETSC_HAVE_CUDA) 140 rlenmax = maxallocrow; 141 /* Pad the slice to DEVICE_MEM_ALIGN */ 142 while (b->sliceheight * maxallocrow % DEVICE_MEM_ALIGN) maxallocrow++; 143 #endif 144 for (i = 0; i <= totalslices; i++) b->sliidx[i] = b->sliceheight * i * maxallocrow; 145 } else { 146 #if defined(PETSC_HAVE_CUDA) 147 PetscInt mul = DEVICE_MEM_ALIGN / b->sliceheight; 148 #endif 149 maxallocrow = 0; 150 b->sliidx[0] = 0; 151 for (i = 1; i < totalslices; i++) { 152 b->sliidx[i] = 0; 153 for (j = 0; j < b->sliceheight; j++) { b->sliidx[i] = PetscMax(b->sliidx[i], rlen[b->sliceheight * (i - 1) + j]); } 154 #if defined(PETSC_HAVE_CUDA) 155 rlenmax = PetscMax(b->sliidx[i], rlenmax); 156 /* Pad the slice to DEVICE_MEM_ALIGN */ 157 b->sliidx[i] = ((b->sliidx[i] - 1) / mul + 1) * mul; 158 #endif 159 maxallocrow = PetscMax(b->sliidx[i], maxallocrow); 160 PetscCall(PetscIntSumError(b->sliidx[i - 1], b->sliceheight * b->sliidx[i], &b->sliidx[i])); 161 } 162 /* last slice */ 163 b->sliidx[totalslices] = 0; 164 for (j = b->sliceheight * (totalslices - 1); j < B->rmap->n; j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices], rlen[j]); 165 #if defined(PETSC_HAVE_CUDA) 166 rlenmax = PetscMax(b->sliidx[i], rlenmax); 167 b->sliidx[totalslices] = ((b->sliidx[totalslices] - 1) / mul + 1) * mul; 168 #endif 169 maxallocrow = PetscMax(b->sliidx[totalslices], maxallocrow); 170 b->sliidx[totalslices] = b->sliidx[totalslices - 1] + b->sliceheight * b->sliidx[totalslices]; 171 } 172 173 /* allocate space for val, colidx, rlen */ 174 /* FIXME: should B's old memory be unlogged? */ 175 PetscCall(MatSeqXSELLFreeSELL(B, &b->val, &b->colidx)); 176 /* FIXME: assuming an element of the bit array takes 8 bits */ 177 PetscCall(PetscMalloc2(b->sliidx[totalslices], &b->val, b->sliidx[totalslices], &b->colidx)); 178 /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */ 179 PetscCall(PetscCalloc1(b->sliceheight * totalslices, &b->rlen)); 180 181 b->singlemalloc = PETSC_TRUE; 182 b->free_val = PETSC_TRUE; 183 b->free_colidx = PETSC_TRUE; 184 } else { 185 b->free_val = PETSC_FALSE; 186 b->free_colidx = PETSC_FALSE; 187 } 188 189 b->nz = 0; 190 b->maxallocrow = maxallocrow; 191 #if defined(PETSC_HAVE_CUDA) 192 b->rlenmax = rlenmax; 193 #else 194 b->rlenmax = maxallocrow; 195 #endif 196 b->maxallocmat = b->sliidx[totalslices]; 197 B->info.nz_unneeded = (double)b->maxallocmat; 198 if (realalloc) PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE)); 199 PetscFunctionReturn(PETSC_SUCCESS); 200 } 201 202 static PetscErrorCode MatGetRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v) 203 { 204 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 205 PetscInt shift; 206 207 PetscFunctionBegin; 208 PetscCheck(row >= 0 && row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range", row); 209 if (nz) *nz = a->rlen[row]; 210 shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight); 211 if (!a->getrowcols) { PetscCall(PetscMalloc2(a->rlenmax, &a->getrowcols, a->rlenmax, &a->getrowvals)); } 212 if (idx) { 213 PetscInt j; 214 for (j = 0; j < a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift + a->sliceheight * j]; 215 *idx = a->getrowcols; 216 } 217 if (v) { 218 PetscInt j; 219 for (j = 0; j < a->rlen[row]; j++) a->getrowvals[j] = a->val[shift + a->sliceheight * j]; 220 *v = a->getrowvals; 221 } 222 PetscFunctionReturn(PETSC_SUCCESS); 223 } 224 225 static PetscErrorCode MatRestoreRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v) 226 { 227 PetscFunctionBegin; 228 PetscFunctionReturn(PETSC_SUCCESS); 229 } 230 231 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat) 232 { 233 Mat B; 234 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 235 PetscInt i; 236 237 PetscFunctionBegin; 238 if (reuse == MAT_REUSE_MATRIX) { 239 B = *newmat; 240 PetscCall(MatZeroEntries(B)); 241 } else { 242 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B)); 243 PetscCall(MatSetSizes(B, A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N)); 244 PetscCall(MatSetType(B, MATSEQAIJ)); 245 PetscCall(MatSeqAIJSetPreallocation(B, 0, a->rlen)); 246 } 247 248 for (i = 0; i < A->rmap->n; i++) { 249 PetscInt nz = 0, *cols = NULL; 250 PetscScalar *vals = NULL; 251 252 PetscCall(MatGetRow_SeqSELL(A, i, &nz, &cols, &vals)); 253 PetscCall(MatSetValues(B, 1, &i, nz, cols, vals, INSERT_VALUES)); 254 PetscCall(MatRestoreRow_SeqSELL(A, i, &nz, &cols, &vals)); 255 } 256 257 PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY)); 258 PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY)); 259 B->rmap->bs = A->rmap->bs; 260 261 if (reuse == MAT_INPLACE_MATRIX) { 262 PetscCall(MatHeaderReplace(A, &B)); 263 } else { 264 *newmat = B; 265 } 266 PetscFunctionReturn(PETSC_SUCCESS); 267 } 268 269 #include <../src/mat/impls/aij/seq/aij.h> 270 271 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A, MatType newtype, MatReuse reuse, Mat *newmat) 272 { 273 Mat B; 274 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 275 PetscInt *ai = a->i, m = A->rmap->N, n = A->cmap->N, i, *rowlengths, row, ncols; 276 const PetscInt *cols; 277 const PetscScalar *vals; 278 279 PetscFunctionBegin; 280 281 if (reuse == MAT_REUSE_MATRIX) { 282 B = *newmat; 283 } else { 284 if (PetscDefined(USE_DEBUG) || !a->ilen) { 285 PetscCall(PetscMalloc1(m, &rowlengths)); 286 for (i = 0; i < m; i++) rowlengths[i] = ai[i + 1] - ai[i]; 287 } 288 if (PetscDefined(USE_DEBUG) && a->ilen) { 289 PetscBool eq; 290 PetscCall(PetscMemcmp(rowlengths, a->ilen, m * sizeof(PetscInt), &eq)); 291 PetscCheck(eq, PETSC_COMM_SELF, PETSC_ERR_PLIB, "SeqAIJ ilen array incorrect"); 292 PetscCall(PetscFree(rowlengths)); 293 rowlengths = a->ilen; 294 } else if (a->ilen) rowlengths = a->ilen; 295 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B)); 296 PetscCall(MatSetSizes(B, m, n, m, n)); 297 PetscCall(MatSetType(B, MATSEQSELL)); 298 PetscCall(MatSeqSELLSetPreallocation(B, 0, rowlengths)); 299 if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths)); 300 } 301 302 for (row = 0; row < m; row++) { 303 PetscCall(MatGetRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals)); 304 PetscCall(MatSetValues_SeqSELL(B, 1, &row, ncols, cols, vals, INSERT_VALUES)); 305 PetscCall(MatRestoreRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals)); 306 } 307 PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY)); 308 PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY)); 309 B->rmap->bs = A->rmap->bs; 310 311 if (reuse == MAT_INPLACE_MATRIX) { 312 PetscCall(MatHeaderReplace(A, &B)); 313 } else { 314 *newmat = B; 315 } 316 PetscFunctionReturn(PETSC_SUCCESS); 317 } 318 319 PetscErrorCode MatMult_SeqSELL(Mat A, Vec xx, Vec yy) 320 { 321 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 322 PetscScalar *y; 323 const PetscScalar *x; 324 const MatScalar *aval = a->val; 325 PetscInt totalslices = a->totalslices; 326 const PetscInt *acolidx = a->colidx; 327 PetscInt i, j; 328 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 329 __m512d vec_x, vec_y, vec_vals; 330 __m256i vec_idx; 331 __mmask8 mask; 332 __m512d vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4; 333 __m256i vec_idx2, vec_idx3, vec_idx4; 334 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 335 __m128i vec_idx; 336 __m256d vec_x, vec_y, vec_y2, vec_vals; 337 MatScalar yval; 338 PetscInt r, rows_left, row, nnz_in_row; 339 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 340 __m128d vec_x_tmp; 341 __m256d vec_x, vec_y, vec_y2, vec_vals; 342 MatScalar yval; 343 PetscInt r, rows_left, row, nnz_in_row; 344 #else 345 PetscInt k, sliceheight = a->sliceheight; 346 PetscScalar *sum; 347 #endif 348 349 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 350 #pragma disjoint(*x, *y, *aval) 351 #endif 352 353 PetscFunctionBegin; 354 PetscCall(VecGetArrayRead(xx, &x)); 355 PetscCall(VecGetArray(yy, &y)); 356 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 357 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 358 for (i = 0; i < totalslices; i++) { /* loop over slices */ 359 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 360 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 361 362 vec_y = _mm512_setzero_pd(); 363 vec_y2 = _mm512_setzero_pd(); 364 vec_y3 = _mm512_setzero_pd(); 365 vec_y4 = _mm512_setzero_pd(); 366 367 j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */ 368 switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) { 369 case 3: 370 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 371 acolidx += 8; 372 aval += 8; 373 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 374 acolidx += 8; 375 aval += 8; 376 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 377 acolidx += 8; 378 aval += 8; 379 j += 3; 380 break; 381 case 2: 382 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 383 acolidx += 8; 384 aval += 8; 385 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 386 acolidx += 8; 387 aval += 8; 388 j += 2; 389 break; 390 case 1: 391 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 392 acolidx += 8; 393 aval += 8; 394 j += 1; 395 break; 396 } 397 #pragma novector 398 for (; j < (a->sliidx[i + 1] >> 3); j += 4) { 399 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 400 acolidx += 8; 401 aval += 8; 402 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 403 acolidx += 8; 404 aval += 8; 405 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 406 acolidx += 8; 407 aval += 8; 408 AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4); 409 acolidx += 8; 410 aval += 8; 411 } 412 413 vec_y = _mm512_add_pd(vec_y, vec_y2); 414 vec_y = _mm512_add_pd(vec_y, vec_y3); 415 vec_y = _mm512_add_pd(vec_y, vec_y4); 416 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 417 mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07))); 418 _mm512_mask_storeu_pd(&y[8 * i], mask, vec_y); 419 } else { 420 _mm512_storeu_pd(&y[8 * i], vec_y); 421 } 422 } 423 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 424 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 425 for (i = 0; i < totalslices; i++) { /* loop over full slices */ 426 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 427 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 428 429 /* last slice may have padding rows. Don't use vectorization. */ 430 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 431 rows_left = A->rmap->n - 8 * i; 432 for (r = 0; r < rows_left; ++r) { 433 yval = (MatScalar)0; 434 row = 8 * i + r; 435 nnz_in_row = a->rlen[row]; 436 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]]; 437 y[row] = yval; 438 } 439 break; 440 } 441 442 vec_y = _mm256_setzero_pd(); 443 vec_y2 = _mm256_setzero_pd(); 444 445 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 446 #pragma novector 447 #pragma unroll(2) 448 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 449 AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 450 aval += 4; 451 acolidx += 4; 452 AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y2); 453 aval += 4; 454 acolidx += 4; 455 } 456 457 _mm256_storeu_pd(y + i * 8, vec_y); 458 _mm256_storeu_pd(y + i * 8 + 4, vec_y2); 459 } 460 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 461 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 462 for (i = 0; i < totalslices; i++) { /* loop over full slices */ 463 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 464 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 465 466 vec_y = _mm256_setzero_pd(); 467 vec_y2 = _mm256_setzero_pd(); 468 469 /* last slice may have padding rows. Don't use vectorization. */ 470 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 471 rows_left = A->rmap->n - 8 * i; 472 for (r = 0; r < rows_left; ++r) { 473 yval = (MatScalar)0; 474 row = 8 * i + r; 475 nnz_in_row = a->rlen[row]; 476 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]]; 477 y[row] = yval; 478 } 479 break; 480 } 481 482 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 483 #pragma novector 484 #pragma unroll(2) 485 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 486 vec_vals = _mm256_loadu_pd(aval); 487 vec_x_tmp = _mm_setzero_pd(); 488 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 489 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 490 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 491 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 492 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 493 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 494 vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y); 495 aval += 4; 496 497 vec_vals = _mm256_loadu_pd(aval); 498 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 499 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 500 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 501 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 502 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 503 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 504 vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2); 505 aval += 4; 506 } 507 508 _mm256_storeu_pd(y + i * 8, vec_y); 509 _mm256_storeu_pd(y + i * 8 + 4, vec_y2); 510 } 511 #else 512 PetscCall(PetscMalloc1(sliceheight, &sum)); 513 for (i = 0; i < totalslices; i++) { /* loop over slices */ 514 for (j = 0; j < sliceheight; j++) { 515 sum[j] = 0.0; 516 for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]]; 517 } 518 if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { /* if last slice has padding rows */ 519 for (j = 0; j < (A->rmap->n % sliceheight); j++) y[sliceheight * i + j] = sum[j]; 520 } else { 521 for (j = 0; j < sliceheight; j++) y[sliceheight * i + j] = sum[j]; 522 } 523 } 524 PetscCall(PetscFree(sum)); 525 #endif 526 527 PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); /* theoretical minimal FLOPs */ 528 PetscCall(VecRestoreArrayRead(xx, &x)); 529 PetscCall(VecRestoreArray(yy, &y)); 530 PetscFunctionReturn(PETSC_SUCCESS); 531 } 532 533 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h> 534 PetscErrorCode MatMultAdd_SeqSELL(Mat A, Vec xx, Vec yy, Vec zz) 535 { 536 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 537 PetscScalar *y, *z; 538 const PetscScalar *x; 539 const MatScalar *aval = a->val; 540 PetscInt totalslices = a->totalslices; 541 const PetscInt *acolidx = a->colidx; 542 PetscInt i, j; 543 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 544 __m512d vec_x, vec_y, vec_vals; 545 __m256i vec_idx; 546 __mmask8 mask = 0; 547 __m512d vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4; 548 __m256i vec_idx2, vec_idx3, vec_idx4; 549 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 550 __m128d vec_x_tmp; 551 __m256d vec_x, vec_y, vec_y2, vec_vals; 552 MatScalar yval; 553 PetscInt r, row, nnz_in_row; 554 #else 555 PetscInt k, sliceheight = a->sliceheight; 556 PetscScalar *sum; 557 #endif 558 559 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 560 #pragma disjoint(*x, *y, *aval) 561 #endif 562 563 PetscFunctionBegin; 564 if (!a->nz) { 565 PetscCall(VecCopy(yy, zz)); 566 PetscFunctionReturn(PETSC_SUCCESS); 567 } 568 PetscCall(VecGetArrayRead(xx, &x)); 569 PetscCall(VecGetArrayPair(yy, zz, &y, &z)); 570 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 571 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 572 for (i = 0; i < totalslices; i++) { /* loop over slices */ 573 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 574 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 575 576 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 577 mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07))); 578 vec_y = _mm512_mask_loadu_pd(vec_y, mask, &y[8 * i]); 579 } else { 580 vec_y = _mm512_loadu_pd(&y[8 * i]); 581 } 582 vec_y2 = _mm512_setzero_pd(); 583 vec_y3 = _mm512_setzero_pd(); 584 vec_y4 = _mm512_setzero_pd(); 585 586 j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */ 587 switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) { 588 case 3: 589 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 590 acolidx += 8; 591 aval += 8; 592 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 593 acolidx += 8; 594 aval += 8; 595 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 596 acolidx += 8; 597 aval += 8; 598 j += 3; 599 break; 600 case 2: 601 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 602 acolidx += 8; 603 aval += 8; 604 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 605 acolidx += 8; 606 aval += 8; 607 j += 2; 608 break; 609 case 1: 610 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 611 acolidx += 8; 612 aval += 8; 613 j += 1; 614 break; 615 } 616 #pragma novector 617 for (; j < (a->sliidx[i + 1] >> 3); j += 4) { 618 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 619 acolidx += 8; 620 aval += 8; 621 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 622 acolidx += 8; 623 aval += 8; 624 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 625 acolidx += 8; 626 aval += 8; 627 AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4); 628 acolidx += 8; 629 aval += 8; 630 } 631 632 vec_y = _mm512_add_pd(vec_y, vec_y2); 633 vec_y = _mm512_add_pd(vec_y, vec_y3); 634 vec_y = _mm512_add_pd(vec_y, vec_y4); 635 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 636 _mm512_mask_storeu_pd(&z[8 * i], mask, vec_y); 637 } else { 638 _mm512_storeu_pd(&z[8 * i], vec_y); 639 } 640 } 641 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 642 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 643 for (i = 0; i < totalslices; i++) { /* loop over full slices */ 644 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 645 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 646 647 /* last slice may have padding rows. Don't use vectorization. */ 648 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 649 for (r = 0; r < (A->rmap->n & 0x07); ++r) { 650 row = 8 * i + r; 651 yval = (MatScalar)0.0; 652 nnz_in_row = a->rlen[row]; 653 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]]; 654 z[row] = y[row] + yval; 655 } 656 break; 657 } 658 659 vec_y = _mm256_loadu_pd(y + 8 * i); 660 vec_y2 = _mm256_loadu_pd(y + 8 * i + 4); 661 662 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 663 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 664 vec_vals = _mm256_loadu_pd(aval); 665 vec_x_tmp = _mm_setzero_pd(); 666 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 667 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 668 vec_x = _mm256_setzero_pd(); 669 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 670 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 671 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 672 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 673 vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y); 674 aval += 4; 675 676 vec_vals = _mm256_loadu_pd(aval); 677 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 678 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 679 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 680 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 681 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 682 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 683 vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2); 684 aval += 4; 685 } 686 687 _mm256_storeu_pd(z + i * 8, vec_y); 688 _mm256_storeu_pd(z + i * 8 + 4, vec_y2); 689 } 690 #else 691 PetscCall(PetscMalloc1(sliceheight, &sum)); 692 for (i = 0; i < totalslices; i++) { /* loop over slices */ 693 for (j = 0; j < sliceheight; j++) { 694 sum[j] = 0.0; 695 for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]]; 696 } 697 if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { 698 for (j = 0; j < (A->rmap->n % sliceheight); j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j]; 699 } else { 700 for (j = 0; j < sliceheight; j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j]; 701 } 702 } 703 PetscCall(PetscFree(sum)); 704 #endif 705 706 PetscCall(PetscLogFlops(2.0 * a->nz)); 707 PetscCall(VecRestoreArrayRead(xx, &x)); 708 PetscCall(VecRestoreArrayPair(yy, zz, &y, &z)); 709 PetscFunctionReturn(PETSC_SUCCESS); 710 } 711 712 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A, Vec xx, Vec zz, Vec yy) 713 { 714 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 715 PetscScalar *y; 716 const PetscScalar *x; 717 const MatScalar *aval = a->val; 718 const PetscInt *acolidx = a->colidx; 719 PetscInt i, j, r, row, nnz_in_row, totalslices = a->totalslices, sliceheight = a->sliceheight; 720 721 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 722 #pragma disjoint(*x, *y, *aval) 723 #endif 724 725 PetscFunctionBegin; 726 if (A->symmetric == PETSC_BOOL3_TRUE) { 727 PetscCall(MatMultAdd_SeqSELL(A, xx, zz, yy)); 728 PetscFunctionReturn(PETSC_SUCCESS); 729 } 730 if (zz != yy) PetscCall(VecCopy(zz, yy)); 731 732 if (a->nz) { 733 PetscCall(VecGetArrayRead(xx, &x)); 734 PetscCall(VecGetArray(yy, &y)); 735 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 736 if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { 737 for (r = 0; r < (A->rmap->n % sliceheight); ++r) { 738 row = sliceheight * i + r; 739 nnz_in_row = a->rlen[row]; 740 for (j = 0; j < nnz_in_row; ++j) y[acolidx[sliceheight * j + r]] += aval[sliceheight * j + r] * x[row]; 741 } 742 break; 743 } 744 for (r = 0; r < sliceheight; ++r) 745 for (j = a->sliidx[i] + r; j < a->sliidx[i + 1]; j += sliceheight) y[acolidx[j]] += aval[j] * x[sliceheight * i + r]; 746 } 747 PetscCall(PetscLogFlops(2.0 * a->nz)); 748 PetscCall(VecRestoreArrayRead(xx, &x)); 749 PetscCall(VecRestoreArray(yy, &y)); 750 } 751 PetscFunctionReturn(PETSC_SUCCESS); 752 } 753 754 PetscErrorCode MatMultTranspose_SeqSELL(Mat A, Vec xx, Vec yy) 755 { 756 PetscFunctionBegin; 757 if (A->symmetric == PETSC_BOOL3_TRUE) { 758 PetscCall(MatMult_SeqSELL(A, xx, yy)); 759 } else { 760 PetscCall(VecSet(yy, 0.0)); 761 PetscCall(MatMultTransposeAdd_SeqSELL(A, xx, yy, yy)); 762 } 763 PetscFunctionReturn(PETSC_SUCCESS); 764 } 765 766 /* 767 Checks for missing diagonals 768 */ 769 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A, PetscBool *missing, PetscInt *d) 770 { 771 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 772 PetscInt *diag, i; 773 774 PetscFunctionBegin; 775 *missing = PETSC_FALSE; 776 if (A->rmap->n > 0 && !(a->colidx)) { 777 *missing = PETSC_TRUE; 778 if (d) *d = 0; 779 PetscCall(PetscInfo(A, "Matrix has no entries therefore is missing diagonal\n")); 780 } else { 781 diag = a->diag; 782 for (i = 0; i < A->rmap->n; i++) { 783 if (diag[i] == -1) { 784 *missing = PETSC_TRUE; 785 if (d) *d = i; 786 PetscCall(PetscInfo(A, "Matrix is missing diagonal number %" PetscInt_FMT "\n", i)); 787 break; 788 } 789 } 790 } 791 PetscFunctionReturn(PETSC_SUCCESS); 792 } 793 794 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A) 795 { 796 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 797 PetscInt i, j, m = A->rmap->n, shift; 798 799 PetscFunctionBegin; 800 if (!a->diag) { 801 PetscCall(PetscMalloc1(m, &a->diag)); 802 a->free_diag = PETSC_TRUE; 803 } 804 for (i = 0; i < m; i++) { /* loop over rows */ 805 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 806 a->diag[i] = -1; 807 for (j = 0; j < a->rlen[i]; j++) { 808 if (a->colidx[shift + a->sliceheight * j] == i) { 809 a->diag[i] = shift + a->sliceheight * j; 810 break; 811 } 812 } 813 } 814 PetscFunctionReturn(PETSC_SUCCESS); 815 } 816 817 /* 818 Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways 819 */ 820 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A, PetscScalar omega, PetscScalar fshift) 821 { 822 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 823 PetscInt i, *diag, m = A->rmap->n; 824 MatScalar *val = a->val; 825 PetscScalar *idiag, *mdiag; 826 827 PetscFunctionBegin; 828 if (a->idiagvalid) PetscFunctionReturn(PETSC_SUCCESS); 829 PetscCall(MatMarkDiagonal_SeqSELL(A)); 830 diag = a->diag; 831 if (!a->idiag) { 832 PetscCall(PetscMalloc3(m, &a->idiag, m, &a->mdiag, m, &a->ssor_work)); 833 val = a->val; 834 } 835 mdiag = a->mdiag; 836 idiag = a->idiag; 837 838 if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) { 839 for (i = 0; i < m; i++) { 840 mdiag[i] = val[diag[i]]; 841 if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */ 842 PetscCheck(PetscRealPart(fshift), PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Zero diagonal on row %" PetscInt_FMT, i); 843 PetscCall(PetscInfo(A, "Zero diagonal on row %" PetscInt_FMT "\n", i)); 844 A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 845 A->factorerror_zeropivot_value = 0.0; 846 A->factorerror_zeropivot_row = i; 847 } 848 idiag[i] = 1.0 / val[diag[i]]; 849 } 850 PetscCall(PetscLogFlops(m)); 851 } else { 852 for (i = 0; i < m; i++) { 853 mdiag[i] = val[diag[i]]; 854 idiag[i] = omega / (fshift + val[diag[i]]); 855 } 856 PetscCall(PetscLogFlops(2.0 * m)); 857 } 858 a->idiagvalid = PETSC_TRUE; 859 PetscFunctionReturn(PETSC_SUCCESS); 860 } 861 862 PetscErrorCode MatZeroEntries_SeqSELL(Mat A) 863 { 864 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 865 866 PetscFunctionBegin; 867 PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices])); 868 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 869 PetscFunctionReturn(PETSC_SUCCESS); 870 } 871 872 PetscErrorCode MatDestroy_SeqSELL(Mat A) 873 { 874 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 875 876 PetscFunctionBegin; 877 PetscCall(PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->n, A->cmap->n, a->nz)); 878 PetscCall(MatSeqXSELLFreeSELL(A, &a->val, &a->colidx)); 879 PetscCall(ISDestroy(&a->row)); 880 PetscCall(ISDestroy(&a->col)); 881 PetscCall(PetscFree(a->diag)); 882 PetscCall(PetscFree(a->rlen)); 883 PetscCall(PetscFree(a->sliidx)); 884 PetscCall(PetscFree3(a->idiag, a->mdiag, a->ssor_work)); 885 PetscCall(PetscFree(a->solve_work)); 886 PetscCall(ISDestroy(&a->icol)); 887 PetscCall(PetscFree(a->saved_values)); 888 PetscCall(PetscFree2(a->getrowcols, a->getrowvals)); 889 PetscCall(PetscFree(A->data)); 890 #if defined(PETSC_HAVE_CUDA) 891 PetscCall(PetscFree(a->chunk_slice_map)); 892 #endif 893 894 PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL)); 895 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL)); 896 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL)); 897 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetPreallocation_C", NULL)); 898 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetArray_C", NULL)); 899 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLRestoreArray_C", NULL)); 900 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqaij_C", NULL)); 901 #if defined(PETSC_HAVE_CUDA) 902 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqsellcuda_C", NULL)); 903 #endif 904 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetFillRatio_C", NULL)); 905 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetMaxSliceWidth_C", NULL)); 906 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetAvgSliceWidth_C", NULL)); 907 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetVarSliceSize_C", NULL)); 908 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetSliceHeight_C", NULL)); 909 PetscFunctionReturn(PETSC_SUCCESS); 910 } 911 912 PetscErrorCode MatSetOption_SeqSELL(Mat A, MatOption op, PetscBool flg) 913 { 914 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 915 916 PetscFunctionBegin; 917 switch (op) { 918 case MAT_ROW_ORIENTED: 919 a->roworiented = flg; 920 break; 921 case MAT_KEEP_NONZERO_PATTERN: 922 a->keepnonzeropattern = flg; 923 break; 924 case MAT_NEW_NONZERO_LOCATIONS: 925 a->nonew = (flg ? 0 : 1); 926 break; 927 case MAT_NEW_NONZERO_LOCATION_ERR: 928 a->nonew = (flg ? -1 : 0); 929 break; 930 case MAT_NEW_NONZERO_ALLOCATION_ERR: 931 a->nonew = (flg ? -2 : 0); 932 break; 933 case MAT_UNUSED_NONZERO_LOCATION_ERR: 934 a->nounused = (flg ? -1 : 0); 935 break; 936 case MAT_FORCE_DIAGONAL_ENTRIES: 937 case MAT_IGNORE_OFF_PROC_ENTRIES: 938 case MAT_USE_HASH_TABLE: 939 case MAT_SORTED_FULL: 940 PetscCall(PetscInfo(A, "Option %s ignored\n", MatOptions[op])); 941 break; 942 case MAT_SPD: 943 case MAT_SYMMETRIC: 944 case MAT_STRUCTURALLY_SYMMETRIC: 945 case MAT_HERMITIAN: 946 case MAT_SYMMETRY_ETERNAL: 947 case MAT_STRUCTURAL_SYMMETRY_ETERNAL: 948 case MAT_SPD_ETERNAL: 949 /* These options are handled directly by MatSetOption() */ 950 break; 951 default: 952 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unknown option %d", op); 953 } 954 PetscFunctionReturn(PETSC_SUCCESS); 955 } 956 957 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A, Vec v) 958 { 959 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 960 PetscInt i, j, n, shift; 961 PetscScalar *x, zero = 0.0; 962 963 PetscFunctionBegin; 964 PetscCall(VecGetLocalSize(v, &n)); 965 PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 966 967 if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) { 968 PetscInt *diag = a->diag; 969 PetscCall(VecGetArray(v, &x)); 970 for (i = 0; i < n; i++) x[i] = 1.0 / a->val[diag[i]]; 971 PetscCall(VecRestoreArray(v, &x)); 972 PetscFunctionReturn(PETSC_SUCCESS); 973 } 974 975 PetscCall(VecSet(v, zero)); 976 PetscCall(VecGetArray(v, &x)); 977 for (i = 0; i < n; i++) { /* loop over rows */ 978 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 979 x[i] = 0; 980 for (j = 0; j < a->rlen[i]; j++) { 981 if (a->colidx[shift + a->sliceheight * j] == i) { 982 x[i] = a->val[shift + a->sliceheight * j]; 983 break; 984 } 985 } 986 } 987 PetscCall(VecRestoreArray(v, &x)); 988 PetscFunctionReturn(PETSC_SUCCESS); 989 } 990 991 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A, Vec ll, Vec rr) 992 { 993 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 994 const PetscScalar *l, *r; 995 PetscInt i, j, m, n, row; 996 997 PetscFunctionBegin; 998 if (ll) { 999 /* The local size is used so that VecMPI can be passed to this routine 1000 by MatDiagonalScale_MPISELL */ 1001 PetscCall(VecGetLocalSize(ll, &m)); 1002 PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 1003 PetscCall(VecGetArrayRead(ll, &l)); 1004 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 1005 if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */ 1006 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) { 1007 if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= l[a->sliceheight * i + row]; 1008 } 1009 } else { 1010 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) { a->val[j] *= l[a->sliceheight * i + row]; } 1011 } 1012 } 1013 PetscCall(VecRestoreArrayRead(ll, &l)); 1014 PetscCall(PetscLogFlops(a->nz)); 1015 } 1016 if (rr) { 1017 PetscCall(VecGetLocalSize(rr, &n)); 1018 PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 1019 PetscCall(VecGetArrayRead(rr, &r)); 1020 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 1021 if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */ 1022 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) % a->sliceheight)) { 1023 if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= r[a->colidx[j]]; 1024 } 1025 } else { 1026 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j++) a->val[j] *= r[a->colidx[j]]; 1027 } 1028 } 1029 PetscCall(VecRestoreArrayRead(rr, &r)); 1030 PetscCall(PetscLogFlops(a->nz)); 1031 } 1032 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 1033 #if defined(PETSC_HAVE_CUDA) 1034 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU; 1035 #endif 1036 PetscFunctionReturn(PETSC_SUCCESS); 1037 } 1038 1039 PetscErrorCode MatGetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[]) 1040 { 1041 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1042 PetscInt *cp, i, k, low, high, t, row, col, l; 1043 PetscInt shift; 1044 MatScalar *vp; 1045 1046 PetscFunctionBegin; 1047 for (k = 0; k < m; k++) { /* loop over requested rows */ 1048 row = im[k]; 1049 if (row < 0) continue; 1050 PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1); 1051 shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight); /* starting index of the row */ 1052 cp = a->colidx + shift; /* pointer to the row */ 1053 vp = a->val + shift; /* pointer to the row */ 1054 for (l = 0; l < n; l++) { /* loop over requested columns */ 1055 col = in[l]; 1056 if (col < 0) continue; 1057 PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1); 1058 high = a->rlen[row]; 1059 low = 0; /* assume unsorted */ 1060 while (high - low > 5) { 1061 t = (low + high) / 2; 1062 if (*(cp + a->sliceheight * t) > col) high = t; 1063 else low = t; 1064 } 1065 for (i = low; i < high; i++) { 1066 if (*(cp + a->sliceheight * i) > col) break; 1067 if (*(cp + a->sliceheight * i) == col) { 1068 *v++ = *(vp + a->sliceheight * i); 1069 goto finished; 1070 } 1071 } 1072 *v++ = 0.0; 1073 finished:; 1074 } 1075 } 1076 PetscFunctionReturn(PETSC_SUCCESS); 1077 } 1078 1079 static PetscErrorCode MatView_SeqSELL_ASCII(Mat A, PetscViewer viewer) 1080 { 1081 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1082 PetscInt i, j, m = A->rmap->n, shift; 1083 const char *name; 1084 PetscViewerFormat format; 1085 1086 PetscFunctionBegin; 1087 PetscCall(PetscViewerGetFormat(viewer, &format)); 1088 if (format == PETSC_VIEWER_ASCII_MATLAB) { 1089 PetscInt nofinalvalue = 0; 1090 /* 1091 if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) { 1092 nofinalvalue = 1; 1093 } 1094 */ 1095 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1096 PetscCall(PetscViewerASCIIPrintf(viewer, "%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n", m, A->cmap->n)); 1097 PetscCall(PetscViewerASCIIPrintf(viewer, "%% Nonzeros = %" PetscInt_FMT " \n", a->nz)); 1098 #if defined(PETSC_USE_COMPLEX) 1099 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",4);\n", a->nz + nofinalvalue)); 1100 #else 1101 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",3);\n", a->nz + nofinalvalue)); 1102 #endif 1103 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = [\n")); 1104 1105 for (i = 0; i < m; i++) { 1106 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1107 for (j = 0; j < a->rlen[i]; j++) { 1108 #if defined(PETSC_USE_COMPLEX) 1109 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1110 #else 1111 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)a->val[shift + a->sliceheight * j])); 1112 #endif 1113 } 1114 } 1115 /* 1116 if (nofinalvalue) { 1117 #if defined(PETSC_USE_COMPLEX) 1118 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n",m,A->cmap->n,0.,0.)); 1119 #else 1120 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n",m,A->cmap->n,0.0)); 1121 #endif 1122 } 1123 */ 1124 PetscCall(PetscObjectGetName((PetscObject)A, &name)); 1125 PetscCall(PetscViewerASCIIPrintf(viewer, "];\n %s = spconvert(zzz);\n", name)); 1126 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1127 } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) { 1128 PetscFunctionReturn(PETSC_SUCCESS); 1129 } else if (format == PETSC_VIEWER_ASCII_COMMON) { 1130 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1131 for (i = 0; i < m; i++) { 1132 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1133 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1134 for (j = 0; j < a->rlen[i]; j++) { 1135 #if defined(PETSC_USE_COMPLEX) 1136 if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) { 1137 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1138 } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) { 1139 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1140 } else if (PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) { 1141 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]))); 1142 } 1143 #else 1144 if (a->val[shift + a->sliceheight * j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j])); 1145 #endif 1146 } 1147 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1148 } 1149 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1150 } else if (format == PETSC_VIEWER_ASCII_DENSE) { 1151 PetscInt cnt = 0, jcnt; 1152 PetscScalar value; 1153 #if defined(PETSC_USE_COMPLEX) 1154 PetscBool realonly = PETSC_TRUE; 1155 for (i = 0; i < a->sliidx[a->totalslices]; i++) { 1156 if (PetscImaginaryPart(a->val[i]) != 0.0) { 1157 realonly = PETSC_FALSE; 1158 break; 1159 } 1160 } 1161 #endif 1162 1163 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1164 for (i = 0; i < m; i++) { 1165 jcnt = 0; 1166 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1167 for (j = 0; j < A->cmap->n; j++) { 1168 if (jcnt < a->rlen[i] && j == a->colidx[shift + a->sliceheight * j]) { 1169 value = a->val[cnt++]; 1170 jcnt++; 1171 } else { 1172 value = 0.0; 1173 } 1174 #if defined(PETSC_USE_COMPLEX) 1175 if (realonly) { 1176 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)PetscRealPart(value))); 1177 } else { 1178 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e+%7.5e i ", (double)PetscRealPart(value), (double)PetscImaginaryPart(value))); 1179 } 1180 #else 1181 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)value)); 1182 #endif 1183 } 1184 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1185 } 1186 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1187 } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) { 1188 PetscInt fshift = 1; 1189 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1190 #if defined(PETSC_USE_COMPLEX) 1191 PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate complex general\n")); 1192 #else 1193 PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate real general\n")); 1194 #endif 1195 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz)); 1196 for (i = 0; i < m; i++) { 1197 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1198 for (j = 0; j < a->rlen[i]; j++) { 1199 #if defined(PETSC_USE_COMPLEX) 1200 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1201 #else 1202 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)a->val[shift + a->sliceheight * j])); 1203 #endif 1204 } 1205 } 1206 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1207 } else if (format == PETSC_VIEWER_NATIVE) { 1208 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 1209 PetscInt row; 1210 PetscCall(PetscViewerASCIIPrintf(viewer, "slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n", i, a->sliidx[i], a->sliidx[i + 1])); 1211 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) { 1212 #if defined(PETSC_USE_COMPLEX) 1213 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1214 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1215 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1216 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), -(double)PetscImaginaryPart(a->val[j]))); 1217 } else { 1218 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]))); 1219 } 1220 #else 1221 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)a->val[j])); 1222 #endif 1223 } 1224 } 1225 } else { 1226 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1227 if (A->factortype) { 1228 for (i = 0; i < m; i++) { 1229 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1230 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1231 /* L part */ 1232 for (j = shift; j < a->diag[i]; j += a->sliceheight) { 1233 #if defined(PETSC_USE_COMPLEX) 1234 if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0) { 1235 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1236 } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0) { 1237 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j])))); 1238 } else { 1239 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j]))); 1240 } 1241 #else 1242 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j])); 1243 #endif 1244 } 1245 /* diagonal */ 1246 j = a->diag[i]; 1247 #if defined(PETSC_USE_COMPLEX) 1248 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1249 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)PetscImaginaryPart(1.0 / a->val[j]))); 1250 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1251 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)(-PetscImaginaryPart(1.0 / a->val[j])))); 1252 } else { 1253 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]))); 1254 } 1255 #else 1256 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)(1.0 / a->val[j]))); 1257 #endif 1258 1259 /* U part */ 1260 for (j = a->diag[i] + 1; j < shift + a->sliceheight * a->rlen[i]; j += a->sliceheight) { 1261 #if defined(PETSC_USE_COMPLEX) 1262 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1263 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1264 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1265 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j])))); 1266 } else { 1267 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j]))); 1268 } 1269 #else 1270 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j])); 1271 #endif 1272 } 1273 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1274 } 1275 } else { 1276 for (i = 0; i < m; i++) { 1277 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1278 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1279 for (j = 0; j < a->rlen[i]; j++) { 1280 #if defined(PETSC_USE_COMPLEX) 1281 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1282 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1283 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1284 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1285 } else { 1286 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]))); 1287 } 1288 #else 1289 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j])); 1290 #endif 1291 } 1292 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1293 } 1294 } 1295 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1296 } 1297 PetscCall(PetscViewerFlush(viewer)); 1298 PetscFunctionReturn(PETSC_SUCCESS); 1299 } 1300 1301 #include <petscdraw.h> 1302 static PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw, void *Aa) 1303 { 1304 Mat A = (Mat)Aa; 1305 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1306 PetscInt i, j, m = A->rmap->n, shift; 1307 int color; 1308 PetscReal xl, yl, xr, yr, x_l, x_r, y_l, y_r; 1309 PetscViewer viewer; 1310 PetscViewerFormat format; 1311 1312 PetscFunctionBegin; 1313 PetscCall(PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer)); 1314 PetscCall(PetscViewerGetFormat(viewer, &format)); 1315 PetscCall(PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr)); 1316 1317 /* loop over matrix elements drawing boxes */ 1318 1319 if (format != PETSC_VIEWER_DRAW_CONTOUR) { 1320 PetscDrawCollectiveBegin(draw); 1321 /* Blue for negative, Cyan for zero and Red for positive */ 1322 color = PETSC_DRAW_BLUE; 1323 for (i = 0; i < m; i++) { 1324 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1325 y_l = m - i - 1.0; 1326 y_r = y_l + 1.0; 1327 for (j = 0; j < a->rlen[i]; j++) { 1328 x_l = a->colidx[shift + a->sliceheight * j]; 1329 x_r = x_l + 1.0; 1330 if (PetscRealPart(a->val[shift + a->sliceheight * j]) >= 0.) continue; 1331 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1332 } 1333 } 1334 color = PETSC_DRAW_CYAN; 1335 for (i = 0; i < m; i++) { 1336 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1337 y_l = m - i - 1.0; 1338 y_r = y_l + 1.0; 1339 for (j = 0; j < a->rlen[i]; j++) { 1340 x_l = a->colidx[shift + a->sliceheight * j]; 1341 x_r = x_l + 1.0; 1342 if (a->val[shift + a->sliceheight * j] != 0.) continue; 1343 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1344 } 1345 } 1346 color = PETSC_DRAW_RED; 1347 for (i = 0; i < m; i++) { 1348 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1349 y_l = m - i - 1.0; 1350 y_r = y_l + 1.0; 1351 for (j = 0; j < a->rlen[i]; j++) { 1352 x_l = a->colidx[shift + a->sliceheight * j]; 1353 x_r = x_l + 1.0; 1354 if (PetscRealPart(a->val[shift + a->sliceheight * j]) <= 0.) continue; 1355 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1356 } 1357 } 1358 PetscDrawCollectiveEnd(draw); 1359 } else { 1360 /* use contour shading to indicate magnitude of values */ 1361 /* first determine max of all nonzero values */ 1362 PetscReal minv = 0.0, maxv = 0.0; 1363 PetscInt count = 0; 1364 PetscDraw popup; 1365 for (i = 0; i < a->sliidx[a->totalslices]; i++) { 1366 if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]); 1367 } 1368 if (minv >= maxv) maxv = minv + PETSC_SMALL; 1369 PetscCall(PetscDrawGetPopup(draw, &popup)); 1370 PetscCall(PetscDrawScalePopup(popup, minv, maxv)); 1371 1372 PetscDrawCollectiveBegin(draw); 1373 for (i = 0; i < m; i++) { 1374 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1375 y_l = m - i - 1.0; 1376 y_r = y_l + 1.0; 1377 for (j = 0; j < a->rlen[i]; j++) { 1378 x_l = a->colidx[shift + a->sliceheight * j]; 1379 x_r = x_l + 1.0; 1380 color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]), minv, maxv); 1381 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1382 count++; 1383 } 1384 } 1385 PetscDrawCollectiveEnd(draw); 1386 } 1387 PetscFunctionReturn(PETSC_SUCCESS); 1388 } 1389 1390 #include <petscdraw.h> 1391 static PetscErrorCode MatView_SeqSELL_Draw(Mat A, PetscViewer viewer) 1392 { 1393 PetscDraw draw; 1394 PetscReal xr, yr, xl, yl, h, w; 1395 PetscBool isnull; 1396 1397 PetscFunctionBegin; 1398 PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw)); 1399 PetscCall(PetscDrawIsNull(draw, &isnull)); 1400 if (isnull) PetscFunctionReturn(PETSC_SUCCESS); 1401 1402 xr = A->cmap->n; 1403 yr = A->rmap->n; 1404 h = yr / 10.0; 1405 w = xr / 10.0; 1406 xr += w; 1407 yr += h; 1408 xl = -w; 1409 yl = -h; 1410 PetscCall(PetscDrawSetCoordinates(draw, xl, yl, xr, yr)); 1411 PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer)); 1412 PetscCall(PetscDrawZoom(draw, MatView_SeqSELL_Draw_Zoom, A)); 1413 PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL)); 1414 PetscCall(PetscDrawSave(draw)); 1415 PetscFunctionReturn(PETSC_SUCCESS); 1416 } 1417 1418 PetscErrorCode MatView_SeqSELL(Mat A, PetscViewer viewer) 1419 { 1420 PetscBool iascii, isbinary, isdraw; 1421 1422 PetscFunctionBegin; 1423 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii)); 1424 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary)); 1425 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw)); 1426 if (iascii) { 1427 PetscCall(MatView_SeqSELL_ASCII(A, viewer)); 1428 } else if (isbinary) { 1429 /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */ 1430 } else if (isdraw) PetscCall(MatView_SeqSELL_Draw(A, viewer)); 1431 PetscFunctionReturn(PETSC_SUCCESS); 1432 } 1433 1434 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A, MatAssemblyType mode) 1435 { 1436 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1437 PetscInt i, shift, row_in_slice, row, nrow, *cp, lastcol, j, k; 1438 MatScalar *vp; 1439 #if defined(PETSC_HAVE_CUDA) 1440 PetscInt totalchunks = 0; 1441 #endif 1442 1443 PetscFunctionBegin; 1444 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 1445 /* To do: compress out the unused elements */ 1446 PetscCall(MatMarkDiagonal_SeqSELL(A)); 1447 PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n", A->rmap->n, A->cmap->n, a->maxallocmat, a->sliidx[a->totalslices], a->nz, a->sliidx[a->totalslices] - a->nz)); 1448 PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n", a->reallocs)); 1449 PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rlenmax)); 1450 a->nonzerorowcnt = 0; 1451 /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */ 1452 for (i = 0; i < a->totalslices; ++i) { 1453 shift = a->sliidx[i]; /* starting index of the slice */ 1454 cp = a->colidx + shift; /* pointer to the column indices of the slice */ 1455 vp = a->val + shift; /* pointer to the nonzero values of the slice */ 1456 for (row_in_slice = 0; row_in_slice < a->sliceheight; ++row_in_slice) { /* loop over rows in the slice */ 1457 row = a->sliceheight * i + row_in_slice; 1458 nrow = a->rlen[row]; /* number of nonzeros in row */ 1459 /* 1460 Search for the nearest nonzero. Normally setting the index to zero may cause extra communication. 1461 But if the entire slice are empty, it is fine to use 0 since the index will not be loaded. 1462 */ 1463 lastcol = 0; 1464 if (nrow > 0) { /* nonempty row */ 1465 a->nonzerorowcnt++; 1466 lastcol = cp[a->sliceheight * (nrow - 1) + row_in_slice]; /* use the index from the last nonzero at current row */ 1467 } else if (!row_in_slice) { /* first row of the correct slice is empty */ 1468 for (j = 1; j < a->sliceheight; j++) { 1469 if (a->rlen[a->sliceheight * i + j]) { 1470 lastcol = cp[j]; 1471 break; 1472 } 1473 } 1474 } else { 1475 if (a->sliidx[i + 1] != shift) lastcol = cp[row_in_slice - 1]; /* use the index from the previous row */ 1476 } 1477 1478 for (k = nrow; k < (a->sliidx[i + 1] - shift) / a->sliceheight; ++k) { 1479 cp[a->sliceheight * k + row_in_slice] = lastcol; 1480 vp[a->sliceheight * k + row_in_slice] = (MatScalar)0; 1481 } 1482 } 1483 } 1484 1485 A->info.mallocs += a->reallocs; 1486 a->reallocs = 0; 1487 1488 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 1489 #if defined(PETSC_HAVE_CUDA) 1490 if (!a->chunksize && a->totalslices) { 1491 a->chunksize = 64; 1492 while (a->chunksize < 1024 && 2 * a->chunksize <= a->sliidx[a->totalslices] / a->totalslices) a->chunksize *= 2; 1493 totalchunks = 1 + (a->sliidx[a->totalslices] - 1) / a->chunksize; 1494 } 1495 if (totalchunks != a->totalchunks) { 1496 PetscCall(PetscFree(a->chunk_slice_map)); 1497 PetscCall(PetscMalloc1(totalchunks, &a->chunk_slice_map)); 1498 a->totalchunks = totalchunks; 1499 } 1500 j = 0; 1501 for (i = 0; i < totalchunks; i++) { 1502 while (a->sliidx[j + 1] <= i * a->chunksize && j < a->totalslices) j++; 1503 a->chunk_slice_map[i] = j; 1504 } 1505 #endif 1506 PetscFunctionReturn(PETSC_SUCCESS); 1507 } 1508 1509 PetscErrorCode MatGetInfo_SeqSELL(Mat A, MatInfoType flag, MatInfo *info) 1510 { 1511 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1512 1513 PetscFunctionBegin; 1514 info->block_size = 1.0; 1515 info->nz_allocated = a->maxallocmat; 1516 info->nz_used = a->sliidx[a->totalslices]; /* include padding zeros */ 1517 info->nz_unneeded = (a->maxallocmat - a->sliidx[a->totalslices]); 1518 info->assemblies = A->num_ass; 1519 info->mallocs = A->info.mallocs; 1520 info->memory = 0; /* REVIEW ME */ 1521 if (A->factortype) { 1522 info->fill_ratio_given = A->info.fill_ratio_given; 1523 info->fill_ratio_needed = A->info.fill_ratio_needed; 1524 info->factor_mallocs = A->info.factor_mallocs; 1525 } else { 1526 info->fill_ratio_given = 0; 1527 info->fill_ratio_needed = 0; 1528 info->factor_mallocs = 0; 1529 } 1530 PetscFunctionReturn(PETSC_SUCCESS); 1531 } 1532 1533 PetscErrorCode MatSetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is) 1534 { 1535 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1536 PetscInt shift, i, k, l, low, high, t, ii, row, col, nrow; 1537 PetscInt *cp, nonew = a->nonew, lastcol = -1; 1538 MatScalar *vp, value; 1539 #if defined(PETSC_HAVE_CUDA) 1540 PetscBool inserted = PETSC_FALSE; 1541 PetscInt mul = DEVICE_MEM_ALIGN / a->sliceheight; 1542 #endif 1543 1544 PetscFunctionBegin; 1545 for (k = 0; k < m; k++) { /* loop over added rows */ 1546 row = im[k]; 1547 if (row < 0) continue; 1548 PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1); 1549 shift = a->sliidx[row / a->sliceheight] + row % a->sliceheight; /* starting index of the row */ 1550 cp = a->colidx + shift; /* pointer to the row */ 1551 vp = a->val + shift; /* pointer to the row */ 1552 nrow = a->rlen[row]; 1553 low = 0; 1554 high = nrow; 1555 1556 for (l = 0; l < n; l++) { /* loop over added columns */ 1557 col = in[l]; 1558 if (col < 0) continue; 1559 PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1); 1560 if (a->roworiented) { 1561 value = v[l + k * n]; 1562 } else { 1563 value = v[k + l * m]; 1564 } 1565 if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue; 1566 1567 /* search in this row for the specified column, i indicates the column to be set */ 1568 if (col <= lastcol) low = 0; 1569 else high = nrow; 1570 lastcol = col; 1571 while (high - low > 5) { 1572 t = (low + high) / 2; 1573 if (*(cp + a->sliceheight * t) > col) high = t; 1574 else low = t; 1575 } 1576 for (i = low; i < high; i++) { 1577 if (*(cp + a->sliceheight * i) > col) break; 1578 if (*(cp + a->sliceheight * i) == col) { 1579 if (is == ADD_VALUES) *(vp + a->sliceheight * i) += value; 1580 else *(vp + a->sliceheight * i) = value; 1581 #if defined(PETSC_HAVE_CUDA) 1582 inserted = PETSC_TRUE; 1583 #endif 1584 low = i + 1; 1585 goto noinsert; 1586 } 1587 } 1588 if (value == 0.0 && a->ignorezeroentries) goto noinsert; 1589 if (nonew == 1) goto noinsert; 1590 PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col); 1591 #if defined(PETSC_HAVE_CUDA) 1592 MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar, mul); 1593 #else 1594 /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */ 1595 MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar, 1); 1596 #endif 1597 /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */ 1598 for (ii = nrow - 1; ii >= i; ii--) { 1599 *(cp + a->sliceheight * (ii + 1)) = *(cp + a->sliceheight * ii); 1600 *(vp + a->sliceheight * (ii + 1)) = *(vp + a->sliceheight * ii); 1601 } 1602 a->rlen[row]++; 1603 *(cp + a->sliceheight * i) = col; 1604 *(vp + a->sliceheight * i) = value; 1605 a->nz++; 1606 A->nonzerostate++; 1607 #if defined(PETSC_HAVE_CUDA) 1608 inserted = PETSC_TRUE; 1609 #endif 1610 low = i + 1; 1611 high++; 1612 nrow++; 1613 noinsert:; 1614 } 1615 a->rlen[row] = nrow; 1616 } 1617 #if defined(PETSC_HAVE_CUDA) 1618 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && inserted) A->offloadmask = PETSC_OFFLOAD_CPU; 1619 #endif 1620 PetscFunctionReturn(PETSC_SUCCESS); 1621 } 1622 1623 PetscErrorCode MatCopy_SeqSELL(Mat A, Mat B, MatStructure str) 1624 { 1625 PetscFunctionBegin; 1626 /* If the two matrices have the same copy implementation, use fast copy. */ 1627 if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) { 1628 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1629 Mat_SeqSELL *b = (Mat_SeqSELL *)B->data; 1630 1631 PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices], PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Number of nonzeros in two matrices are different"); 1632 PetscCall(PetscArraycpy(b->val, a->val, a->sliidx[a->totalslices])); 1633 } else { 1634 PetscCall(MatCopy_Basic(A, B, str)); 1635 } 1636 PetscFunctionReturn(PETSC_SUCCESS); 1637 } 1638 1639 PetscErrorCode MatSetUp_SeqSELL(Mat A) 1640 { 1641 PetscFunctionBegin; 1642 PetscCall(MatSeqSELLSetPreallocation(A, PETSC_DEFAULT, NULL)); 1643 PetscFunctionReturn(PETSC_SUCCESS); 1644 } 1645 1646 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A, PetscScalar *array[]) 1647 { 1648 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1649 1650 PetscFunctionBegin; 1651 *array = a->val; 1652 PetscFunctionReturn(PETSC_SUCCESS); 1653 } 1654 1655 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A, PetscScalar *array[]) 1656 { 1657 PetscFunctionBegin; 1658 PetscFunctionReturn(PETSC_SUCCESS); 1659 } 1660 1661 PetscErrorCode MatScale_SeqSELL(Mat inA, PetscScalar alpha) 1662 { 1663 Mat_SeqSELL *a = (Mat_SeqSELL *)inA->data; 1664 MatScalar *aval = a->val; 1665 PetscScalar oalpha = alpha; 1666 PetscBLASInt one = 1, size; 1667 1668 PetscFunctionBegin; 1669 PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices], &size)); 1670 PetscCallBLAS("BLASscal", BLASscal_(&size, &oalpha, aval, &one)); 1671 PetscCall(PetscLogFlops(a->nz)); 1672 PetscCall(MatSeqSELLInvalidateDiagonal(inA)); 1673 #if defined(PETSC_HAVE_CUDA) 1674 if (inA->offloadmask != PETSC_OFFLOAD_UNALLOCATED) inA->offloadmask = PETSC_OFFLOAD_CPU; 1675 #endif 1676 PetscFunctionReturn(PETSC_SUCCESS); 1677 } 1678 1679 PetscErrorCode MatShift_SeqSELL(Mat Y, PetscScalar a) 1680 { 1681 Mat_SeqSELL *y = (Mat_SeqSELL *)Y->data; 1682 1683 PetscFunctionBegin; 1684 if (!Y->preallocated || !y->nz) PetscCall(MatSeqSELLSetPreallocation(Y, 1, NULL)); 1685 PetscCall(MatShift_Basic(Y, a)); 1686 PetscFunctionReturn(PETSC_SUCCESS); 1687 } 1688 1689 PetscErrorCode MatSOR_SeqSELL(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx) 1690 { 1691 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1692 PetscScalar *x, sum, *t; 1693 const MatScalar *idiag = NULL, *mdiag; 1694 const PetscScalar *b, *xb; 1695 PetscInt n, m = A->rmap->n, i, j, shift; 1696 const PetscInt *diag; 1697 1698 PetscFunctionBegin; 1699 its = its * lits; 1700 1701 if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */ 1702 if (!a->idiagvalid) PetscCall(MatInvertDiagonal_SeqSELL(A, omega, fshift)); 1703 a->fshift = fshift; 1704 a->omega = omega; 1705 1706 diag = a->diag; 1707 t = a->ssor_work; 1708 idiag = a->idiag; 1709 mdiag = a->mdiag; 1710 1711 PetscCall(VecGetArray(xx, &x)); 1712 PetscCall(VecGetArrayRead(bb, &b)); 1713 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 1714 PetscCheck(flag != SOR_APPLY_UPPER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_UPPER is not implemented"); 1715 PetscCheck(flag != SOR_APPLY_LOWER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_LOWER is not implemented"); 1716 PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat"); 1717 1718 if (flag & SOR_ZERO_INITIAL_GUESS) { 1719 if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) { 1720 for (i = 0; i < m; i++) { 1721 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1722 sum = b[i]; 1723 n = (diag[i] - shift) / a->sliceheight; 1724 for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]]; 1725 t[i] = sum; 1726 x[i] = sum * idiag[i]; 1727 } 1728 xb = t; 1729 PetscCall(PetscLogFlops(a->nz)); 1730 } else xb = b; 1731 if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1732 for (i = m - 1; i >= 0; i--) { 1733 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1734 sum = xb[i]; 1735 n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1; 1736 for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]]; 1737 if (xb == b) { 1738 x[i] = sum * idiag[i]; 1739 } else { 1740 x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */ 1741 } 1742 } 1743 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */ 1744 } 1745 its--; 1746 } 1747 while (its--) { 1748 if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) { 1749 for (i = 0; i < m; i++) { 1750 /* lower */ 1751 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1752 sum = b[i]; 1753 n = (diag[i] - shift) / a->sliceheight; 1754 for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]]; 1755 t[i] = sum; /* save application of the lower-triangular part */ 1756 /* upper */ 1757 n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1; 1758 for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]]; 1759 x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */ 1760 } 1761 xb = t; 1762 PetscCall(PetscLogFlops(2.0 * a->nz)); 1763 } else xb = b; 1764 if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1765 for (i = m - 1; i >= 0; i--) { 1766 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1767 sum = xb[i]; 1768 if (xb == b) { 1769 /* whole matrix (no checkpointing available) */ 1770 n = a->rlen[i]; 1771 for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]]; 1772 x[i] = (1. - omega) * x[i] + (sum + mdiag[i] * x[i]) * idiag[i]; 1773 } else { /* lower-triangular part has been saved, so only apply upper-triangular */ 1774 n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1; 1775 for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]]; 1776 x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */ 1777 } 1778 } 1779 if (xb == b) { 1780 PetscCall(PetscLogFlops(2.0 * a->nz)); 1781 } else { 1782 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */ 1783 } 1784 } 1785 } 1786 PetscCall(VecRestoreArray(xx, &x)); 1787 PetscCall(VecRestoreArrayRead(bb, &b)); 1788 PetscFunctionReturn(PETSC_SUCCESS); 1789 } 1790 1791 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL, 1792 MatGetRow_SeqSELL, 1793 MatRestoreRow_SeqSELL, 1794 MatMult_SeqSELL, 1795 /* 4*/ MatMultAdd_SeqSELL, 1796 MatMultTranspose_SeqSELL, 1797 MatMultTransposeAdd_SeqSELL, 1798 NULL, 1799 NULL, 1800 NULL, 1801 /* 10*/ NULL, 1802 NULL, 1803 NULL, 1804 MatSOR_SeqSELL, 1805 NULL, 1806 /* 15*/ MatGetInfo_SeqSELL, 1807 MatEqual_SeqSELL, 1808 MatGetDiagonal_SeqSELL, 1809 MatDiagonalScale_SeqSELL, 1810 NULL, 1811 /* 20*/ NULL, 1812 MatAssemblyEnd_SeqSELL, 1813 MatSetOption_SeqSELL, 1814 MatZeroEntries_SeqSELL, 1815 /* 24*/ NULL, 1816 NULL, 1817 NULL, 1818 NULL, 1819 NULL, 1820 /* 29*/ MatSetUp_SeqSELL, 1821 NULL, 1822 NULL, 1823 NULL, 1824 NULL, 1825 /* 34*/ MatDuplicate_SeqSELL, 1826 NULL, 1827 NULL, 1828 NULL, 1829 NULL, 1830 /* 39*/ NULL, 1831 NULL, 1832 NULL, 1833 MatGetValues_SeqSELL, 1834 MatCopy_SeqSELL, 1835 /* 44*/ NULL, 1836 MatScale_SeqSELL, 1837 MatShift_SeqSELL, 1838 NULL, 1839 NULL, 1840 /* 49*/ NULL, 1841 NULL, 1842 NULL, 1843 NULL, 1844 NULL, 1845 /* 54*/ MatFDColoringCreate_SeqXAIJ, 1846 NULL, 1847 NULL, 1848 NULL, 1849 NULL, 1850 /* 59*/ NULL, 1851 MatDestroy_SeqSELL, 1852 MatView_SeqSELL, 1853 NULL, 1854 NULL, 1855 /* 64*/ NULL, 1856 NULL, 1857 NULL, 1858 NULL, 1859 NULL, 1860 /* 69*/ NULL, 1861 NULL, 1862 NULL, 1863 NULL, 1864 NULL, 1865 /* 74*/ NULL, 1866 MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */ 1867 NULL, 1868 NULL, 1869 NULL, 1870 /* 79*/ NULL, 1871 NULL, 1872 NULL, 1873 NULL, 1874 NULL, 1875 /* 84*/ NULL, 1876 NULL, 1877 NULL, 1878 NULL, 1879 NULL, 1880 /* 89*/ NULL, 1881 NULL, 1882 NULL, 1883 NULL, 1884 NULL, 1885 /* 94*/ NULL, 1886 NULL, 1887 NULL, 1888 NULL, 1889 NULL, 1890 /* 99*/ NULL, 1891 NULL, 1892 NULL, 1893 MatConjugate_SeqSELL, 1894 NULL, 1895 /*104*/ NULL, 1896 NULL, 1897 NULL, 1898 NULL, 1899 NULL, 1900 /*109*/ NULL, 1901 NULL, 1902 NULL, 1903 NULL, 1904 MatMissingDiagonal_SeqSELL, 1905 /*114*/ NULL, 1906 NULL, 1907 NULL, 1908 NULL, 1909 NULL, 1910 /*119*/ NULL, 1911 NULL, 1912 NULL, 1913 NULL, 1914 NULL, 1915 /*124*/ NULL, 1916 NULL, 1917 NULL, 1918 NULL, 1919 NULL, 1920 /*129*/ NULL, 1921 NULL, 1922 NULL, 1923 NULL, 1924 NULL, 1925 /*134*/ NULL, 1926 NULL, 1927 NULL, 1928 NULL, 1929 NULL, 1930 /*139*/ NULL, 1931 NULL, 1932 NULL, 1933 MatFDColoringSetUp_SeqXAIJ, 1934 NULL, 1935 /*144*/ NULL, 1936 NULL, 1937 NULL, 1938 NULL, 1939 NULL, 1940 NULL, 1941 /*150*/ NULL, 1942 NULL}; 1943 1944 static PetscErrorCode MatStoreValues_SeqSELL(Mat mat) 1945 { 1946 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1947 1948 PetscFunctionBegin; 1949 PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first"); 1950 1951 /* allocate space for values if not already there */ 1952 if (!a->saved_values) PetscCall(PetscMalloc1(a->sliidx[a->totalslices] + 1, &a->saved_values)); 1953 1954 /* copy values over */ 1955 PetscCall(PetscArraycpy(a->saved_values, a->val, a->sliidx[a->totalslices])); 1956 PetscFunctionReturn(PETSC_SUCCESS); 1957 } 1958 1959 static PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat) 1960 { 1961 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1962 1963 PetscFunctionBegin; 1964 PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first"); 1965 PetscCheck(a->saved_values, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatStoreValues(A);first"); 1966 PetscCall(PetscArraycpy(a->val, a->saved_values, a->sliidx[a->totalslices])); 1967 PetscFunctionReturn(PETSC_SUCCESS); 1968 } 1969 1970 static PetscErrorCode MatSeqSELLGetFillRatio_SeqSELL(Mat mat, PetscReal *ratio) 1971 { 1972 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1973 1974 PetscFunctionBegin; 1975 if (a->totalslices && a->sliidx[a->totalslices]) { 1976 *ratio = (PetscReal)(a->sliidx[a->totalslices] - a->nz) / a->sliidx[a->totalslices]; 1977 } else { 1978 *ratio = 0.0; 1979 } 1980 PetscFunctionReturn(PETSC_SUCCESS); 1981 } 1982 1983 static PetscErrorCode MatSeqSELLGetMaxSliceWidth_SeqSELL(Mat mat, PetscInt *slicewidth) 1984 { 1985 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1986 PetscInt i, current_slicewidth; 1987 1988 PetscFunctionBegin; 1989 *slicewidth = 0; 1990 for (i = 0; i < a->totalslices; i++) { 1991 current_slicewidth = (a->sliidx[i + 1] - a->sliidx[i]) / a->sliceheight; 1992 if (current_slicewidth > *slicewidth) *slicewidth = current_slicewidth; 1993 } 1994 PetscFunctionReturn(PETSC_SUCCESS); 1995 } 1996 1997 static PetscErrorCode MatSeqSELLGetAvgSliceWidth_SeqSELL(Mat mat, PetscReal *slicewidth) 1998 { 1999 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 2000 2001 PetscFunctionBegin; 2002 *slicewidth = 0; 2003 if (a->totalslices) { *slicewidth = (PetscReal)a->sliidx[a->totalslices] / a->sliceheight / a->totalslices; } 2004 PetscFunctionReturn(PETSC_SUCCESS); 2005 } 2006 2007 static PetscErrorCode MatSeqSELLGetVarSliceSize_SeqSELL(Mat mat, PetscReal *variance) 2008 { 2009 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 2010 PetscReal mean; 2011 PetscInt i, totalslices = a->totalslices, *sliidx = a->sliidx; 2012 2013 PetscFunctionBegin; 2014 *variance = 0; 2015 if (totalslices) { 2016 mean = (PetscReal)sliidx[totalslices] / totalslices; 2017 for (i = 1; i <= totalslices; i++) { *variance += ((PetscReal)(sliidx[i] - sliidx[i - 1]) - mean) * ((PetscReal)(sliidx[i] - sliidx[i - 1]) - mean) / totalslices; } 2018 } 2019 PetscFunctionReturn(PETSC_SUCCESS); 2020 } 2021 2022 static PetscErrorCode MatSeqSELLSetSliceHeight_SeqSELL(Mat A, PetscInt sliceheight) 2023 { 2024 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 2025 2026 PetscFunctionBegin; 2027 if (A->preallocated) PetscFunctionReturn(PETSC_SUCCESS); 2028 PetscCheck(a->sliceheight <= 0 || a->sliceheight == sliceheight, PETSC_COMM_SELF, PETSC_ERR_SUP, "Cannot change slice height %" PetscInt_FMT " to %" PetscInt_FMT, a->sliceheight, sliceheight); 2029 a->sliceheight = sliceheight; 2030 #if defined(PETSC_HAVE_CUDA) 2031 PetscCheck(DEVICE_MEM_ALIGN % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "DEVICE_MEM_ALIGN is not divisible by the slice height %" PetscInt_FMT, sliceheight); 2032 #endif 2033 PetscFunctionReturn(PETSC_SUCCESS); 2034 } 2035 2036 /*@C 2037 MatSeqSELLGetFillRatio - returns a ratio that indicates the irregularity of the matrix. 2038 2039 Not Collective 2040 2041 Input Parameter: 2042 . A - a MATSEQSELL matrix 2043 2044 Output Parameter: 2045 . ratio - ratio of number of padded zeros to number of allocated elements 2046 2047 Level: intermediate 2048 2049 .seealso: `MATSEQSELL`, `MatSeqSELLGetAvgSliceWidth()` 2050 @*/ 2051 PetscErrorCode MatSeqSELLGetFillRatio(Mat A, PetscReal *ratio) 2052 { 2053 PetscFunctionBegin; 2054 PetscUseMethod(A, "MatSeqSELLGetFillRatio_C", (Mat, PetscReal *), (A, ratio)); 2055 PetscFunctionReturn(PETSC_SUCCESS); 2056 } 2057 2058 /*@C 2059 MatSeqSELLGetMaxSliceWidth - returns the maximum slice width. 2060 2061 Not Collective 2062 2063 Input Parameter: 2064 . A - a MATSEQSELL matrix 2065 2066 Output Parameter: 2067 . slicewidth - maximum slice width 2068 2069 Level: intermediate 2070 2071 .seealso: `MATSEQSELL`, `MatSeqSELLGetAvgSliceWidth()` 2072 @*/ 2073 PetscErrorCode MatSeqSELLGetMaxSliceWidth(Mat A, PetscInt *slicewidth) 2074 { 2075 PetscFunctionBegin; 2076 PetscUseMethod(A, "MatSeqSELLGetMaxSliceWidth_C", (Mat, PetscInt *), (A, slicewidth)); 2077 PetscFunctionReturn(PETSC_SUCCESS); 2078 } 2079 2080 /*@C 2081 MatSeqSELLGetAvgSliceWidth - returns the average slice width. 2082 2083 Not Collective 2084 2085 Input Parameter: 2086 . A - a MATSEQSELL matrix 2087 2088 Output Parameter: 2089 . slicewidth - average slice width 2090 2091 Level: intermediate 2092 2093 .seealso: `MATSEQSELL`, `MatSeqSELLGetMaxSliceWidth()` 2094 @*/ 2095 PetscErrorCode MatSeqSELLGetAvgSliceWidth(Mat A, PetscReal *slicewidth) 2096 { 2097 PetscFunctionBegin; 2098 PetscUseMethod(A, "MatSeqSELLGetAvgSliceWidth_C", (Mat, PetscReal *), (A, slicewidth)); 2099 PetscFunctionReturn(PETSC_SUCCESS); 2100 } 2101 2102 /*@C 2103 MatSeqSELLSetSliceHeight - sets the slice height. 2104 2105 Not Collective 2106 2107 Input Parameters: 2108 + A - a MATSEQSELL matrix 2109 - sliceheight - slice height 2110 2111 Notes: 2112 You cannot change the slice height once it have been set. 2113 2114 The slice height must be set before MatSetUp() or MatXXXSetPreallocation() is called. 2115 2116 Level: intermediate 2117 2118 .seealso: `MATSEQSELL`, `MatSeqSELLGetVarSliceSize()` 2119 @*/ 2120 PetscErrorCode MatSeqSELLSetSliceHeight(Mat A, PetscInt sliceheight) 2121 { 2122 PetscFunctionBegin; 2123 PetscUseMethod(A, "MatSeqSELLSetSliceHeight_C", (Mat, PetscInt), (A, sliceheight)); 2124 PetscFunctionReturn(PETSC_SUCCESS); 2125 } 2126 2127 /*@C 2128 MatSeqSELLGetVarSliceSize - returns the variance of the slice size. 2129 2130 Not Collective 2131 2132 Input Parameter: 2133 . A - a MATSEQSELL matrix 2134 2135 Output Parameter: 2136 . variance - variance of the slice size 2137 2138 Level: intermediate 2139 2140 .seealso: `MATSEQSELL`, `MatSeqSELLSetSliceHeight()` 2141 @*/ 2142 PetscErrorCode MatSeqSELLGetVarSliceSize(Mat A, PetscReal *variance) 2143 { 2144 PetscFunctionBegin; 2145 PetscUseMethod(A, "MatSeqSELLGetVarSliceSize_C", (Mat, PetscReal *), (A, variance)); 2146 PetscFunctionReturn(PETSC_SUCCESS); 2147 } 2148 2149 #if defined(PETSC_HAVE_CUDA) 2150 PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat); 2151 #endif 2152 2153 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B) 2154 { 2155 Mat_SeqSELL *b; 2156 PetscMPIInt size; 2157 2158 PetscFunctionBegin; 2159 PetscCall(PetscCitationsRegister(citation, &cited)); 2160 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size)); 2161 PetscCheck(size <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Comm must be of size 1"); 2162 2163 PetscCall(PetscNew(&b)); 2164 2165 B->data = (void *)b; 2166 B->ops[0] = MatOps_Values; 2167 2168 b->row = NULL; 2169 b->col = NULL; 2170 b->icol = NULL; 2171 b->reallocs = 0; 2172 b->ignorezeroentries = PETSC_FALSE; 2173 b->roworiented = PETSC_TRUE; 2174 b->nonew = 0; 2175 b->diag = NULL; 2176 b->solve_work = NULL; 2177 B->spptr = NULL; 2178 b->saved_values = NULL; 2179 b->idiag = NULL; 2180 b->mdiag = NULL; 2181 b->ssor_work = NULL; 2182 b->omega = 1.0; 2183 b->fshift = 0.0; 2184 b->idiagvalid = PETSC_FALSE; 2185 b->keepnonzeropattern = PETSC_FALSE; 2186 b->sliceheight = 0; 2187 2188 PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELL)); 2189 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetArray_C", MatSeqSELLGetArray_SeqSELL)); 2190 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLRestoreArray_C", MatSeqSELLRestoreArray_SeqSELL)); 2191 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqSELL)); 2192 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqSELL)); 2193 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetPreallocation_C", MatSeqSELLSetPreallocation_SeqSELL)); 2194 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqaij_C", MatConvert_SeqSELL_SeqAIJ)); 2195 #if defined(PETSC_HAVE_CUDA) 2196 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqsellcuda_C", MatConvert_SeqSELL_SeqSELLCUDA)); 2197 #endif 2198 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetFillRatio_C", MatSeqSELLGetFillRatio_SeqSELL)); 2199 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetMaxSliceWidth_C", MatSeqSELLGetMaxSliceWidth_SeqSELL)); 2200 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetAvgSliceWidth_C", MatSeqSELLGetAvgSliceWidth_SeqSELL)); 2201 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetVarSliceSize_C", MatSeqSELLGetVarSliceSize_SeqSELL)); 2202 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetSliceHeight_C", MatSeqSELLSetSliceHeight_SeqSELL)); 2203 2204 PetscObjectOptionsBegin((PetscObject)B); 2205 { 2206 PetscInt newsh = -1; 2207 PetscBool flg; 2208 #if defined(PETSC_HAVE_CUDA) 2209 PetscInt chunksize = 0; 2210 #endif 2211 2212 PetscCall(PetscOptionsInt("-mat_sell_slice_height", "Set the slice height used to store SELL matrix", "MatSELLSetSliceHeight", newsh, &newsh, &flg)); 2213 if (flg) { PetscCall(MatSeqSELLSetSliceHeight(B, newsh)); } 2214 #if defined(PETSC_HAVE_CUDA) 2215 PetscCall(PetscOptionsInt("-mat_sell_chunk_size", "Set the chunksize for load-balanced CUDA kernels. Choices include 64,128,256,512,1024", NULL, chunksize, &chunksize, &flg)); 2216 if (flg) { 2217 PetscCheck(chunksize >= 64 && chunksize <= 1024 && chunksize % 64 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "chunksize must be a number in {64,128,256,512,1024}: value %" PetscInt_FMT, chunksize); 2218 b->chunksize = chunksize; 2219 } 2220 #endif 2221 } 2222 PetscOptionsEnd(); 2223 PetscFunctionReturn(PETSC_SUCCESS); 2224 } 2225 2226 /* 2227 Given a matrix generated with MatGetFactor() duplicates all the information in A into B 2228 */ 2229 static PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace) 2230 { 2231 Mat_SeqSELL *c = (Mat_SeqSELL *)C->data, *a = (Mat_SeqSELL *)A->data; 2232 PetscInt i, m = A->rmap->n; 2233 PetscInt totalslices = a->totalslices; 2234 2235 PetscFunctionBegin; 2236 C->factortype = A->factortype; 2237 c->row = NULL; 2238 c->col = NULL; 2239 c->icol = NULL; 2240 c->reallocs = 0; 2241 C->assembled = PETSC_TRUE; 2242 2243 PetscCall(PetscLayoutReference(A->rmap, &C->rmap)); 2244 PetscCall(PetscLayoutReference(A->cmap, &C->cmap)); 2245 2246 PetscCall(PetscMalloc1(a->sliceheight * totalslices, &c->rlen)); 2247 PetscCall(PetscMalloc1(totalslices + 1, &c->sliidx)); 2248 2249 for (i = 0; i < m; i++) c->rlen[i] = a->rlen[i]; 2250 for (i = 0; i < totalslices + 1; i++) c->sliidx[i] = a->sliidx[i]; 2251 2252 /* allocate the matrix space */ 2253 if (mallocmatspace) { 2254 PetscCall(PetscMalloc2(a->maxallocmat, &c->val, a->maxallocmat, &c->colidx)); 2255 2256 c->singlemalloc = PETSC_TRUE; 2257 2258 if (m > 0) { 2259 PetscCall(PetscArraycpy(c->colidx, a->colidx, a->maxallocmat)); 2260 if (cpvalues == MAT_COPY_VALUES) { 2261 PetscCall(PetscArraycpy(c->val, a->val, a->maxallocmat)); 2262 } else { 2263 PetscCall(PetscArrayzero(c->val, a->maxallocmat)); 2264 } 2265 } 2266 } 2267 2268 c->ignorezeroentries = a->ignorezeroentries; 2269 c->roworiented = a->roworiented; 2270 c->nonew = a->nonew; 2271 if (a->diag) { 2272 PetscCall(PetscMalloc1(m, &c->diag)); 2273 for (i = 0; i < m; i++) c->diag[i] = a->diag[i]; 2274 } else c->diag = NULL; 2275 2276 c->solve_work = NULL; 2277 c->saved_values = NULL; 2278 c->idiag = NULL; 2279 c->ssor_work = NULL; 2280 c->keepnonzeropattern = a->keepnonzeropattern; 2281 c->free_val = PETSC_TRUE; 2282 c->free_colidx = PETSC_TRUE; 2283 2284 c->maxallocmat = a->maxallocmat; 2285 c->maxallocrow = a->maxallocrow; 2286 c->rlenmax = a->rlenmax; 2287 c->nz = a->nz; 2288 C->preallocated = PETSC_TRUE; 2289 2290 c->nonzerorowcnt = a->nonzerorowcnt; 2291 C->nonzerostate = A->nonzerostate; 2292 2293 PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist)); 2294 PetscFunctionReturn(PETSC_SUCCESS); 2295 } 2296 2297 PetscErrorCode MatDuplicate_SeqSELL(Mat A, MatDuplicateOption cpvalues, Mat *B) 2298 { 2299 PetscFunctionBegin; 2300 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 2301 PetscCall(MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n)); 2302 if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 2303 PetscCall(MatSetType(*B, ((PetscObject)A)->type_name)); 2304 PetscCall(MatDuplicateNoCreate_SeqSELL(*B, A, cpvalues, PETSC_TRUE)); 2305 PetscFunctionReturn(PETSC_SUCCESS); 2306 } 2307 2308 /*MC 2309 MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices, 2310 based on the sliced Ellpack format 2311 2312 Options Database Key: 2313 . -mat_type seqsell - sets the matrix type to "`MATSEQELL` during a call to `MatSetFromOptions()` 2314 2315 Level: beginner 2316 2317 .seealso: `Mat`, `MatCreateSeqSell()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ` 2318 M*/ 2319 2320 /*MC 2321 MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices. 2322 2323 This matrix type is identical to `MATSEQSELL` when constructed with a single process communicator, 2324 and `MATMPISELL` otherwise. As a result, for single process communicators, 2325 `MatSeqSELLSetPreallocation()` is supported, and similarly `MatMPISELLSetPreallocation()` is supported 2326 for communicators controlling multiple processes. It is recommended that you call both of 2327 the above preallocation routines for simplicity. 2328 2329 Options Database Key: 2330 . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions() 2331 2332 Level: beginner 2333 2334 Notes: 2335 This format is only supported for real scalars, double precision, and 32-bit indices (the defaults). 2336 2337 It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of 2338 non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement. 2339 2340 Developer Notes: 2341 On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance. 2342 2343 The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8 2344 .vb 2345 (2 0 3 4) 2346 Consider the matrix A = (5 0 6 0) 2347 (0 0 7 8) 2348 (0 0 9 9) 2349 2350 symbolically the Ellpack format can be written as 2351 2352 (2 3 4 |) (0 2 3 |) 2353 v = (5 6 0 |) colidx = (0 2 2 |) 2354 -------- --------- 2355 (7 8 |) (2 3 |) 2356 (9 9 |) (2 3 |) 2357 2358 The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case). 2359 Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and 2360 zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice. 2361 2362 The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9) and for colidx is (0 0 2 2 3 2 2 2 3 3) 2363 2364 .ve 2365 2366 See MatMult_SeqSELL() for how this format is used with the SIMD operations to achieve high performance. 2367 2368 References: 2369 . * - Hong Zhang, Richard T. Mills, Karl Rupp, and Barry F. Smith, Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}, 2370 Proceedings of the 47th International Conference on Parallel Processing, 2018. 2371 2372 .seealso: `Mat`, `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSell()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ` 2373 M*/ 2374 2375 /*@C 2376 MatCreateSeqSELL - Creates a sparse matrix in `MATSEQSELL` format. 2377 2378 Collective 2379 2380 Input Parameters: 2381 + comm - MPI communicator, set to `PETSC_COMM_SELF` 2382 . m - number of rows 2383 . n - number of columns 2384 . rlenmax - maximum number of nonzeros in a row, ignored if `rlen` is provided 2385 - rlen - array containing the number of nonzeros in the various rows (possibly different for each row) or NULL 2386 2387 Output Parameter: 2388 . A - the matrix 2389 2390 Level: intermediate 2391 2392 Notes: 2393 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 2394 MatXXXXSetPreallocation() paradigm instead of this routine directly. 2395 [MatXXXXSetPreallocation() is, for example, `MatSeqSELLSetPreallocation()`] 2396 2397 Specify the preallocated storage with either `rlenmax` or `rlen` (not both). 2398 Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory 2399 allocation. 2400 2401 .seealso: `Mat`, `MATSEQSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATMPISELL` 2402 @*/ 2403 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt rlenmax, const PetscInt rlen[], Mat *A) 2404 { 2405 PetscFunctionBegin; 2406 PetscCall(MatCreate(comm, A)); 2407 PetscCall(MatSetSizes(*A, m, n, m, n)); 2408 PetscCall(MatSetType(*A, MATSEQSELL)); 2409 PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A, rlenmax, rlen)); 2410 PetscFunctionReturn(PETSC_SUCCESS); 2411 } 2412 2413 PetscErrorCode MatEqual_SeqSELL(Mat A, Mat B, PetscBool *flg) 2414 { 2415 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data, *b = (Mat_SeqSELL *)B->data; 2416 PetscInt totalslices = a->totalslices; 2417 2418 PetscFunctionBegin; 2419 /* If the matrix dimensions are not equal,or no of nonzeros */ 2420 if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) || (a->nz != b->nz) || (a->rlenmax != b->rlenmax)) { 2421 *flg = PETSC_FALSE; 2422 PetscFunctionReturn(PETSC_SUCCESS); 2423 } 2424 /* if the a->colidx are the same */ 2425 PetscCall(PetscArraycmp(a->colidx, b->colidx, a->sliidx[totalslices], flg)); 2426 if (!*flg) PetscFunctionReturn(PETSC_SUCCESS); 2427 /* if a->val are the same */ 2428 PetscCall(PetscArraycmp(a->val, b->val, a->sliidx[totalslices], flg)); 2429 PetscFunctionReturn(PETSC_SUCCESS); 2430 } 2431 2432 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A) 2433 { 2434 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 2435 2436 PetscFunctionBegin; 2437 a->idiagvalid = PETSC_FALSE; 2438 PetscFunctionReturn(PETSC_SUCCESS); 2439 } 2440 2441 PetscErrorCode MatConjugate_SeqSELL(Mat A) 2442 { 2443 #if defined(PETSC_USE_COMPLEX) 2444 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 2445 PetscInt i; 2446 PetscScalar *val = a->val; 2447 2448 PetscFunctionBegin; 2449 for (i = 0; i < a->sliidx[a->totalslices]; i++) { val[i] = PetscConj(val[i]); } 2450 #if defined(PETSC_HAVE_CUDA) 2451 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU; 2452 #endif 2453 #else 2454 PetscFunctionBegin; 2455 #endif 2456 PetscFunctionReturn(PETSC_SUCCESS); 2457 } 2458