1 2 /* 3 Defines the basic matrix operations for the SELL matrix storage format. 4 */ 5 #include <../src/mat/impls/sell/seq/sell.h> /*I "petscmat.h" I*/ 6 #include <petscblaslapack.h> 7 #include <petsc/private/kernels/blocktranspose.h> 8 9 static PetscBool cited = PETSC_FALSE; 10 static const char citation[] = "@inproceedings{ZhangELLPACK2018,\n" 11 " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n" 12 " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n" 13 " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n" 14 " year = 2018\n" 15 "}\n"; 16 17 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 18 19 #include <immintrin.h> 20 21 #if !defined(_MM_SCALE_8) 22 #define _MM_SCALE_8 8 23 #endif 24 25 #if defined(__AVX512F__) 26 /* these do not work 27 vec_idx = _mm512_loadunpackhi_epi32(vec_idx,acolidx); 28 vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval); 29 */ 30 #define AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \ 31 /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \ 32 vec_idx = _mm256_loadu_si256((__m256i const *)acolidx); \ 33 vec_vals = _mm512_loadu_pd(aval); \ 34 vec_x = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); \ 35 vec_y = _mm512_fmadd_pd(vec_x, vec_vals, vec_y) 36 #elif defined(__AVX2__) && defined(__FMA__) 37 #define AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \ 38 vec_vals = _mm256_loadu_pd(aval); \ 39 vec_idx = _mm_loadu_si128((__m128i const *)acolidx); /* SSE2 */ \ 40 vec_x = _mm256_i32gather_pd(x, vec_idx, _MM_SCALE_8); \ 41 vec_y = _mm256_fmadd_pd(vec_x, vec_vals, vec_y) 42 #endif 43 #endif /* PETSC_HAVE_IMMINTRIN_H */ 44 45 /*@C 46 MatSeqSELLSetPreallocation - For good matrix assembly performance 47 the user should preallocate the matrix storage by setting the parameter `nz` 48 (or the array `nnz`). 49 50 Collective 51 52 Input Parameters: 53 + B - The `MATSEQSELL` matrix 54 . rlenmax - number of nonzeros per row (same for all rows), ignored if `rlen` is provided 55 - rlen - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 56 57 Level: intermediate 58 59 Notes: 60 Specify the preallocated storage with either `rlenmax` or `rlen` (not both). 61 Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory 62 allocation. 63 64 You can call `MatGetInfo()` to get information on how effective the preallocation was; 65 for example the fields mallocs,nz_allocated,nz_used,nz_unneeded; 66 You can also run with the option `-info` and look for messages with the string 67 malloc in them to see if additional memory allocation was needed. 68 69 Developer Notes: 70 Use `rlenmax` of `MAT_SKIP_ALLOCATION` to not allocate any space for the matrix 71 entries or columns indices. 72 73 The maximum number of nonzeos in any row should be as accurate as possible. 74 If it is underestimated, you will get bad performance due to reallocation 75 (`MatSeqXSELLReallocateSELL()`). 76 77 .seealso: `Mat`, `MATSEQSELL`, `MATSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()` 78 @*/ 79 PetscErrorCode MatSeqSELLSetPreallocation(Mat B, PetscInt rlenmax, const PetscInt rlen[]) 80 { 81 PetscFunctionBegin; 82 PetscValidHeaderSpecific(B, MAT_CLASSID, 1); 83 PetscValidType(B, 1); 84 PetscTryMethod(B, "MatSeqSELLSetPreallocation_C", (Mat, PetscInt, const PetscInt[]), (B, rlenmax, rlen)); 85 PetscFunctionReturn(PETSC_SUCCESS); 86 } 87 88 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B, PetscInt maxallocrow, const PetscInt rlen[]) 89 { 90 Mat_SeqSELL *b; 91 PetscInt i, j, totalslices; 92 #if defined(PETSC_HAVE_CUDA) 93 PetscInt rlenmax = 0; 94 #endif 95 PetscBool skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE; 96 97 PetscFunctionBegin; 98 if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE; 99 if (maxallocrow == MAT_SKIP_ALLOCATION) { 100 skipallocation = PETSC_TRUE; 101 maxallocrow = 0; 102 } 103 104 PetscCall(PetscLayoutSetUp(B->rmap)); 105 PetscCall(PetscLayoutSetUp(B->cmap)); 106 107 /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */ 108 if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5; 109 PetscCheck(maxallocrow >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "maxallocrow cannot be less than 0: value %" PetscInt_FMT, maxallocrow); 110 if (rlen) { 111 for (i = 0; i < B->rmap->n; i++) { 112 PetscCheck(rlen[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, rlen[i]); 113 PetscCheck(rlen[i] <= B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT, i, rlen[i], B->cmap->n); 114 } 115 } 116 117 B->preallocated = PETSC_TRUE; 118 119 b = (Mat_SeqSELL *)B->data; 120 121 if (!b->sliceheight) { /* not set yet */ 122 #if defined(PETSC_HAVE_CUDA) 123 b->sliceheight = 16; 124 #else 125 b->sliceheight = 8; 126 #endif 127 } 128 totalslices = PetscCeilInt(B->rmap->n, b->sliceheight); 129 b->totalslices = totalslices; 130 if (!skipallocation) { 131 if (B->rmap->n % b->sliceheight) PetscCall(PetscInfo(B, "Padding rows to the SEQSELL matrix because the number of rows is not the multiple of the slice height (value %" PetscInt_FMT ")\n", B->rmap->n)); 132 133 if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */ 134 PetscCall(PetscMalloc1(totalslices + 1, &b->sliidx)); 135 } 136 if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */ 137 if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10; 138 else if (maxallocrow < 0) maxallocrow = 1; 139 #if defined(PETSC_HAVE_CUDA) 140 rlenmax = maxallocrow; 141 /* Pad the slice to DEVICE_MEM_ALIGN */ 142 while (b->sliceheight * maxallocrow % DEVICE_MEM_ALIGN) maxallocrow++; 143 #endif 144 for (i = 0; i <= totalslices; i++) b->sliidx[i] = b->sliceheight * i * maxallocrow; 145 } else { 146 #if defined(PETSC_HAVE_CUDA) 147 PetscInt mul = DEVICE_MEM_ALIGN / b->sliceheight; 148 #endif 149 maxallocrow = 0; 150 b->sliidx[0] = 0; 151 for (i = 1; i < totalslices; i++) { 152 b->sliidx[i] = 0; 153 for (j = 0; j < b->sliceheight; j++) { b->sliidx[i] = PetscMax(b->sliidx[i], rlen[b->sliceheight * (i - 1) + j]); } 154 #if defined(PETSC_HAVE_CUDA) 155 rlenmax = PetscMax(b->sliidx[i], rlenmax); 156 /* Pad the slice to DEVICE_MEM_ALIGN */ 157 b->sliidx[i] = ((b->sliidx[i] - 1) / mul + 1) * mul; 158 #endif 159 maxallocrow = PetscMax(b->sliidx[i], maxallocrow); 160 PetscCall(PetscIntSumError(b->sliidx[i - 1], b->sliceheight * b->sliidx[i], &b->sliidx[i])); 161 } 162 /* last slice */ 163 b->sliidx[totalslices] = 0; 164 for (j = b->sliceheight * (totalslices - 1); j < B->rmap->n; j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices], rlen[j]); 165 #if defined(PETSC_HAVE_CUDA) 166 rlenmax = PetscMax(b->sliidx[i], rlenmax); 167 b->sliidx[totalslices] = ((b->sliidx[totalslices] - 1) / mul + 1) * mul; 168 #endif 169 maxallocrow = PetscMax(b->sliidx[totalslices], maxallocrow); 170 b->sliidx[totalslices] = b->sliidx[totalslices - 1] + b->sliceheight * b->sliidx[totalslices]; 171 } 172 173 /* allocate space for val, colidx, rlen */ 174 /* FIXME: should B's old memory be unlogged? */ 175 PetscCall(MatSeqXSELLFreeSELL(B, &b->val, &b->colidx)); 176 /* FIXME: assuming an element of the bit array takes 8 bits */ 177 PetscCall(PetscMalloc2(b->sliidx[totalslices], &b->val, b->sliidx[totalslices], &b->colidx)); 178 /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */ 179 PetscCall(PetscCalloc1(b->sliceheight * totalslices, &b->rlen)); 180 181 b->singlemalloc = PETSC_TRUE; 182 b->free_val = PETSC_TRUE; 183 b->free_colidx = PETSC_TRUE; 184 } else { 185 b->free_val = PETSC_FALSE; 186 b->free_colidx = PETSC_FALSE; 187 } 188 189 b->nz = 0; 190 b->maxallocrow = maxallocrow; 191 #if defined(PETSC_HAVE_CUDA) 192 b->rlenmax = rlenmax; 193 #else 194 b->rlenmax = maxallocrow; 195 #endif 196 b->maxallocmat = b->sliidx[totalslices]; 197 B->info.nz_unneeded = (double)b->maxallocmat; 198 if (realalloc) PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE)); 199 PetscFunctionReturn(PETSC_SUCCESS); 200 } 201 202 PetscErrorCode MatGetRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v) 203 { 204 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 205 PetscInt shift; 206 207 PetscFunctionBegin; 208 PetscCheck(row >= 0 && row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range", row); 209 if (nz) *nz = a->rlen[row]; 210 shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight); 211 if (!a->getrowcols) { PetscCall(PetscMalloc2(a->rlenmax, &a->getrowcols, a->rlenmax, &a->getrowvals)); } 212 if (idx) { 213 PetscInt j; 214 for (j = 0; j < a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift + a->sliceheight * j]; 215 *idx = a->getrowcols; 216 } 217 if (v) { 218 PetscInt j; 219 for (j = 0; j < a->rlen[row]; j++) a->getrowvals[j] = a->val[shift + a->sliceheight * j]; 220 *v = a->getrowvals; 221 } 222 PetscFunctionReturn(PETSC_SUCCESS); 223 } 224 225 PetscErrorCode MatRestoreRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v) 226 { 227 PetscFunctionBegin; 228 PetscFunctionReturn(PETSC_SUCCESS); 229 } 230 231 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat) 232 { 233 Mat B; 234 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 235 PetscInt i; 236 237 PetscFunctionBegin; 238 if (reuse == MAT_REUSE_MATRIX) { 239 B = *newmat; 240 PetscCall(MatZeroEntries(B)); 241 } else { 242 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B)); 243 PetscCall(MatSetSizes(B, A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N)); 244 PetscCall(MatSetType(B, MATSEQAIJ)); 245 PetscCall(MatSeqAIJSetPreallocation(B, 0, a->rlen)); 246 } 247 248 for (i = 0; i < A->rmap->n; i++) { 249 PetscInt nz = 0, *cols = NULL; 250 PetscScalar *vals = NULL; 251 252 PetscCall(MatGetRow_SeqSELL(A, i, &nz, &cols, &vals)); 253 PetscCall(MatSetValues(B, 1, &i, nz, cols, vals, INSERT_VALUES)); 254 PetscCall(MatRestoreRow_SeqSELL(A, i, &nz, &cols, &vals)); 255 } 256 257 PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY)); 258 PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY)); 259 B->rmap->bs = A->rmap->bs; 260 261 if (reuse == MAT_INPLACE_MATRIX) { 262 PetscCall(MatHeaderReplace(A, &B)); 263 } else { 264 *newmat = B; 265 } 266 PetscFunctionReturn(PETSC_SUCCESS); 267 } 268 269 #include <../src/mat/impls/aij/seq/aij.h> 270 271 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A, MatType newtype, MatReuse reuse, Mat *newmat) 272 { 273 Mat B; 274 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 275 PetscInt *ai = a->i, m = A->rmap->N, n = A->cmap->N, i, *rowlengths, row, ncols; 276 const PetscInt *cols; 277 const PetscScalar *vals; 278 279 PetscFunctionBegin; 280 281 if (reuse == MAT_REUSE_MATRIX) { 282 B = *newmat; 283 } else { 284 if (PetscDefined(USE_DEBUG) || !a->ilen) { 285 PetscCall(PetscMalloc1(m, &rowlengths)); 286 for (i = 0; i < m; i++) rowlengths[i] = ai[i + 1] - ai[i]; 287 } 288 if (PetscDefined(USE_DEBUG) && a->ilen) { 289 PetscBool eq; 290 PetscCall(PetscMemcmp(rowlengths, a->ilen, m * sizeof(PetscInt), &eq)); 291 PetscCheck(eq, PETSC_COMM_SELF, PETSC_ERR_PLIB, "SeqAIJ ilen array incorrect"); 292 PetscCall(PetscFree(rowlengths)); 293 rowlengths = a->ilen; 294 } else if (a->ilen) rowlengths = a->ilen; 295 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B)); 296 PetscCall(MatSetSizes(B, m, n, m, n)); 297 PetscCall(MatSetType(B, MATSEQSELL)); 298 PetscCall(MatSeqSELLSetPreallocation(B, 0, rowlengths)); 299 if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths)); 300 } 301 302 for (row = 0; row < m; row++) { 303 PetscCall(MatGetRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals)); 304 PetscCall(MatSetValues_SeqSELL(B, 1, &row, ncols, cols, vals, INSERT_VALUES)); 305 PetscCall(MatRestoreRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals)); 306 } 307 PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY)); 308 PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY)); 309 B->rmap->bs = A->rmap->bs; 310 311 if (reuse == MAT_INPLACE_MATRIX) { 312 PetscCall(MatHeaderReplace(A, &B)); 313 } else { 314 *newmat = B; 315 } 316 PetscFunctionReturn(PETSC_SUCCESS); 317 } 318 319 PetscErrorCode MatMult_SeqSELL(Mat A, Vec xx, Vec yy) 320 { 321 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 322 PetscScalar *y; 323 const PetscScalar *x; 324 const MatScalar *aval = a->val; 325 PetscInt totalslices = a->totalslices; 326 const PetscInt *acolidx = a->colidx; 327 PetscInt i, j; 328 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 329 __m512d vec_x, vec_y, vec_vals; 330 __m256i vec_idx; 331 __mmask8 mask; 332 __m512d vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4; 333 __m256i vec_idx2, vec_idx3, vec_idx4; 334 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 335 __m128i vec_idx; 336 __m256d vec_x, vec_y, vec_y2, vec_vals; 337 MatScalar yval; 338 PetscInt r, rows_left, row, nnz_in_row; 339 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 340 __m128d vec_x_tmp; 341 __m256d vec_x, vec_y, vec_y2, vec_vals; 342 MatScalar yval; 343 PetscInt r, rows_left, row, nnz_in_row; 344 #else 345 PetscInt k, sliceheight = a->sliceheight; 346 PetscScalar *sum; 347 #endif 348 349 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 350 #pragma disjoint(*x, *y, *aval) 351 #endif 352 353 PetscFunctionBegin; 354 PetscCall(VecGetArrayRead(xx, &x)); 355 PetscCall(VecGetArray(yy, &y)); 356 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 357 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 358 for (i = 0; i < totalslices; i++) { /* loop over slices */ 359 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 360 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 361 362 vec_y = _mm512_setzero_pd(); 363 vec_y2 = _mm512_setzero_pd(); 364 vec_y3 = _mm512_setzero_pd(); 365 vec_y4 = _mm512_setzero_pd(); 366 367 j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */ 368 switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) { 369 case 3: 370 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 371 acolidx += 8; 372 aval += 8; 373 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 374 acolidx += 8; 375 aval += 8; 376 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 377 acolidx += 8; 378 aval += 8; 379 j += 3; 380 break; 381 case 2: 382 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 383 acolidx += 8; 384 aval += 8; 385 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 386 acolidx += 8; 387 aval += 8; 388 j += 2; 389 break; 390 case 1: 391 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 392 acolidx += 8; 393 aval += 8; 394 j += 1; 395 break; 396 } 397 #pragma novector 398 for (; j < (a->sliidx[i + 1] >> 3); j += 4) { 399 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 400 acolidx += 8; 401 aval += 8; 402 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 403 acolidx += 8; 404 aval += 8; 405 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 406 acolidx += 8; 407 aval += 8; 408 AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4); 409 acolidx += 8; 410 aval += 8; 411 } 412 413 vec_y = _mm512_add_pd(vec_y, vec_y2); 414 vec_y = _mm512_add_pd(vec_y, vec_y3); 415 vec_y = _mm512_add_pd(vec_y, vec_y4); 416 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 417 mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07))); 418 _mm512_mask_storeu_pd(&y[8 * i], mask, vec_y); 419 } else { 420 _mm512_storeu_pd(&y[8 * i], vec_y); 421 } 422 } 423 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 424 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 425 for (i = 0; i < totalslices; i++) { /* loop over full slices */ 426 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 427 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 428 429 /* last slice may have padding rows. Don't use vectorization. */ 430 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 431 rows_left = A->rmap->n - 8 * i; 432 for (r = 0; r < rows_left; ++r) { 433 yval = (MatScalar)0; 434 row = 8 * i + r; 435 nnz_in_row = a->rlen[row]; 436 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]]; 437 y[row] = yval; 438 } 439 break; 440 } 441 442 vec_y = _mm256_setzero_pd(); 443 vec_y2 = _mm256_setzero_pd(); 444 445 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 446 #pragma novector 447 #pragma unroll(2) 448 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 449 AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 450 aval += 4; 451 acolidx += 4; 452 AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y2); 453 aval += 4; 454 acolidx += 4; 455 } 456 457 _mm256_storeu_pd(y + i * 8, vec_y); 458 _mm256_storeu_pd(y + i * 8 + 4, vec_y2); 459 } 460 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 461 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 462 for (i = 0; i < totalslices; i++) { /* loop over full slices */ 463 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 464 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 465 466 vec_y = _mm256_setzero_pd(); 467 vec_y2 = _mm256_setzero_pd(); 468 469 /* last slice may have padding rows. Don't use vectorization. */ 470 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 471 rows_left = A->rmap->n - 8 * i; 472 for (r = 0; r < rows_left; ++r) { 473 yval = (MatScalar)0; 474 row = 8 * i + r; 475 nnz_in_row = a->rlen[row]; 476 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]]; 477 y[row] = yval; 478 } 479 break; 480 } 481 482 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 483 #pragma novector 484 #pragma unroll(2) 485 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 486 vec_vals = _mm256_loadu_pd(aval); 487 vec_x_tmp = _mm_setzero_pd(); 488 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 489 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 490 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 491 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 492 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 493 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 494 vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y); 495 aval += 4; 496 497 vec_vals = _mm256_loadu_pd(aval); 498 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 499 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 500 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 501 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 502 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 503 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 504 vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2); 505 aval += 4; 506 } 507 508 _mm256_storeu_pd(y + i * 8, vec_y); 509 _mm256_storeu_pd(y + i * 8 + 4, vec_y2); 510 } 511 #else 512 PetscCall(PetscMalloc1(sliceheight, &sum)); 513 for (i = 0; i < totalslices; i++) { /* loop over slices */ 514 for (j = 0; j < sliceheight; j++) { 515 sum[j] = 0.0; 516 for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]]; 517 } 518 if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { /* if last slice has padding rows */ 519 for (j = 0; j < (A->rmap->n % sliceheight); j++) y[sliceheight * i + j] = sum[j]; 520 } else { 521 for (j = 0; j < sliceheight; j++) y[sliceheight * i + j] = sum[j]; 522 } 523 } 524 PetscCall(PetscFree(sum)); 525 #endif 526 527 PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); /* theoretical minimal FLOPs */ 528 PetscCall(VecRestoreArrayRead(xx, &x)); 529 PetscCall(VecRestoreArray(yy, &y)); 530 PetscFunctionReturn(PETSC_SUCCESS); 531 } 532 533 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h> 534 PetscErrorCode MatMultAdd_SeqSELL(Mat A, Vec xx, Vec yy, Vec zz) 535 { 536 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 537 PetscScalar *y, *z; 538 const PetscScalar *x; 539 const MatScalar *aval = a->val; 540 PetscInt totalslices = a->totalslices; 541 const PetscInt *acolidx = a->colidx; 542 PetscInt i, j; 543 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 544 __m512d vec_x, vec_y, vec_vals; 545 __m256i vec_idx; 546 __mmask8 mask = 0; 547 __m512d vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4; 548 __m256i vec_idx2, vec_idx3, vec_idx4; 549 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 550 __m128d vec_x_tmp; 551 __m256d vec_x, vec_y, vec_y2, vec_vals; 552 MatScalar yval; 553 PetscInt r, row, nnz_in_row; 554 #else 555 PetscInt k, sliceheight = a->sliceheight; 556 PetscScalar *sum; 557 #endif 558 559 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 560 #pragma disjoint(*x, *y, *aval) 561 #endif 562 563 PetscFunctionBegin; 564 if (!a->nz) { 565 PetscCall(VecCopy(yy, zz)); 566 PetscFunctionReturn(PETSC_SUCCESS); 567 } 568 PetscCall(VecGetArrayRead(xx, &x)); 569 PetscCall(VecGetArrayPair(yy, zz, &y, &z)); 570 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 571 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 572 for (i = 0; i < totalslices; i++) { /* loop over slices */ 573 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 574 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 575 576 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 577 mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07))); 578 vec_y = _mm512_mask_loadu_pd(vec_y, mask, &y[8 * i]); 579 } else { 580 vec_y = _mm512_loadu_pd(&y[8 * i]); 581 } 582 vec_y2 = _mm512_setzero_pd(); 583 vec_y3 = _mm512_setzero_pd(); 584 vec_y4 = _mm512_setzero_pd(); 585 586 j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */ 587 switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) { 588 case 3: 589 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 590 acolidx += 8; 591 aval += 8; 592 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 593 acolidx += 8; 594 aval += 8; 595 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 596 acolidx += 8; 597 aval += 8; 598 j += 3; 599 break; 600 case 2: 601 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 602 acolidx += 8; 603 aval += 8; 604 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 605 acolidx += 8; 606 aval += 8; 607 j += 2; 608 break; 609 case 1: 610 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 611 acolidx += 8; 612 aval += 8; 613 j += 1; 614 break; 615 } 616 #pragma novector 617 for (; j < (a->sliidx[i + 1] >> 3); j += 4) { 618 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 619 acolidx += 8; 620 aval += 8; 621 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 622 acolidx += 8; 623 aval += 8; 624 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 625 acolidx += 8; 626 aval += 8; 627 AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4); 628 acolidx += 8; 629 aval += 8; 630 } 631 632 vec_y = _mm512_add_pd(vec_y, vec_y2); 633 vec_y = _mm512_add_pd(vec_y, vec_y3); 634 vec_y = _mm512_add_pd(vec_y, vec_y4); 635 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 636 _mm512_mask_storeu_pd(&z[8 * i], mask, vec_y); 637 } else { 638 _mm512_storeu_pd(&z[8 * i], vec_y); 639 } 640 } 641 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 642 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 643 for (i = 0; i < totalslices; i++) { /* loop over full slices */ 644 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 645 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 646 647 /* last slice may have padding rows. Don't use vectorization. */ 648 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 649 for (r = 0; r < (A->rmap->n & 0x07); ++r) { 650 row = 8 * i + r; 651 yval = (MatScalar)0.0; 652 nnz_in_row = a->rlen[row]; 653 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]]; 654 z[row] = y[row] + yval; 655 } 656 break; 657 } 658 659 vec_y = _mm256_loadu_pd(y + 8 * i); 660 vec_y2 = _mm256_loadu_pd(y + 8 * i + 4); 661 662 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 663 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 664 vec_vals = _mm256_loadu_pd(aval); 665 vec_x_tmp = _mm_setzero_pd(); 666 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 667 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 668 vec_x = _mm256_setzero_pd(); 669 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 670 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 671 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 672 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 673 vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y); 674 aval += 4; 675 676 vec_vals = _mm256_loadu_pd(aval); 677 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 678 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 679 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 680 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 681 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 682 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 683 vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2); 684 aval += 4; 685 } 686 687 _mm256_storeu_pd(z + i * 8, vec_y); 688 _mm256_storeu_pd(z + i * 8 + 4, vec_y2); 689 } 690 #else 691 PetscCall(PetscMalloc1(sliceheight, &sum)); 692 for (i = 0; i < totalslices; i++) { /* loop over slices */ 693 for (j = 0; j < sliceheight; j++) { 694 sum[j] = 0.0; 695 for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]]; 696 } 697 if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { 698 for (j = 0; j < (A->rmap->n % sliceheight); j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j]; 699 } else { 700 for (j = 0; j < sliceheight; j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j]; 701 } 702 } 703 PetscCall(PetscFree(sum)); 704 #endif 705 706 PetscCall(PetscLogFlops(2.0 * a->nz)); 707 PetscCall(VecRestoreArrayRead(xx, &x)); 708 PetscCall(VecRestoreArrayPair(yy, zz, &y, &z)); 709 PetscFunctionReturn(PETSC_SUCCESS); 710 } 711 712 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A, Vec xx, Vec zz, Vec yy) 713 { 714 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 715 PetscScalar *y; 716 const PetscScalar *x; 717 const MatScalar *aval = a->val; 718 const PetscInt *acolidx = a->colidx; 719 PetscInt i, j, r, row, nnz_in_row, totalslices = a->totalslices, sliceheight = a->sliceheight; 720 721 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 722 #pragma disjoint(*x, *y, *aval) 723 #endif 724 725 PetscFunctionBegin; 726 if (A->symmetric == PETSC_BOOL3_TRUE) { 727 PetscCall(MatMultAdd_SeqSELL(A, xx, zz, yy)); 728 PetscFunctionReturn(PETSC_SUCCESS); 729 } 730 if (zz != yy) PetscCall(VecCopy(zz, yy)); 731 732 if (a->nz) { 733 PetscCall(VecGetArrayRead(xx, &x)); 734 PetscCall(VecGetArray(yy, &y)); 735 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 736 if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { 737 for (r = 0; r < (A->rmap->n % sliceheight); ++r) { 738 row = sliceheight * i + r; 739 nnz_in_row = a->rlen[row]; 740 for (j = 0; j < nnz_in_row; ++j) y[acolidx[sliceheight * j + r]] += aval[sliceheight * j + r] * x[row]; 741 } 742 break; 743 } 744 for (r = 0; r < sliceheight; ++r) 745 for (j = a->sliidx[i] + r; j < a->sliidx[i + 1]; j += sliceheight) y[acolidx[j]] += aval[j] * x[sliceheight * i + r]; 746 } 747 PetscCall(PetscLogFlops(2.0 * a->nz)); 748 PetscCall(VecRestoreArrayRead(xx, &x)); 749 PetscCall(VecRestoreArray(yy, &y)); 750 } 751 PetscFunctionReturn(PETSC_SUCCESS); 752 } 753 754 PetscErrorCode MatMultTranspose_SeqSELL(Mat A, Vec xx, Vec yy) 755 { 756 PetscFunctionBegin; 757 if (A->symmetric == PETSC_BOOL3_TRUE) { 758 PetscCall(MatMult_SeqSELL(A, xx, yy)); 759 } else { 760 PetscCall(VecSet(yy, 0.0)); 761 PetscCall(MatMultTransposeAdd_SeqSELL(A, xx, yy, yy)); 762 } 763 PetscFunctionReturn(PETSC_SUCCESS); 764 } 765 766 /* 767 Checks for missing diagonals 768 */ 769 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A, PetscBool *missing, PetscInt *d) 770 { 771 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 772 PetscInt *diag, i; 773 774 PetscFunctionBegin; 775 *missing = PETSC_FALSE; 776 if (A->rmap->n > 0 && !(a->colidx)) { 777 *missing = PETSC_TRUE; 778 if (d) *d = 0; 779 PetscCall(PetscInfo(A, "Matrix has no entries therefore is missing diagonal\n")); 780 } else { 781 diag = a->diag; 782 for (i = 0; i < A->rmap->n; i++) { 783 if (diag[i] == -1) { 784 *missing = PETSC_TRUE; 785 if (d) *d = i; 786 PetscCall(PetscInfo(A, "Matrix is missing diagonal number %" PetscInt_FMT "\n", i)); 787 break; 788 } 789 } 790 } 791 PetscFunctionReturn(PETSC_SUCCESS); 792 } 793 794 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A) 795 { 796 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 797 PetscInt i, j, m = A->rmap->n, shift; 798 799 PetscFunctionBegin; 800 if (!a->diag) { 801 PetscCall(PetscMalloc1(m, &a->diag)); 802 a->free_diag = PETSC_TRUE; 803 } 804 for (i = 0; i < m; i++) { /* loop over rows */ 805 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 806 a->diag[i] = -1; 807 for (j = 0; j < a->rlen[i]; j++) { 808 if (a->colidx[shift + a->sliceheight * j] == i) { 809 a->diag[i] = shift + a->sliceheight * j; 810 break; 811 } 812 } 813 } 814 PetscFunctionReturn(PETSC_SUCCESS); 815 } 816 817 /* 818 Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways 819 */ 820 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A, PetscScalar omega, PetscScalar fshift) 821 { 822 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 823 PetscInt i, *diag, m = A->rmap->n; 824 MatScalar *val = a->val; 825 PetscScalar *idiag, *mdiag; 826 827 PetscFunctionBegin; 828 if (a->idiagvalid) PetscFunctionReturn(PETSC_SUCCESS); 829 PetscCall(MatMarkDiagonal_SeqSELL(A)); 830 diag = a->diag; 831 if (!a->idiag) { 832 PetscCall(PetscMalloc3(m, &a->idiag, m, &a->mdiag, m, &a->ssor_work)); 833 val = a->val; 834 } 835 mdiag = a->mdiag; 836 idiag = a->idiag; 837 838 if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) { 839 for (i = 0; i < m; i++) { 840 mdiag[i] = val[diag[i]]; 841 if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */ 842 PetscCheck(PetscRealPart(fshift), PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Zero diagonal on row %" PetscInt_FMT, i); 843 PetscCall(PetscInfo(A, "Zero diagonal on row %" PetscInt_FMT "\n", i)); 844 A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 845 A->factorerror_zeropivot_value = 0.0; 846 A->factorerror_zeropivot_row = i; 847 } 848 idiag[i] = 1.0 / val[diag[i]]; 849 } 850 PetscCall(PetscLogFlops(m)); 851 } else { 852 for (i = 0; i < m; i++) { 853 mdiag[i] = val[diag[i]]; 854 idiag[i] = omega / (fshift + val[diag[i]]); 855 } 856 PetscCall(PetscLogFlops(2.0 * m)); 857 } 858 a->idiagvalid = PETSC_TRUE; 859 PetscFunctionReturn(PETSC_SUCCESS); 860 } 861 862 PetscErrorCode MatZeroEntries_SeqSELL(Mat A) 863 { 864 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 865 866 PetscFunctionBegin; 867 PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices])); 868 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 869 PetscFunctionReturn(PETSC_SUCCESS); 870 } 871 872 PetscErrorCode MatDestroy_SeqSELL(Mat A) 873 { 874 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 875 876 PetscFunctionBegin; 877 PetscCall(PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->n, A->cmap->n, a->nz)); 878 PetscCall(MatSeqXSELLFreeSELL(A, &a->val, &a->colidx)); 879 PetscCall(ISDestroy(&a->row)); 880 PetscCall(ISDestroy(&a->col)); 881 PetscCall(PetscFree(a->diag)); 882 PetscCall(PetscFree(a->rlen)); 883 PetscCall(PetscFree(a->sliidx)); 884 PetscCall(PetscFree3(a->idiag, a->mdiag, a->ssor_work)); 885 PetscCall(PetscFree(a->solve_work)); 886 PetscCall(ISDestroy(&a->icol)); 887 PetscCall(PetscFree(a->saved_values)); 888 PetscCall(PetscFree2(a->getrowcols, a->getrowvals)); 889 PetscCall(PetscFree(A->data)); 890 #if defined(PETSC_HAVE_CUDA) 891 PetscCall(PetscFree(a->chunk_slice_map)); 892 #endif 893 894 PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL)); 895 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL)); 896 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL)); 897 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetPreallocation_C", NULL)); 898 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetArray_C", NULL)); 899 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLRestoreArray_C", NULL)); 900 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqaij_C", NULL)); 901 #if defined(PETSC_HAVE_CUDA) 902 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqsellcuda_C", NULL)); 903 #endif 904 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetFillRatio_C", NULL)); 905 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetMaxSliceWidth_C", NULL)); 906 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetAvgSliceWidth_C", NULL)); 907 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetVarSliceSize_C", NULL)); 908 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetSliceHeight_C", NULL)); 909 PetscFunctionReturn(PETSC_SUCCESS); 910 } 911 912 PetscErrorCode MatSetOption_SeqSELL(Mat A, MatOption op, PetscBool flg) 913 { 914 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 915 916 PetscFunctionBegin; 917 switch (op) { 918 case MAT_ROW_ORIENTED: 919 a->roworiented = flg; 920 break; 921 case MAT_KEEP_NONZERO_PATTERN: 922 a->keepnonzeropattern = flg; 923 break; 924 case MAT_NEW_NONZERO_LOCATIONS: 925 a->nonew = (flg ? 0 : 1); 926 break; 927 case MAT_NEW_NONZERO_LOCATION_ERR: 928 a->nonew = (flg ? -1 : 0); 929 break; 930 case MAT_NEW_NONZERO_ALLOCATION_ERR: 931 a->nonew = (flg ? -2 : 0); 932 break; 933 case MAT_UNUSED_NONZERO_LOCATION_ERR: 934 a->nounused = (flg ? -1 : 0); 935 break; 936 case MAT_FORCE_DIAGONAL_ENTRIES: 937 case MAT_IGNORE_OFF_PROC_ENTRIES: 938 case MAT_USE_HASH_TABLE: 939 case MAT_SORTED_FULL: 940 PetscCall(PetscInfo(A, "Option %s ignored\n", MatOptions[op])); 941 break; 942 case MAT_SPD: 943 case MAT_SYMMETRIC: 944 case MAT_STRUCTURALLY_SYMMETRIC: 945 case MAT_HERMITIAN: 946 case MAT_SYMMETRY_ETERNAL: 947 case MAT_STRUCTURAL_SYMMETRY_ETERNAL: 948 case MAT_SPD_ETERNAL: 949 /* These options are handled directly by MatSetOption() */ 950 break; 951 default: 952 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unknown option %d", op); 953 } 954 PetscFunctionReturn(PETSC_SUCCESS); 955 } 956 957 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A, Vec v) 958 { 959 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 960 PetscInt i, j, n, shift; 961 PetscScalar *x, zero = 0.0; 962 963 PetscFunctionBegin; 964 PetscCall(VecGetLocalSize(v, &n)); 965 PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 966 967 if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) { 968 PetscInt *diag = a->diag; 969 PetscCall(VecGetArray(v, &x)); 970 for (i = 0; i < n; i++) x[i] = 1.0 / a->val[diag[i]]; 971 PetscCall(VecRestoreArray(v, &x)); 972 PetscFunctionReturn(PETSC_SUCCESS); 973 } 974 975 PetscCall(VecSet(v, zero)); 976 PetscCall(VecGetArray(v, &x)); 977 for (i = 0; i < n; i++) { /* loop over rows */ 978 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 979 x[i] = 0; 980 for (j = 0; j < a->rlen[i]; j++) { 981 if (a->colidx[shift + a->sliceheight * j] == i) { 982 x[i] = a->val[shift + a->sliceheight * j]; 983 break; 984 } 985 } 986 } 987 PetscCall(VecRestoreArray(v, &x)); 988 PetscFunctionReturn(PETSC_SUCCESS); 989 } 990 991 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A, Vec ll, Vec rr) 992 { 993 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 994 const PetscScalar *l, *r; 995 PetscInt i, j, m, n, row; 996 997 PetscFunctionBegin; 998 if (ll) { 999 /* The local size is used so that VecMPI can be passed to this routine 1000 by MatDiagonalScale_MPISELL */ 1001 PetscCall(VecGetLocalSize(ll, &m)); 1002 PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 1003 PetscCall(VecGetArrayRead(ll, &l)); 1004 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 1005 if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */ 1006 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) { 1007 if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= l[a->sliceheight * i + row]; 1008 } 1009 } else { 1010 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) { a->val[j] *= l[a->sliceheight * i + row]; } 1011 } 1012 } 1013 PetscCall(VecRestoreArrayRead(ll, &l)); 1014 PetscCall(PetscLogFlops(a->nz)); 1015 } 1016 if (rr) { 1017 PetscCall(VecGetLocalSize(rr, &n)); 1018 PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 1019 PetscCall(VecGetArrayRead(rr, &r)); 1020 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 1021 if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */ 1022 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) % a->sliceheight)) { 1023 if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= r[a->colidx[j]]; 1024 } 1025 } else { 1026 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j++) a->val[j] *= r[a->colidx[j]]; 1027 } 1028 } 1029 PetscCall(VecRestoreArrayRead(rr, &r)); 1030 PetscCall(PetscLogFlops(a->nz)); 1031 } 1032 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 1033 #if defined(PETSC_HAVE_CUDA) 1034 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU; 1035 #endif 1036 PetscFunctionReturn(PETSC_SUCCESS); 1037 } 1038 1039 PetscErrorCode MatGetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[]) 1040 { 1041 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1042 PetscInt *cp, i, k, low, high, t, row, col, l; 1043 PetscInt shift; 1044 MatScalar *vp; 1045 1046 PetscFunctionBegin; 1047 for (k = 0; k < m; k++) { /* loop over requested rows */ 1048 row = im[k]; 1049 if (row < 0) continue; 1050 PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1); 1051 shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight); /* starting index of the row */ 1052 cp = a->colidx + shift; /* pointer to the row */ 1053 vp = a->val + shift; /* pointer to the row */ 1054 for (l = 0; l < n; l++) { /* loop over requested columns */ 1055 col = in[l]; 1056 if (col < 0) continue; 1057 PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1); 1058 high = a->rlen[row]; 1059 low = 0; /* assume unsorted */ 1060 while (high - low > 5) { 1061 t = (low + high) / 2; 1062 if (*(cp + a->sliceheight * t) > col) high = t; 1063 else low = t; 1064 } 1065 for (i = low; i < high; i++) { 1066 if (*(cp + a->sliceheight * i) > col) break; 1067 if (*(cp + a->sliceheight * i) == col) { 1068 *v++ = *(vp + a->sliceheight * i); 1069 goto finished; 1070 } 1071 } 1072 *v++ = 0.0; 1073 finished:; 1074 } 1075 } 1076 PetscFunctionReturn(PETSC_SUCCESS); 1077 } 1078 1079 PetscErrorCode MatView_SeqSELL_ASCII(Mat A, PetscViewer viewer) 1080 { 1081 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1082 PetscInt i, j, m = A->rmap->n, shift; 1083 const char *name; 1084 PetscViewerFormat format; 1085 1086 PetscFunctionBegin; 1087 PetscCall(PetscViewerGetFormat(viewer, &format)); 1088 if (format == PETSC_VIEWER_ASCII_MATLAB) { 1089 PetscInt nofinalvalue = 0; 1090 /* 1091 if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) { 1092 nofinalvalue = 1; 1093 } 1094 */ 1095 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1096 PetscCall(PetscViewerASCIIPrintf(viewer, "%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n", m, A->cmap->n)); 1097 PetscCall(PetscViewerASCIIPrintf(viewer, "%% Nonzeros = %" PetscInt_FMT " \n", a->nz)); 1098 #if defined(PETSC_USE_COMPLEX) 1099 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",4);\n", a->nz + nofinalvalue)); 1100 #else 1101 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",3);\n", a->nz + nofinalvalue)); 1102 #endif 1103 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = [\n")); 1104 1105 for (i = 0; i < m; i++) { 1106 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1107 for (j = 0; j < a->rlen[i]; j++) { 1108 #if defined(PETSC_USE_COMPLEX) 1109 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1110 #else 1111 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)a->val[shift + a->sliceheight * j])); 1112 #endif 1113 } 1114 } 1115 /* 1116 if (nofinalvalue) { 1117 #if defined(PETSC_USE_COMPLEX) 1118 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n",m,A->cmap->n,0.,0.)); 1119 #else 1120 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n",m,A->cmap->n,0.0)); 1121 #endif 1122 } 1123 */ 1124 PetscCall(PetscObjectGetName((PetscObject)A, &name)); 1125 PetscCall(PetscViewerASCIIPrintf(viewer, "];\n %s = spconvert(zzz);\n", name)); 1126 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1127 } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) { 1128 PetscFunctionReturn(PETSC_SUCCESS); 1129 } else if (format == PETSC_VIEWER_ASCII_COMMON) { 1130 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1131 for (i = 0; i < m; i++) { 1132 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1133 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1134 for (j = 0; j < a->rlen[i]; j++) { 1135 #if defined(PETSC_USE_COMPLEX) 1136 if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) { 1137 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1138 } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) { 1139 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1140 } else if (PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) { 1141 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]))); 1142 } 1143 #else 1144 if (a->val[shift + a->sliceheight * j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j])); 1145 #endif 1146 } 1147 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1148 } 1149 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1150 } else if (format == PETSC_VIEWER_ASCII_DENSE) { 1151 PetscInt cnt = 0, jcnt; 1152 PetscScalar value; 1153 #if defined(PETSC_USE_COMPLEX) 1154 PetscBool realonly = PETSC_TRUE; 1155 for (i = 0; i < a->sliidx[a->totalslices]; i++) { 1156 if (PetscImaginaryPart(a->val[i]) != 0.0) { 1157 realonly = PETSC_FALSE; 1158 break; 1159 } 1160 } 1161 #endif 1162 1163 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1164 for (i = 0; i < m; i++) { 1165 jcnt = 0; 1166 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1167 for (j = 0; j < A->cmap->n; j++) { 1168 if (jcnt < a->rlen[i] && j == a->colidx[shift + a->sliceheight * j]) { 1169 value = a->val[cnt++]; 1170 jcnt++; 1171 } else { 1172 value = 0.0; 1173 } 1174 #if defined(PETSC_USE_COMPLEX) 1175 if (realonly) { 1176 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)PetscRealPart(value))); 1177 } else { 1178 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e+%7.5e i ", (double)PetscRealPart(value), (double)PetscImaginaryPart(value))); 1179 } 1180 #else 1181 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)value)); 1182 #endif 1183 } 1184 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1185 } 1186 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1187 } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) { 1188 PetscInt fshift = 1; 1189 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1190 #if defined(PETSC_USE_COMPLEX) 1191 PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate complex general\n")); 1192 #else 1193 PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate real general\n")); 1194 #endif 1195 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz)); 1196 for (i = 0; i < m; i++) { 1197 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1198 for (j = 0; j < a->rlen[i]; j++) { 1199 #if defined(PETSC_USE_COMPLEX) 1200 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1201 #else 1202 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)a->val[shift + a->sliceheight * j])); 1203 #endif 1204 } 1205 } 1206 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1207 } else if (format == PETSC_VIEWER_NATIVE) { 1208 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 1209 PetscInt row; 1210 PetscCall(PetscViewerASCIIPrintf(viewer, "slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n", i, a->sliidx[i], a->sliidx[i + 1])); 1211 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) { 1212 #if defined(PETSC_USE_COMPLEX) 1213 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1214 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1215 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1216 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), -(double)PetscImaginaryPart(a->val[j]))); 1217 } else { 1218 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]))); 1219 } 1220 #else 1221 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)a->val[j])); 1222 #endif 1223 } 1224 } 1225 } else { 1226 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1227 if (A->factortype) { 1228 for (i = 0; i < m; i++) { 1229 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1230 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1231 /* L part */ 1232 for (j = shift; j < a->diag[i]; j += a->sliceheight) { 1233 #if defined(PETSC_USE_COMPLEX) 1234 if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0) { 1235 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1236 } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0) { 1237 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j])))); 1238 } else { 1239 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j]))); 1240 } 1241 #else 1242 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j])); 1243 #endif 1244 } 1245 /* diagonal */ 1246 j = a->diag[i]; 1247 #if defined(PETSC_USE_COMPLEX) 1248 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1249 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)PetscImaginaryPart(1.0 / a->val[j]))); 1250 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1251 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)(-PetscImaginaryPart(1.0 / a->val[j])))); 1252 } else { 1253 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]))); 1254 } 1255 #else 1256 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)(1.0 / a->val[j]))); 1257 #endif 1258 1259 /* U part */ 1260 for (j = a->diag[i] + 1; j < shift + a->sliceheight * a->rlen[i]; j += a->sliceheight) { 1261 #if defined(PETSC_USE_COMPLEX) 1262 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1263 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1264 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1265 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j])))); 1266 } else { 1267 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j]))); 1268 } 1269 #else 1270 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j])); 1271 #endif 1272 } 1273 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1274 } 1275 } else { 1276 for (i = 0; i < m; i++) { 1277 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1278 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1279 for (j = 0; j < a->rlen[i]; j++) { 1280 #if defined(PETSC_USE_COMPLEX) 1281 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1282 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1283 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1284 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1285 } else { 1286 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]))); 1287 } 1288 #else 1289 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j])); 1290 #endif 1291 } 1292 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1293 } 1294 } 1295 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1296 } 1297 PetscCall(PetscViewerFlush(viewer)); 1298 PetscFunctionReturn(PETSC_SUCCESS); 1299 } 1300 1301 #include <petscdraw.h> 1302 PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw, void *Aa) 1303 { 1304 Mat A = (Mat)Aa; 1305 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1306 PetscInt i, j, m = A->rmap->n, shift; 1307 int color; 1308 PetscReal xl, yl, xr, yr, x_l, x_r, y_l, y_r; 1309 PetscViewer viewer; 1310 PetscViewerFormat format; 1311 1312 PetscFunctionBegin; 1313 PetscCall(PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer)); 1314 PetscCall(PetscViewerGetFormat(viewer, &format)); 1315 PetscCall(PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr)); 1316 1317 /* loop over matrix elements drawing boxes */ 1318 1319 if (format != PETSC_VIEWER_DRAW_CONTOUR) { 1320 PetscDrawCollectiveBegin(draw); 1321 /* Blue for negative, Cyan for zero and Red for positive */ 1322 color = PETSC_DRAW_BLUE; 1323 for (i = 0; i < m; i++) { 1324 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1325 y_l = m - i - 1.0; 1326 y_r = y_l + 1.0; 1327 for (j = 0; j < a->rlen[i]; j++) { 1328 x_l = a->colidx[shift + a->sliceheight * j]; 1329 x_r = x_l + 1.0; 1330 if (PetscRealPart(a->val[shift + a->sliceheight * j]) >= 0.) continue; 1331 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1332 } 1333 } 1334 color = PETSC_DRAW_CYAN; 1335 for (i = 0; i < m; i++) { 1336 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1337 y_l = m - i - 1.0; 1338 y_r = y_l + 1.0; 1339 for (j = 0; j < a->rlen[i]; j++) { 1340 x_l = a->colidx[shift + a->sliceheight * j]; 1341 x_r = x_l + 1.0; 1342 if (a->val[shift + a->sliceheight * j] != 0.) continue; 1343 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1344 } 1345 } 1346 color = PETSC_DRAW_RED; 1347 for (i = 0; i < m; i++) { 1348 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1349 y_l = m - i - 1.0; 1350 y_r = y_l + 1.0; 1351 for (j = 0; j < a->rlen[i]; j++) { 1352 x_l = a->colidx[shift + a->sliceheight * j]; 1353 x_r = x_l + 1.0; 1354 if (PetscRealPart(a->val[shift + a->sliceheight * j]) <= 0.) continue; 1355 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1356 } 1357 } 1358 PetscDrawCollectiveEnd(draw); 1359 } else { 1360 /* use contour shading to indicate magnitude of values */ 1361 /* first determine max of all nonzero values */ 1362 PetscReal minv = 0.0, maxv = 0.0; 1363 PetscInt count = 0; 1364 PetscDraw popup; 1365 for (i = 0; i < a->sliidx[a->totalslices]; i++) { 1366 if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]); 1367 } 1368 if (minv >= maxv) maxv = minv + PETSC_SMALL; 1369 PetscCall(PetscDrawGetPopup(draw, &popup)); 1370 PetscCall(PetscDrawScalePopup(popup, minv, maxv)); 1371 1372 PetscDrawCollectiveBegin(draw); 1373 for (i = 0; i < m; i++) { 1374 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1375 y_l = m - i - 1.0; 1376 y_r = y_l + 1.0; 1377 for (j = 0; j < a->rlen[i]; j++) { 1378 x_l = a->colidx[shift + a->sliceheight * j]; 1379 x_r = x_l + 1.0; 1380 color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]), minv, maxv); 1381 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1382 count++; 1383 } 1384 } 1385 PetscDrawCollectiveEnd(draw); 1386 } 1387 PetscFunctionReturn(PETSC_SUCCESS); 1388 } 1389 1390 #include <petscdraw.h> 1391 PetscErrorCode MatView_SeqSELL_Draw(Mat A, PetscViewer viewer) 1392 { 1393 PetscDraw draw; 1394 PetscReal xr, yr, xl, yl, h, w; 1395 PetscBool isnull; 1396 1397 PetscFunctionBegin; 1398 PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw)); 1399 PetscCall(PetscDrawIsNull(draw, &isnull)); 1400 if (isnull) PetscFunctionReturn(PETSC_SUCCESS); 1401 1402 xr = A->cmap->n; 1403 yr = A->rmap->n; 1404 h = yr / 10.0; 1405 w = xr / 10.0; 1406 xr += w; 1407 yr += h; 1408 xl = -w; 1409 yl = -h; 1410 PetscCall(PetscDrawSetCoordinates(draw, xl, yl, xr, yr)); 1411 PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer)); 1412 PetscCall(PetscDrawZoom(draw, MatView_SeqSELL_Draw_Zoom, A)); 1413 PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL)); 1414 PetscCall(PetscDrawSave(draw)); 1415 PetscFunctionReturn(PETSC_SUCCESS); 1416 } 1417 1418 PetscErrorCode MatView_SeqSELL(Mat A, PetscViewer viewer) 1419 { 1420 PetscBool iascii, isbinary, isdraw; 1421 1422 PetscFunctionBegin; 1423 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii)); 1424 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary)); 1425 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw)); 1426 if (iascii) { 1427 PetscCall(MatView_SeqSELL_ASCII(A, viewer)); 1428 } else if (isbinary) { 1429 /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */ 1430 } else if (isdraw) PetscCall(MatView_SeqSELL_Draw(A, viewer)); 1431 PetscFunctionReturn(PETSC_SUCCESS); 1432 } 1433 1434 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A, MatAssemblyType mode) 1435 { 1436 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1437 PetscInt i, shift, row_in_slice, row, nrow, *cp, lastcol, j, k; 1438 MatScalar *vp; 1439 #if defined(PETSC_HAVE_CUDA) 1440 PetscInt totalchunks = 0; 1441 #endif 1442 1443 PetscFunctionBegin; 1444 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 1445 /* To do: compress out the unused elements */ 1446 PetscCall(MatMarkDiagonal_SeqSELL(A)); 1447 PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n", A->rmap->n, A->cmap->n, a->maxallocmat, a->sliidx[a->totalslices], a->nz, a->sliidx[a->totalslices] - a->nz)); 1448 PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n", a->reallocs)); 1449 PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rlenmax)); 1450 a->nonzerorowcnt = 0; 1451 /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */ 1452 for (i = 0; i < a->totalslices; ++i) { 1453 shift = a->sliidx[i]; /* starting index of the slice */ 1454 cp = a->colidx + shift; /* pointer to the column indices of the slice */ 1455 vp = a->val + shift; /* pointer to the nonzero values of the slice */ 1456 for (row_in_slice = 0; row_in_slice < a->sliceheight; ++row_in_slice) { /* loop over rows in the slice */ 1457 row = a->sliceheight * i + row_in_slice; 1458 nrow = a->rlen[row]; /* number of nonzeros in row */ 1459 /* 1460 Search for the nearest nonzero. Normally setting the index to zero may cause extra communication. 1461 But if the entire slice are empty, it is fine to use 0 since the index will not be loaded. 1462 */ 1463 lastcol = 0; 1464 if (nrow > 0) { /* nonempty row */ 1465 a->nonzerorowcnt++; 1466 lastcol = cp[a->sliceheight * (nrow - 1) + row_in_slice]; /* use the index from the last nonzero at current row */ 1467 } else if (!row_in_slice) { /* first row of the correct slice is empty */ 1468 for (j = 1; j < a->sliceheight; j++) { 1469 if (a->rlen[a->sliceheight * i + j]) { 1470 lastcol = cp[j]; 1471 break; 1472 } 1473 } 1474 } else { 1475 if (a->sliidx[i + 1] != shift) lastcol = cp[row_in_slice - 1]; /* use the index from the previous row */ 1476 } 1477 1478 for (k = nrow; k < (a->sliidx[i + 1] - shift) / a->sliceheight; ++k) { 1479 cp[a->sliceheight * k + row_in_slice] = lastcol; 1480 vp[a->sliceheight * k + row_in_slice] = (MatScalar)0; 1481 } 1482 } 1483 } 1484 1485 A->info.mallocs += a->reallocs; 1486 a->reallocs = 0; 1487 1488 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 1489 #if defined(PETSC_HAVE_CUDA) 1490 if (!a->chunksize && a->totalslices) { 1491 a->chunksize = 64; 1492 while (a->chunksize < 1024 && 2 * a->chunksize <= a->sliidx[a->totalslices] / a->totalslices) a->chunksize *= 2; 1493 totalchunks = 1 + (a->sliidx[a->totalslices] - 1) / a->chunksize; 1494 } 1495 if (totalchunks != a->totalchunks) { 1496 PetscCall(PetscFree(a->chunk_slice_map)); 1497 PetscCall(PetscMalloc1(totalchunks, &a->chunk_slice_map)); 1498 a->totalchunks = totalchunks; 1499 } 1500 j = 0; 1501 for (i = 0; i < totalchunks; i++) { 1502 while (a->sliidx[j + 1] <= i * a->chunksize && j < a->totalslices) j++; 1503 a->chunk_slice_map[i] = j; 1504 } 1505 #endif 1506 PetscFunctionReturn(PETSC_SUCCESS); 1507 } 1508 1509 PetscErrorCode MatGetInfo_SeqSELL(Mat A, MatInfoType flag, MatInfo *info) 1510 { 1511 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1512 1513 PetscFunctionBegin; 1514 info->block_size = 1.0; 1515 info->nz_allocated = a->maxallocmat; 1516 info->nz_used = a->sliidx[a->totalslices]; /* include padding zeros */ 1517 info->nz_unneeded = (a->maxallocmat - a->sliidx[a->totalslices]); 1518 info->assemblies = A->num_ass; 1519 info->mallocs = A->info.mallocs; 1520 info->memory = 0; /* REVIEW ME */ 1521 if (A->factortype) { 1522 info->fill_ratio_given = A->info.fill_ratio_given; 1523 info->fill_ratio_needed = A->info.fill_ratio_needed; 1524 info->factor_mallocs = A->info.factor_mallocs; 1525 } else { 1526 info->fill_ratio_given = 0; 1527 info->fill_ratio_needed = 0; 1528 info->factor_mallocs = 0; 1529 } 1530 PetscFunctionReturn(PETSC_SUCCESS); 1531 } 1532 1533 PetscErrorCode MatSetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is) 1534 { 1535 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1536 PetscInt shift, i, k, l, low, high, t, ii, row, col, nrow; 1537 PetscInt *cp, nonew = a->nonew, lastcol = -1; 1538 MatScalar *vp, value; 1539 #if defined(PETSC_HAVE_CUDA) 1540 PetscBool inserted = PETSC_FALSE; 1541 PetscInt mul = DEVICE_MEM_ALIGN / a->sliceheight; 1542 #endif 1543 1544 PetscFunctionBegin; 1545 for (k = 0; k < m; k++) { /* loop over added rows */ 1546 row = im[k]; 1547 if (row < 0) continue; 1548 PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1); 1549 shift = a->sliidx[row / a->sliceheight] + row % a->sliceheight; /* starting index of the row */ 1550 cp = a->colidx + shift; /* pointer to the row */ 1551 vp = a->val + shift; /* pointer to the row */ 1552 nrow = a->rlen[row]; 1553 low = 0; 1554 high = nrow; 1555 1556 for (l = 0; l < n; l++) { /* loop over added columns */ 1557 col = in[l]; 1558 if (col < 0) continue; 1559 PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1); 1560 if (a->roworiented) { 1561 value = v[l + k * n]; 1562 } else { 1563 value = v[k + l * m]; 1564 } 1565 if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue; 1566 1567 /* search in this row for the specified column, i indicates the column to be set */ 1568 if (col <= lastcol) low = 0; 1569 else high = nrow; 1570 lastcol = col; 1571 while (high - low > 5) { 1572 t = (low + high) / 2; 1573 if (*(cp + a->sliceheight * t) > col) high = t; 1574 else low = t; 1575 } 1576 for (i = low; i < high; i++) { 1577 if (*(cp + a->sliceheight * i) > col) break; 1578 if (*(cp + a->sliceheight * i) == col) { 1579 if (is == ADD_VALUES) *(vp + a->sliceheight * i) += value; 1580 else *(vp + a->sliceheight * i) = value; 1581 #if defined(PETSC_HAVE_CUDA) 1582 inserted = PETSC_TRUE; 1583 #endif 1584 low = i + 1; 1585 goto noinsert; 1586 } 1587 } 1588 if (value == 0.0 && a->ignorezeroentries) goto noinsert; 1589 if (nonew == 1) goto noinsert; 1590 PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col); 1591 #if defined(PETSC_HAVE_CUDA) 1592 MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar, mul); 1593 #else 1594 /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */ 1595 MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar, 1); 1596 #endif 1597 /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */ 1598 for (ii = nrow - 1; ii >= i; ii--) { 1599 *(cp + a->sliceheight * (ii + 1)) = *(cp + a->sliceheight * ii); 1600 *(vp + a->sliceheight * (ii + 1)) = *(vp + a->sliceheight * ii); 1601 } 1602 a->rlen[row]++; 1603 *(cp + a->sliceheight * i) = col; 1604 *(vp + a->sliceheight * i) = value; 1605 a->nz++; 1606 A->nonzerostate++; 1607 #if defined(PETSC_HAVE_CUDA) 1608 inserted = PETSC_TRUE; 1609 #endif 1610 low = i + 1; 1611 high++; 1612 nrow++; 1613 noinsert:; 1614 } 1615 a->rlen[row] = nrow; 1616 } 1617 #if defined(PETSC_HAVE_CUDA) 1618 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && inserted) A->offloadmask = PETSC_OFFLOAD_CPU; 1619 #endif 1620 PetscFunctionReturn(PETSC_SUCCESS); 1621 } 1622 1623 PetscErrorCode MatCopy_SeqSELL(Mat A, Mat B, MatStructure str) 1624 { 1625 PetscFunctionBegin; 1626 /* If the two matrices have the same copy implementation, use fast copy. */ 1627 if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) { 1628 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1629 Mat_SeqSELL *b = (Mat_SeqSELL *)B->data; 1630 1631 PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices], PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Number of nonzeros in two matrices are different"); 1632 PetscCall(PetscArraycpy(b->val, a->val, a->sliidx[a->totalslices])); 1633 } else { 1634 PetscCall(MatCopy_Basic(A, B, str)); 1635 } 1636 PetscFunctionReturn(PETSC_SUCCESS); 1637 } 1638 1639 PetscErrorCode MatSetUp_SeqSELL(Mat A) 1640 { 1641 PetscFunctionBegin; 1642 PetscCall(MatSeqSELLSetPreallocation(A, PETSC_DEFAULT, NULL)); 1643 PetscFunctionReturn(PETSC_SUCCESS); 1644 } 1645 1646 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A, PetscScalar *array[]) 1647 { 1648 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1649 1650 PetscFunctionBegin; 1651 *array = a->val; 1652 PetscFunctionReturn(PETSC_SUCCESS); 1653 } 1654 1655 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A, PetscScalar *array[]) 1656 { 1657 PetscFunctionBegin; 1658 PetscFunctionReturn(PETSC_SUCCESS); 1659 } 1660 1661 PetscErrorCode MatRealPart_SeqSELL(Mat A) 1662 { 1663 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1664 PetscInt i; 1665 MatScalar *aval = a->val; 1666 1667 PetscFunctionBegin; 1668 for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscRealPart(aval[i]); 1669 #if defined(PETSC_HAVE_CUDA) 1670 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU; 1671 #endif 1672 PetscFunctionReturn(PETSC_SUCCESS); 1673 } 1674 1675 PetscErrorCode MatImaginaryPart_SeqSELL(Mat A) 1676 { 1677 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1678 PetscInt i; 1679 MatScalar *aval = a->val; 1680 1681 PetscFunctionBegin; 1682 for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]); 1683 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 1684 #if defined(PETSC_HAVE_CUDA) 1685 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU; 1686 #endif 1687 PetscFunctionReturn(PETSC_SUCCESS); 1688 } 1689 1690 PetscErrorCode MatScale_SeqSELL(Mat inA, PetscScalar alpha) 1691 { 1692 Mat_SeqSELL *a = (Mat_SeqSELL *)inA->data; 1693 MatScalar *aval = a->val; 1694 PetscScalar oalpha = alpha; 1695 PetscBLASInt one = 1, size; 1696 1697 PetscFunctionBegin; 1698 PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices], &size)); 1699 PetscCallBLAS("BLASscal", BLASscal_(&size, &oalpha, aval, &one)); 1700 PetscCall(PetscLogFlops(a->nz)); 1701 PetscCall(MatSeqSELLInvalidateDiagonal(inA)); 1702 #if defined(PETSC_HAVE_CUDA) 1703 if (inA->offloadmask != PETSC_OFFLOAD_UNALLOCATED) inA->offloadmask = PETSC_OFFLOAD_CPU; 1704 #endif 1705 PetscFunctionReturn(PETSC_SUCCESS); 1706 } 1707 1708 PetscErrorCode MatShift_SeqSELL(Mat Y, PetscScalar a) 1709 { 1710 Mat_SeqSELL *y = (Mat_SeqSELL *)Y->data; 1711 1712 PetscFunctionBegin; 1713 if (!Y->preallocated || !y->nz) PetscCall(MatSeqSELLSetPreallocation(Y, 1, NULL)); 1714 PetscCall(MatShift_Basic(Y, a)); 1715 PetscFunctionReturn(PETSC_SUCCESS); 1716 } 1717 1718 PetscErrorCode MatSOR_SeqSELL(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx) 1719 { 1720 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1721 PetscScalar *x, sum, *t; 1722 const MatScalar *idiag = NULL, *mdiag; 1723 const PetscScalar *b, *xb; 1724 PetscInt n, m = A->rmap->n, i, j, shift; 1725 const PetscInt *diag; 1726 1727 PetscFunctionBegin; 1728 its = its * lits; 1729 1730 if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */ 1731 if (!a->idiagvalid) PetscCall(MatInvertDiagonal_SeqSELL(A, omega, fshift)); 1732 a->fshift = fshift; 1733 a->omega = omega; 1734 1735 diag = a->diag; 1736 t = a->ssor_work; 1737 idiag = a->idiag; 1738 mdiag = a->mdiag; 1739 1740 PetscCall(VecGetArray(xx, &x)); 1741 PetscCall(VecGetArrayRead(bb, &b)); 1742 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 1743 PetscCheck(flag != SOR_APPLY_UPPER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_UPPER is not implemented"); 1744 PetscCheck(flag != SOR_APPLY_LOWER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_LOWER is not implemented"); 1745 PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat"); 1746 1747 if (flag & SOR_ZERO_INITIAL_GUESS) { 1748 if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) { 1749 for (i = 0; i < m; i++) { 1750 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1751 sum = b[i]; 1752 n = (diag[i] - shift) / a->sliceheight; 1753 for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]]; 1754 t[i] = sum; 1755 x[i] = sum * idiag[i]; 1756 } 1757 xb = t; 1758 PetscCall(PetscLogFlops(a->nz)); 1759 } else xb = b; 1760 if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1761 for (i = m - 1; i >= 0; i--) { 1762 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1763 sum = xb[i]; 1764 n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1; 1765 for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]]; 1766 if (xb == b) { 1767 x[i] = sum * idiag[i]; 1768 } else { 1769 x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */ 1770 } 1771 } 1772 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */ 1773 } 1774 its--; 1775 } 1776 while (its--) { 1777 if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) { 1778 for (i = 0; i < m; i++) { 1779 /* lower */ 1780 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1781 sum = b[i]; 1782 n = (diag[i] - shift) / a->sliceheight; 1783 for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]]; 1784 t[i] = sum; /* save application of the lower-triangular part */ 1785 /* upper */ 1786 n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1; 1787 for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]]; 1788 x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */ 1789 } 1790 xb = t; 1791 PetscCall(PetscLogFlops(2.0 * a->nz)); 1792 } else xb = b; 1793 if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1794 for (i = m - 1; i >= 0; i--) { 1795 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1796 sum = xb[i]; 1797 if (xb == b) { 1798 /* whole matrix (no checkpointing available) */ 1799 n = a->rlen[i]; 1800 for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]]; 1801 x[i] = (1. - omega) * x[i] + (sum + mdiag[i] * x[i]) * idiag[i]; 1802 } else { /* lower-triangular part has been saved, so only apply upper-triangular */ 1803 n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1; 1804 for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]]; 1805 x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */ 1806 } 1807 } 1808 if (xb == b) { 1809 PetscCall(PetscLogFlops(2.0 * a->nz)); 1810 } else { 1811 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */ 1812 } 1813 } 1814 } 1815 PetscCall(VecRestoreArray(xx, &x)); 1816 PetscCall(VecRestoreArrayRead(bb, &b)); 1817 PetscFunctionReturn(PETSC_SUCCESS); 1818 } 1819 1820 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL, 1821 MatGetRow_SeqSELL, 1822 MatRestoreRow_SeqSELL, 1823 MatMult_SeqSELL, 1824 /* 4*/ MatMultAdd_SeqSELL, 1825 MatMultTranspose_SeqSELL, 1826 MatMultTransposeAdd_SeqSELL, 1827 NULL, 1828 NULL, 1829 NULL, 1830 /* 10*/ NULL, 1831 NULL, 1832 NULL, 1833 MatSOR_SeqSELL, 1834 NULL, 1835 /* 15*/ MatGetInfo_SeqSELL, 1836 MatEqual_SeqSELL, 1837 MatGetDiagonal_SeqSELL, 1838 MatDiagonalScale_SeqSELL, 1839 NULL, 1840 /* 20*/ NULL, 1841 MatAssemblyEnd_SeqSELL, 1842 MatSetOption_SeqSELL, 1843 MatZeroEntries_SeqSELL, 1844 /* 24*/ NULL, 1845 NULL, 1846 NULL, 1847 NULL, 1848 NULL, 1849 /* 29*/ MatSetUp_SeqSELL, 1850 NULL, 1851 NULL, 1852 NULL, 1853 NULL, 1854 /* 34*/ MatDuplicate_SeqSELL, 1855 NULL, 1856 NULL, 1857 NULL, 1858 NULL, 1859 /* 39*/ NULL, 1860 NULL, 1861 NULL, 1862 MatGetValues_SeqSELL, 1863 MatCopy_SeqSELL, 1864 /* 44*/ NULL, 1865 MatScale_SeqSELL, 1866 MatShift_SeqSELL, 1867 NULL, 1868 NULL, 1869 /* 49*/ NULL, 1870 NULL, 1871 NULL, 1872 NULL, 1873 NULL, 1874 /* 54*/ MatFDColoringCreate_SeqXAIJ, 1875 NULL, 1876 NULL, 1877 NULL, 1878 NULL, 1879 /* 59*/ NULL, 1880 MatDestroy_SeqSELL, 1881 MatView_SeqSELL, 1882 NULL, 1883 NULL, 1884 /* 64*/ NULL, 1885 NULL, 1886 NULL, 1887 NULL, 1888 NULL, 1889 /* 69*/ NULL, 1890 NULL, 1891 NULL, 1892 NULL, 1893 NULL, 1894 /* 74*/ NULL, 1895 MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */ 1896 NULL, 1897 NULL, 1898 NULL, 1899 /* 79*/ NULL, 1900 NULL, 1901 NULL, 1902 NULL, 1903 NULL, 1904 /* 84*/ NULL, 1905 NULL, 1906 NULL, 1907 NULL, 1908 NULL, 1909 /* 89*/ NULL, 1910 NULL, 1911 NULL, 1912 NULL, 1913 NULL, 1914 /* 94*/ NULL, 1915 NULL, 1916 NULL, 1917 NULL, 1918 NULL, 1919 /* 99*/ NULL, 1920 NULL, 1921 NULL, 1922 MatConjugate_SeqSELL, 1923 NULL, 1924 /*104*/ NULL, 1925 NULL, 1926 NULL, 1927 NULL, 1928 NULL, 1929 /*109*/ NULL, 1930 NULL, 1931 NULL, 1932 NULL, 1933 MatMissingDiagonal_SeqSELL, 1934 /*114*/ NULL, 1935 NULL, 1936 NULL, 1937 NULL, 1938 NULL, 1939 /*119*/ NULL, 1940 NULL, 1941 NULL, 1942 NULL, 1943 NULL, 1944 /*124*/ NULL, 1945 NULL, 1946 NULL, 1947 NULL, 1948 NULL, 1949 /*129*/ NULL, 1950 NULL, 1951 NULL, 1952 NULL, 1953 NULL, 1954 /*134*/ NULL, 1955 NULL, 1956 NULL, 1957 NULL, 1958 NULL, 1959 /*139*/ NULL, 1960 NULL, 1961 NULL, 1962 MatFDColoringSetUp_SeqXAIJ, 1963 NULL, 1964 /*144*/ NULL, 1965 NULL, 1966 NULL, 1967 NULL, 1968 NULL, 1969 NULL, 1970 /*150*/ NULL, 1971 NULL}; 1972 1973 PetscErrorCode MatStoreValues_SeqSELL(Mat mat) 1974 { 1975 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1976 1977 PetscFunctionBegin; 1978 PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first"); 1979 1980 /* allocate space for values if not already there */ 1981 if (!a->saved_values) PetscCall(PetscMalloc1(a->sliidx[a->totalslices] + 1, &a->saved_values)); 1982 1983 /* copy values over */ 1984 PetscCall(PetscArraycpy(a->saved_values, a->val, a->sliidx[a->totalslices])); 1985 PetscFunctionReturn(PETSC_SUCCESS); 1986 } 1987 1988 PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat) 1989 { 1990 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1991 1992 PetscFunctionBegin; 1993 PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first"); 1994 PetscCheck(a->saved_values, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatStoreValues(A);first"); 1995 PetscCall(PetscArraycpy(a->val, a->saved_values, a->sliidx[a->totalslices])); 1996 PetscFunctionReturn(PETSC_SUCCESS); 1997 } 1998 1999 PetscErrorCode MatSeqSELLGetFillRatio_SeqSELL(Mat mat, PetscReal *ratio) 2000 { 2001 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 2002 2003 PetscFunctionBegin; 2004 if (a->totalslices && a->sliidx[a->totalslices]) { 2005 *ratio = (PetscReal)(a->sliidx[a->totalslices] - a->nz) / a->sliidx[a->totalslices]; 2006 } else { 2007 *ratio = 0.0; 2008 } 2009 PetscFunctionReturn(PETSC_SUCCESS); 2010 } 2011 2012 PetscErrorCode MatSeqSELLGetMaxSliceWidth_SeqSELL(Mat mat, PetscInt *slicewidth) 2013 { 2014 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 2015 PetscInt i, current_slicewidth; 2016 2017 PetscFunctionBegin; 2018 *slicewidth = 0; 2019 for (i = 0; i < a->totalslices; i++) { 2020 current_slicewidth = (a->sliidx[i + 1] - a->sliidx[i]) / a->sliceheight; 2021 if (current_slicewidth > *slicewidth) *slicewidth = current_slicewidth; 2022 } 2023 PetscFunctionReturn(PETSC_SUCCESS); 2024 } 2025 2026 PetscErrorCode MatSeqSELLGetAvgSliceWidth_SeqSELL(Mat mat, PetscReal *slicewidth) 2027 { 2028 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 2029 2030 PetscFunctionBegin; 2031 *slicewidth = 0; 2032 if (a->totalslices) { *slicewidth = (PetscReal)a->sliidx[a->totalslices] / a->sliceheight / a->totalslices; } 2033 PetscFunctionReturn(PETSC_SUCCESS); 2034 } 2035 2036 PetscErrorCode MatSeqSELLGetVarSliceSize_SeqSELL(Mat mat, PetscReal *variance) 2037 { 2038 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 2039 PetscReal mean; 2040 PetscInt i, totalslices = a->totalslices, *sliidx = a->sliidx; 2041 2042 PetscFunctionBegin; 2043 *variance = 0; 2044 if (totalslices) { 2045 mean = (PetscReal)sliidx[totalslices] / totalslices; 2046 for (i = 1; i <= totalslices; i++) { *variance += ((PetscReal)(sliidx[i] - sliidx[i - 1]) - mean) * ((PetscReal)(sliidx[i] - sliidx[i - 1]) - mean) / totalslices; } 2047 } 2048 PetscFunctionReturn(PETSC_SUCCESS); 2049 } 2050 2051 PetscErrorCode MatSeqSELLSetSliceHeight_SeqSELL(Mat A, PetscInt sliceheight) 2052 { 2053 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 2054 2055 PetscFunctionBegin; 2056 if (A->preallocated) PetscFunctionReturn(PETSC_SUCCESS); 2057 PetscCheck(a->sliceheight <= 0 || a->sliceheight == sliceheight, PETSC_COMM_SELF, PETSC_ERR_SUP, "Cannot change slice height %" PetscInt_FMT " to %" PetscInt_FMT, a->sliceheight, sliceheight); 2058 a->sliceheight = sliceheight; 2059 #if defined(PETSC_HAVE_CUDA) 2060 PetscCheck(DEVICE_MEM_ALIGN % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "DEVICE_MEM_ALIGN is not divisible by the slice height %" PetscInt_FMT, sliceheight); 2061 #endif 2062 PetscFunctionReturn(PETSC_SUCCESS); 2063 } 2064 2065 /*@C 2066 MatSeqSELLRestoreArray - returns access to the array where the data for a `MATSEQSELL` matrix is stored obtained by `MatSeqSELLGetArray()` 2067 2068 Not Collective 2069 2070 Input Parameters: 2071 + A - a `MATSEQSELL` matrix 2072 - array - pointer to the data 2073 2074 Level: intermediate 2075 2076 .seealso: `Mat`, `MATSEQSELL`, `MatSeqSELLGetArray()`, `MatSeqSELLRestoreArrayF90()` 2077 @*/ 2078 PetscErrorCode MatSeqSELLRestoreArray(Mat A, PetscScalar **array) 2079 { 2080 PetscFunctionBegin; 2081 PetscUseMethod(A, "MatSeqSELLRestoreArray_C", (Mat, PetscScalar **), (A, array)); 2082 PetscFunctionReturn(PETSC_SUCCESS); 2083 } 2084 2085 /*@C 2086 MatSeqSELLGetFillRatio - returns a ratio that indicates the irregularity of the matrix. 2087 2088 Not Collective 2089 2090 Input Parameter: 2091 . A - a MATSEQSELL matrix 2092 2093 Output Parameter: 2094 . ratio - ratio of number of padded zeros to number of allocated elements 2095 2096 Level: intermediate 2097 2098 .seealso: `MATSEQSELL`, `MatSeqSELLGetAvgSliceWidth()` 2099 @*/ 2100 PetscErrorCode MatSeqSELLGetFillRatio(Mat A, PetscReal *ratio) 2101 { 2102 PetscFunctionBegin; 2103 PetscUseMethod(A, "MatSeqSELLGetFillRatio_C", (Mat, PetscReal *), (A, ratio)); 2104 PetscFunctionReturn(PETSC_SUCCESS); 2105 } 2106 2107 /*@C 2108 MatSeqSELLGetMaxSliceWidth - returns the maximum slice width. 2109 2110 Not Collective 2111 2112 Input Parameter: 2113 . A - a MATSEQSELL matrix 2114 2115 Output Parameter: 2116 . slicewidth - maximum slice width 2117 2118 Level: intermediate 2119 2120 .seealso: `MATSEQSELL`, `MatSeqSELLGetAvgSliceWidth()` 2121 @*/ 2122 PetscErrorCode MatSeqSELLGetMaxSliceWidth(Mat A, PetscInt *slicewidth) 2123 { 2124 PetscFunctionBegin; 2125 PetscUseMethod(A, "MatSeqSELLGetMaxSliceWidth_C", (Mat, PetscInt *), (A, slicewidth)); 2126 PetscFunctionReturn(PETSC_SUCCESS); 2127 } 2128 2129 /*@C 2130 MatSeqSELLGetAvgSliceWidth - returns the average slice width. 2131 2132 Not Collective 2133 2134 Input Parameter: 2135 . A - a MATSEQSELL matrix 2136 2137 Output Parameter: 2138 . slicewidth - average slice width 2139 2140 Level: intermediate 2141 2142 .seealso: `MATSEQSELL`, `MatSeqSELLGetMaxSliceWidth()` 2143 @*/ 2144 PetscErrorCode MatSeqSELLGetAvgSliceWidth(Mat A, PetscReal *slicewidth) 2145 { 2146 PetscFunctionBegin; 2147 PetscUseMethod(A, "MatSeqSELLGetAvgSliceWidth_C", (Mat, PetscReal *), (A, slicewidth)); 2148 PetscFunctionReturn(PETSC_SUCCESS); 2149 } 2150 2151 /*@C 2152 MatSeqSELLSetSliceHeight - sets the slice height. 2153 2154 Not Collective 2155 2156 Input Parameters: 2157 + A - a MATSEQSELL matrix 2158 - sliceheight - slice height 2159 2160 Notes: 2161 You cannot change the slice height once it have been set. 2162 2163 The slice height must be set before MatSetUp() or MatXXXSetPreallocation() is called. 2164 2165 Level: intermediate 2166 2167 .seealso: `MATSEQSELL`, `MatSeqSELLGetVarSliceSize()` 2168 @*/ 2169 PetscErrorCode MatSeqSELLSetSliceHeight(Mat A, PetscInt sliceheight) 2170 { 2171 PetscFunctionBegin; 2172 PetscUseMethod(A, "MatSeqSELLSetSliceHeight_C", (Mat, PetscInt), (A, sliceheight)); 2173 PetscFunctionReturn(PETSC_SUCCESS); 2174 } 2175 2176 /*@C 2177 MatSeqSELLGetVarSliceSize - returns the variance of the slice size. 2178 2179 Not Collective 2180 2181 Input Parameter: 2182 . A - a MATSEQSELL matrix 2183 2184 Output Parameter: 2185 . variance - variance of the slice size 2186 2187 Level: intermediate 2188 2189 .seealso: `MATSEQSELL`, `MatSeqSELLSetSliceHeight()` 2190 @*/ 2191 PetscErrorCode MatSeqSELLGetVarSliceSize(Mat A, PetscReal *variance) 2192 { 2193 PetscFunctionBegin; 2194 PetscUseMethod(A, "MatSeqSELLGetVarSliceSize_C", (Mat, PetscReal *), (A, variance)); 2195 PetscFunctionReturn(PETSC_SUCCESS); 2196 } 2197 2198 #if defined(PETSC_HAVE_CUDA) 2199 PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat); 2200 #endif 2201 2202 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B) 2203 { 2204 Mat_SeqSELL *b; 2205 PetscMPIInt size; 2206 2207 PetscFunctionBegin; 2208 PetscCall(PetscCitationsRegister(citation, &cited)); 2209 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size)); 2210 PetscCheck(size <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Comm must be of size 1"); 2211 2212 PetscCall(PetscNew(&b)); 2213 2214 B->data = (void *)b; 2215 B->ops[0] = MatOps_Values; 2216 2217 b->row = NULL; 2218 b->col = NULL; 2219 b->icol = NULL; 2220 b->reallocs = 0; 2221 b->ignorezeroentries = PETSC_FALSE; 2222 b->roworiented = PETSC_TRUE; 2223 b->nonew = 0; 2224 b->diag = NULL; 2225 b->solve_work = NULL; 2226 B->spptr = NULL; 2227 b->saved_values = NULL; 2228 b->idiag = NULL; 2229 b->mdiag = NULL; 2230 b->ssor_work = NULL; 2231 b->omega = 1.0; 2232 b->fshift = 0.0; 2233 b->idiagvalid = PETSC_FALSE; 2234 b->keepnonzeropattern = PETSC_FALSE; 2235 b->sliceheight = 0; 2236 2237 PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELL)); 2238 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetArray_C", MatSeqSELLGetArray_SeqSELL)); 2239 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLRestoreArray_C", MatSeqSELLRestoreArray_SeqSELL)); 2240 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqSELL)); 2241 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqSELL)); 2242 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetPreallocation_C", MatSeqSELLSetPreallocation_SeqSELL)); 2243 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqaij_C", MatConvert_SeqSELL_SeqAIJ)); 2244 #if defined(PETSC_HAVE_CUDA) 2245 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqsellcuda_C", MatConvert_SeqSELL_SeqSELLCUDA)); 2246 #endif 2247 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetFillRatio_C", MatSeqSELLGetFillRatio_SeqSELL)); 2248 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetMaxSliceWidth_C", MatSeqSELLGetMaxSliceWidth_SeqSELL)); 2249 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetAvgSliceWidth_C", MatSeqSELLGetAvgSliceWidth_SeqSELL)); 2250 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetVarSliceSize_C", MatSeqSELLGetVarSliceSize_SeqSELL)); 2251 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetSliceHeight_C", MatSeqSELLSetSliceHeight_SeqSELL)); 2252 2253 PetscObjectOptionsBegin((PetscObject)B); 2254 { 2255 PetscInt newsh = -1; 2256 PetscBool flg; 2257 #if defined(PETSC_HAVE_CUDA) 2258 PetscInt chunksize = 0; 2259 #endif 2260 2261 PetscCall(PetscOptionsInt("-mat_sell_slice_height", "Set the slice height used to store SELL matrix", "MatSELLSetSliceHeight", newsh, &newsh, &flg)); 2262 if (flg) { PetscCall(MatSeqSELLSetSliceHeight(B, newsh)); } 2263 #if defined(PETSC_HAVE_CUDA) 2264 PetscCall(PetscOptionsInt("-mat_sell_chunk_size", "Set the chunksize for load-balanced CUDA kernels. Choices include 64,128,256,512,1024", NULL, chunksize, &chunksize, &flg)); 2265 if (flg) { 2266 PetscCheck(chunksize >= 64 && chunksize <= 1024 && chunksize % 64 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "chunksize must be a number in {64,128,256,512,1024}: value %" PetscInt_FMT, chunksize); 2267 b->chunksize = chunksize; 2268 } 2269 #endif 2270 } 2271 PetscOptionsEnd(); 2272 PetscFunctionReturn(PETSC_SUCCESS); 2273 } 2274 2275 /* 2276 Given a matrix generated with MatGetFactor() duplicates all the information in A into B 2277 */ 2278 PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace) 2279 { 2280 Mat_SeqSELL *c = (Mat_SeqSELL *)C->data, *a = (Mat_SeqSELL *)A->data; 2281 PetscInt i, m = A->rmap->n; 2282 PetscInt totalslices = a->totalslices; 2283 2284 PetscFunctionBegin; 2285 C->factortype = A->factortype; 2286 c->row = NULL; 2287 c->col = NULL; 2288 c->icol = NULL; 2289 c->reallocs = 0; 2290 C->assembled = PETSC_TRUE; 2291 2292 PetscCall(PetscLayoutReference(A->rmap, &C->rmap)); 2293 PetscCall(PetscLayoutReference(A->cmap, &C->cmap)); 2294 2295 PetscCall(PetscMalloc1(a->sliceheight * totalslices, &c->rlen)); 2296 PetscCall(PetscMalloc1(totalslices + 1, &c->sliidx)); 2297 2298 for (i = 0; i < m; i++) c->rlen[i] = a->rlen[i]; 2299 for (i = 0; i < totalslices + 1; i++) c->sliidx[i] = a->sliidx[i]; 2300 2301 /* allocate the matrix space */ 2302 if (mallocmatspace) { 2303 PetscCall(PetscMalloc2(a->maxallocmat, &c->val, a->maxallocmat, &c->colidx)); 2304 2305 c->singlemalloc = PETSC_TRUE; 2306 2307 if (m > 0) { 2308 PetscCall(PetscArraycpy(c->colidx, a->colidx, a->maxallocmat)); 2309 if (cpvalues == MAT_COPY_VALUES) { 2310 PetscCall(PetscArraycpy(c->val, a->val, a->maxallocmat)); 2311 } else { 2312 PetscCall(PetscArrayzero(c->val, a->maxallocmat)); 2313 } 2314 } 2315 } 2316 2317 c->ignorezeroentries = a->ignorezeroentries; 2318 c->roworiented = a->roworiented; 2319 c->nonew = a->nonew; 2320 if (a->diag) { 2321 PetscCall(PetscMalloc1(m, &c->diag)); 2322 for (i = 0; i < m; i++) c->diag[i] = a->diag[i]; 2323 } else c->diag = NULL; 2324 2325 c->solve_work = NULL; 2326 c->saved_values = NULL; 2327 c->idiag = NULL; 2328 c->ssor_work = NULL; 2329 c->keepnonzeropattern = a->keepnonzeropattern; 2330 c->free_val = PETSC_TRUE; 2331 c->free_colidx = PETSC_TRUE; 2332 2333 c->maxallocmat = a->maxallocmat; 2334 c->maxallocrow = a->maxallocrow; 2335 c->rlenmax = a->rlenmax; 2336 c->nz = a->nz; 2337 C->preallocated = PETSC_TRUE; 2338 2339 c->nonzerorowcnt = a->nonzerorowcnt; 2340 C->nonzerostate = A->nonzerostate; 2341 2342 PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist)); 2343 PetscFunctionReturn(PETSC_SUCCESS); 2344 } 2345 2346 PetscErrorCode MatDuplicate_SeqSELL(Mat A, MatDuplicateOption cpvalues, Mat *B) 2347 { 2348 PetscFunctionBegin; 2349 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 2350 PetscCall(MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n)); 2351 if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 2352 PetscCall(MatSetType(*B, ((PetscObject)A)->type_name)); 2353 PetscCall(MatDuplicateNoCreate_SeqSELL(*B, A, cpvalues, PETSC_TRUE)); 2354 PetscFunctionReturn(PETSC_SUCCESS); 2355 } 2356 2357 /*MC 2358 MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices, 2359 based on the sliced Ellpack format 2360 2361 Options Database Key: 2362 . -mat_type seqsell - sets the matrix type to "`MATSEQELL` during a call to `MatSetFromOptions()` 2363 2364 Level: beginner 2365 2366 .seealso: `Mat`, `MatCreateSeqSell()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ` 2367 M*/ 2368 2369 /*MC 2370 MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices. 2371 2372 This matrix type is identical to `MATSEQSELL` when constructed with a single process communicator, 2373 and `MATMPISELL` otherwise. As a result, for single process communicators, 2374 `MatSeqSELLSetPreallocation()` is supported, and similarly `MatMPISELLSetPreallocation()` is supported 2375 for communicators controlling multiple processes. It is recommended that you call both of 2376 the above preallocation routines for simplicity. 2377 2378 Options Database Key: 2379 . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions() 2380 2381 Level: beginner 2382 2383 Notes: 2384 This format is only supported for real scalars, double precision, and 32-bit indices (the defaults). 2385 2386 It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of 2387 non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement. 2388 2389 Developer Notes: 2390 On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance. 2391 2392 The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8 2393 .vb 2394 (2 0 3 4) 2395 Consider the matrix A = (5 0 6 0) 2396 (0 0 7 8) 2397 (0 0 9 9) 2398 2399 symbolically the Ellpack format can be written as 2400 2401 (2 3 4 |) (0 2 3 |) 2402 v = (5 6 0 |) colidx = (0 2 2 |) 2403 -------- --------- 2404 (7 8 |) (2 3 |) 2405 (9 9 |) (2 3 |) 2406 2407 The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case). 2408 Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and 2409 zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice. 2410 2411 The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9) and for colidx is (0 0 2 2 3 2 2 2 3 3) 2412 2413 .ve 2414 2415 See MatMult_SeqSELL() for how this format is used with the SIMD operations to achieve high performance. 2416 2417 References: 2418 . * - Hong Zhang, Richard T. Mills, Karl Rupp, and Barry F. Smith, Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}, 2419 Proceedings of the 47th International Conference on Parallel Processing, 2018. 2420 2421 .seealso: `Mat`, `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSell()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ` 2422 M*/ 2423 2424 /*@C 2425 MatCreateSeqSELL - Creates a sparse matrix in `MATSEQSELL` format. 2426 2427 Collective 2428 2429 Input Parameters: 2430 + comm - MPI communicator, set to `PETSC_COMM_SELF` 2431 . m - number of rows 2432 . n - number of columns 2433 . rlenmax - maximum number of nonzeros in a row, ignored if `rlen` is provided 2434 - rlen - array containing the number of nonzeros in the various rows (possibly different for each row) or NULL 2435 2436 Output Parameter: 2437 . A - the matrix 2438 2439 Level: intermediate 2440 2441 Notes: 2442 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 2443 MatXXXXSetPreallocation() paradigm instead of this routine directly. 2444 [MatXXXXSetPreallocation() is, for example, `MatSeqSELLSetPreallocation()`] 2445 2446 Specify the preallocated storage with either `rlenmax` or `rlen` (not both). 2447 Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory 2448 allocation. 2449 2450 .seealso: `Mat`, `MATSEQSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATMPISELL` 2451 @*/ 2452 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt rlenmax, const PetscInt rlen[], Mat *A) 2453 { 2454 PetscFunctionBegin; 2455 PetscCall(MatCreate(comm, A)); 2456 PetscCall(MatSetSizes(*A, m, n, m, n)); 2457 PetscCall(MatSetType(*A, MATSEQSELL)); 2458 PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A, rlenmax, rlen)); 2459 PetscFunctionReturn(PETSC_SUCCESS); 2460 } 2461 2462 PetscErrorCode MatEqual_SeqSELL(Mat A, Mat B, PetscBool *flg) 2463 { 2464 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data, *b = (Mat_SeqSELL *)B->data; 2465 PetscInt totalslices = a->totalslices; 2466 2467 PetscFunctionBegin; 2468 /* If the matrix dimensions are not equal,or no of nonzeros */ 2469 if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) || (a->nz != b->nz) || (a->rlenmax != b->rlenmax)) { 2470 *flg = PETSC_FALSE; 2471 PetscFunctionReturn(PETSC_SUCCESS); 2472 } 2473 /* if the a->colidx are the same */ 2474 PetscCall(PetscArraycmp(a->colidx, b->colidx, a->sliidx[totalslices], flg)); 2475 if (!*flg) PetscFunctionReturn(PETSC_SUCCESS); 2476 /* if a->val are the same */ 2477 PetscCall(PetscArraycmp(a->val, b->val, a->sliidx[totalslices], flg)); 2478 PetscFunctionReturn(PETSC_SUCCESS); 2479 } 2480 2481 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A) 2482 { 2483 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 2484 2485 PetscFunctionBegin; 2486 a->idiagvalid = PETSC_FALSE; 2487 PetscFunctionReturn(PETSC_SUCCESS); 2488 } 2489 2490 PetscErrorCode MatConjugate_SeqSELL(Mat A) 2491 { 2492 #if defined(PETSC_USE_COMPLEX) 2493 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 2494 PetscInt i; 2495 PetscScalar *val = a->val; 2496 2497 PetscFunctionBegin; 2498 for (i = 0; i < a->sliidx[a->totalslices]; i++) { val[i] = PetscConj(val[i]); } 2499 #if defined(PETSC_HAVE_CUDA) 2500 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU; 2501 #endif 2502 #else 2503 PetscFunctionBegin; 2504 #endif 2505 PetscFunctionReturn(PETSC_SUCCESS); 2506 } 2507