1 2 /* 3 Defines the basic matrix operations for the SELL matrix storage format. 4 */ 5 #include <../src/mat/impls/sell/seq/sell.h> /*I "petscmat.h" I*/ 6 #include <petscblaslapack.h> 7 #include <petsc/private/kernels/blocktranspose.h> 8 9 static PetscBool cited = PETSC_FALSE; 10 static const char citation[] = "@inproceedings{ZhangELLPACK2018,\n" 11 " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n" 12 " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n" 13 " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n" 14 " year = 2018\n" 15 "}\n"; 16 17 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 18 19 #include <immintrin.h> 20 21 #if !defined(_MM_SCALE_8) 22 #define _MM_SCALE_8 8 23 #endif 24 25 #if defined(__AVX512F__) 26 /* these do not work 27 vec_idx = _mm512_loadunpackhi_epi32(vec_idx,acolidx); 28 vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval); 29 */ 30 #define AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \ 31 /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \ 32 vec_idx = _mm256_loadu_si256((__m256i const *)acolidx); \ 33 vec_vals = _mm512_loadu_pd(aval); \ 34 vec_x = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); \ 35 vec_y = _mm512_fmadd_pd(vec_x, vec_vals, vec_y) 36 #elif defined(__AVX2__) && defined(__FMA__) 37 #define AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \ 38 vec_vals = _mm256_loadu_pd(aval); \ 39 vec_idx = _mm_loadu_si128((__m128i const *)acolidx); /* SSE2 */ \ 40 vec_x = _mm256_i32gather_pd(x, vec_idx, _MM_SCALE_8); \ 41 vec_y = _mm256_fmadd_pd(vec_x, vec_vals, vec_y) 42 #endif 43 #endif /* PETSC_HAVE_IMMINTRIN_H */ 44 45 /*@C 46 MatSeqSELLSetPreallocation - For good matrix assembly performance 47 the user should preallocate the matrix storage by setting the parameter `nz` 48 (or the array `nnz`). 49 50 Collective 51 52 Input Parameters: 53 + B - The `MATSEQSELL` matrix 54 . rlenmax - number of nonzeros per row (same for all rows), ignored if `rlen` is provided 55 - rlen - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 56 57 Level: intermediate 58 59 Notes: 60 Specify the preallocated storage with either `rlenmax` or `rlen` (not both). 61 Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory 62 allocation. 63 64 You can call `MatGetInfo()` to get information on how effective the preallocation was; 65 for example the fields mallocs,nz_allocated,nz_used,nz_unneeded; 66 You can also run with the option `-info` and look for messages with the string 67 malloc in them to see if additional memory allocation was needed. 68 69 Developer Note: 70 Use `rlenmax` of `MAT_SKIP_ALLOCATION` to not allocate any space for the matrix 71 entries or columns indices. 72 73 The maximum number of nonzeos in any row should be as accurate as possible. 74 If it is underestimated, you will get bad performance due to reallocation 75 (`MatSeqXSELLReallocateSELL()`). 76 77 .seealso: `Mat`, `MATSEQSELL`, `MATSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()` 78 @*/ 79 PetscErrorCode MatSeqSELLSetPreallocation(Mat B, PetscInt rlenmax, const PetscInt rlen[]) 80 { 81 PetscFunctionBegin; 82 PetscValidHeaderSpecific(B, MAT_CLASSID, 1); 83 PetscValidType(B, 1); 84 PetscTryMethod(B, "MatSeqSELLSetPreallocation_C", (Mat, PetscInt, const PetscInt[]), (B, rlenmax, rlen)); 85 PetscFunctionReturn(PETSC_SUCCESS); 86 } 87 88 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B, PetscInt maxallocrow, const PetscInt rlen[]) 89 { 90 Mat_SeqSELL *b; 91 PetscInt i, j, totalslices; 92 #if defined(PETSC_HAVE_CUDA) 93 PetscInt rlenmax = 0; 94 #endif 95 PetscBool skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE; 96 97 PetscFunctionBegin; 98 if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE; 99 if (maxallocrow == MAT_SKIP_ALLOCATION) { 100 skipallocation = PETSC_TRUE; 101 maxallocrow = 0; 102 } 103 104 PetscCall(PetscLayoutSetUp(B->rmap)); 105 PetscCall(PetscLayoutSetUp(B->cmap)); 106 107 /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */ 108 if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5; 109 PetscCheck(maxallocrow >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "maxallocrow cannot be less than 0: value %" PetscInt_FMT, maxallocrow); 110 if (rlen) { 111 for (i = 0; i < B->rmap->n; i++) { 112 PetscCheck(rlen[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, rlen[i]); 113 PetscCheck(rlen[i] <= B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT, i, rlen[i], B->cmap->n); 114 } 115 } 116 117 B->preallocated = PETSC_TRUE; 118 119 b = (Mat_SeqSELL *)B->data; 120 121 if (!b->sliceheight) { /* not set yet */ 122 #if defined(PETSC_HAVE_CUDA) 123 b->sliceheight = 16; 124 #else 125 b->sliceheight = 8; 126 #endif 127 } 128 totalslices = PetscCeilInt(B->rmap->n, b->sliceheight); 129 b->totalslices = totalslices; 130 if (!skipallocation) { 131 if (B->rmap->n % b->sliceheight) PetscCall(PetscInfo(B, "Padding rows to the SEQSELL matrix because the number of rows is not the multiple of the slice height (value %" PetscInt_FMT ")\n", B->rmap->n)); 132 133 if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */ 134 PetscCall(PetscMalloc1(totalslices + 1, &b->sliidx)); 135 } 136 if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */ 137 if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10; 138 else if (maxallocrow < 0) maxallocrow = 1; 139 #if defined(PETSC_HAVE_CUDA) 140 rlenmax = maxallocrow; 141 /* Pad the slice to DEVICE_MEM_ALIGN */ 142 while (b->sliceheight * maxallocrow % DEVICE_MEM_ALIGN) maxallocrow++; 143 #endif 144 for (i = 0; i <= totalslices; i++) b->sliidx[i] = b->sliceheight * i * maxallocrow; 145 } else { 146 #if defined(PETSC_HAVE_CUDA) 147 PetscInt mul = DEVICE_MEM_ALIGN / b->sliceheight; 148 #endif 149 maxallocrow = 0; 150 b->sliidx[0] = 0; 151 for (i = 1; i < totalslices; i++) { 152 b->sliidx[i] = 0; 153 for (j = 0; j < b->sliceheight; j++) { b->sliidx[i] = PetscMax(b->sliidx[i], rlen[b->sliceheight * (i - 1) + j]); } 154 #if defined(PETSC_HAVE_CUDA) 155 rlenmax = PetscMax(b->sliidx[i], rlenmax); 156 /* Pad the slice to DEVICE_MEM_ALIGN */ 157 b->sliidx[i] = ((b->sliidx[i] - 1) / mul + 1) * mul; 158 #endif 159 maxallocrow = PetscMax(b->sliidx[i], maxallocrow); 160 PetscCall(PetscIntSumError(b->sliidx[i - 1], b->sliceheight * b->sliidx[i], &b->sliidx[i])); 161 } 162 /* last slice */ 163 b->sliidx[totalslices] = 0; 164 for (j = b->sliceheight * (totalslices - 1); j < B->rmap->n; j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices], rlen[j]); 165 #if defined(PETSC_HAVE_CUDA) 166 rlenmax = PetscMax(b->sliidx[i], rlenmax); 167 b->sliidx[totalslices] = ((b->sliidx[totalslices] - 1) / mul + 1) * mul; 168 #endif 169 maxallocrow = PetscMax(b->sliidx[totalslices], maxallocrow); 170 b->sliidx[totalslices] = b->sliidx[totalslices - 1] + b->sliceheight * b->sliidx[totalslices]; 171 } 172 173 /* allocate space for val, colidx, rlen */ 174 /* FIXME: should B's old memory be unlogged? */ 175 PetscCall(MatSeqXSELLFreeSELL(B, &b->val, &b->colidx)); 176 /* FIXME: assuming an element of the bit array takes 8 bits */ 177 PetscCall(PetscMalloc2(b->sliidx[totalslices], &b->val, b->sliidx[totalslices], &b->colidx)); 178 /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */ 179 PetscCall(PetscCalloc1(b->sliceheight * totalslices, &b->rlen)); 180 181 b->singlemalloc = PETSC_TRUE; 182 b->free_val = PETSC_TRUE; 183 b->free_colidx = PETSC_TRUE; 184 } else { 185 b->free_val = PETSC_FALSE; 186 b->free_colidx = PETSC_FALSE; 187 } 188 189 b->nz = 0; 190 b->maxallocrow = maxallocrow; 191 #if defined(PETSC_HAVE_CUDA) 192 b->rlenmax = rlenmax; 193 #else 194 b->rlenmax = maxallocrow; 195 #endif 196 b->maxallocmat = b->sliidx[totalslices]; 197 B->info.nz_unneeded = (double)b->maxallocmat; 198 if (realalloc) PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE)); 199 PetscFunctionReturn(PETSC_SUCCESS); 200 } 201 202 PetscErrorCode MatGetRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v) 203 { 204 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 205 PetscInt shift; 206 207 PetscFunctionBegin; 208 PetscCheck(row >= 0 && row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range", row); 209 if (nz) *nz = a->rlen[row]; 210 shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight); 211 if (!a->getrowcols) { PetscCall(PetscMalloc2(a->rlenmax, &a->getrowcols, a->rlenmax, &a->getrowvals)); } 212 if (idx) { 213 PetscInt j; 214 for (j = 0; j < a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift + a->sliceheight * j]; 215 *idx = a->getrowcols; 216 } 217 if (v) { 218 PetscInt j; 219 for (j = 0; j < a->rlen[row]; j++) a->getrowvals[j] = a->val[shift + a->sliceheight * j]; 220 *v = a->getrowvals; 221 } 222 PetscFunctionReturn(PETSC_SUCCESS); 223 } 224 225 PetscErrorCode MatRestoreRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v) 226 { 227 PetscFunctionBegin; 228 PetscFunctionReturn(PETSC_SUCCESS); 229 } 230 231 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat) 232 { 233 Mat B; 234 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 235 PetscInt i; 236 237 PetscFunctionBegin; 238 if (reuse == MAT_REUSE_MATRIX) { 239 B = *newmat; 240 PetscCall(MatZeroEntries(B)); 241 } else { 242 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B)); 243 PetscCall(MatSetSizes(B, A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N)); 244 PetscCall(MatSetType(B, MATSEQAIJ)); 245 PetscCall(MatSeqAIJSetPreallocation(B, 0, a->rlen)); 246 } 247 248 for (i = 0; i < A->rmap->n; i++) { 249 PetscInt nz = 0, *cols = NULL; 250 PetscScalar *vals = NULL; 251 252 PetscCall(MatGetRow_SeqSELL(A, i, &nz, &cols, &vals)); 253 PetscCall(MatSetValues(B, 1, &i, nz, cols, vals, INSERT_VALUES)); 254 PetscCall(MatRestoreRow_SeqSELL(A, i, &nz, &cols, &vals)); 255 } 256 257 PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY)); 258 PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY)); 259 B->rmap->bs = A->rmap->bs; 260 261 if (reuse == MAT_INPLACE_MATRIX) { 262 PetscCall(MatHeaderReplace(A, &B)); 263 } else { 264 *newmat = B; 265 } 266 PetscFunctionReturn(PETSC_SUCCESS); 267 } 268 269 #include <../src/mat/impls/aij/seq/aij.h> 270 271 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A, MatType newtype, MatReuse reuse, Mat *newmat) 272 { 273 Mat B; 274 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 275 PetscInt *ai = a->i, m = A->rmap->N, n = A->cmap->N, i, *rowlengths, row, ncols; 276 const PetscInt *cols; 277 const PetscScalar *vals; 278 279 PetscFunctionBegin; 280 281 if (reuse == MAT_REUSE_MATRIX) { 282 B = *newmat; 283 } else { 284 if (PetscDefined(USE_DEBUG) || !a->ilen) { 285 PetscCall(PetscMalloc1(m, &rowlengths)); 286 for (i = 0; i < m; i++) rowlengths[i] = ai[i + 1] - ai[i]; 287 } 288 if (PetscDefined(USE_DEBUG) && a->ilen) { 289 PetscBool eq; 290 PetscCall(PetscMemcmp(rowlengths, a->ilen, m * sizeof(PetscInt), &eq)); 291 PetscCheck(eq, PETSC_COMM_SELF, PETSC_ERR_PLIB, "SeqAIJ ilen array incorrect"); 292 PetscCall(PetscFree(rowlengths)); 293 rowlengths = a->ilen; 294 } else if (a->ilen) rowlengths = a->ilen; 295 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B)); 296 PetscCall(MatSetSizes(B, m, n, m, n)); 297 PetscCall(MatSetType(B, MATSEQSELL)); 298 PetscCall(MatSeqSELLSetPreallocation(B, 0, rowlengths)); 299 if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths)); 300 } 301 302 for (row = 0; row < m; row++) { 303 PetscCall(MatGetRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals)); 304 PetscCall(MatSetValues_SeqSELL(B, 1, &row, ncols, cols, vals, INSERT_VALUES)); 305 PetscCall(MatRestoreRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals)); 306 } 307 PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY)); 308 PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY)); 309 B->rmap->bs = A->rmap->bs; 310 311 if (reuse == MAT_INPLACE_MATRIX) { 312 PetscCall(MatHeaderReplace(A, &B)); 313 } else { 314 *newmat = B; 315 } 316 PetscFunctionReturn(PETSC_SUCCESS); 317 } 318 319 PetscErrorCode MatMult_SeqSELL(Mat A, Vec xx, Vec yy) 320 { 321 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 322 PetscScalar *y; 323 const PetscScalar *x; 324 const MatScalar *aval = a->val; 325 PetscInt totalslices = a->totalslices; 326 const PetscInt *acolidx = a->colidx; 327 PetscInt i, j; 328 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 329 __m512d vec_x, vec_y, vec_vals; 330 __m256i vec_idx; 331 __mmask8 mask; 332 __m512d vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4; 333 __m256i vec_idx2, vec_idx3, vec_idx4; 334 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 335 __m128i vec_idx; 336 __m256d vec_x, vec_y, vec_y2, vec_vals; 337 MatScalar yval; 338 PetscInt r, rows_left, row, nnz_in_row; 339 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 340 __m128d vec_x_tmp; 341 __m256d vec_x, vec_y, vec_y2, vec_vals; 342 MatScalar yval; 343 PetscInt r, rows_left, row, nnz_in_row; 344 #else 345 PetscInt k, sliceheight = a->sliceheight; 346 PetscScalar *sum; 347 #endif 348 349 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 350 #pragma disjoint(*x, *y, *aval) 351 #endif 352 353 PetscFunctionBegin; 354 PetscCall(VecGetArrayRead(xx, &x)); 355 PetscCall(VecGetArray(yy, &y)); 356 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 357 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 358 for (i = 0; i < totalslices; i++) { /* loop over slices */ 359 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 360 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 361 362 vec_y = _mm512_setzero_pd(); 363 vec_y2 = _mm512_setzero_pd(); 364 vec_y3 = _mm512_setzero_pd(); 365 vec_y4 = _mm512_setzero_pd(); 366 367 j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */ 368 switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) { 369 case 3: 370 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 371 acolidx += 8; 372 aval += 8; 373 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 374 acolidx += 8; 375 aval += 8; 376 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 377 acolidx += 8; 378 aval += 8; 379 j += 3; 380 break; 381 case 2: 382 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 383 acolidx += 8; 384 aval += 8; 385 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 386 acolidx += 8; 387 aval += 8; 388 j += 2; 389 break; 390 case 1: 391 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 392 acolidx += 8; 393 aval += 8; 394 j += 1; 395 break; 396 } 397 #pragma novector 398 for (; j < (a->sliidx[i + 1] >> 3); j += 4) { 399 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 400 acolidx += 8; 401 aval += 8; 402 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 403 acolidx += 8; 404 aval += 8; 405 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 406 acolidx += 8; 407 aval += 8; 408 AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4); 409 acolidx += 8; 410 aval += 8; 411 } 412 413 vec_y = _mm512_add_pd(vec_y, vec_y2); 414 vec_y = _mm512_add_pd(vec_y, vec_y3); 415 vec_y = _mm512_add_pd(vec_y, vec_y4); 416 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 417 mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07))); 418 _mm512_mask_storeu_pd(&y[8 * i], mask, vec_y); 419 } else { 420 _mm512_storeu_pd(&y[8 * i], vec_y); 421 } 422 } 423 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 424 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 425 for (i = 0; i < totalslices; i++) { /* loop over full slices */ 426 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 427 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 428 429 /* last slice may have padding rows. Don't use vectorization. */ 430 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 431 rows_left = A->rmap->n - 8 * i; 432 for (r = 0; r < rows_left; ++r) { 433 yval = (MatScalar)0; 434 row = 8 * i + r; 435 nnz_in_row = a->rlen[row]; 436 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]]; 437 y[row] = yval; 438 } 439 break; 440 } 441 442 vec_y = _mm256_setzero_pd(); 443 vec_y2 = _mm256_setzero_pd(); 444 445 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 446 #pragma novector 447 #pragma unroll(2) 448 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 449 AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 450 aval += 4; 451 acolidx += 4; 452 AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y2); 453 aval += 4; 454 acolidx += 4; 455 } 456 457 _mm256_storeu_pd(y + i * 8, vec_y); 458 _mm256_storeu_pd(y + i * 8 + 4, vec_y2); 459 } 460 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 461 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 462 for (i = 0; i < totalslices; i++) { /* loop over full slices */ 463 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 464 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 465 466 vec_y = _mm256_setzero_pd(); 467 vec_y2 = _mm256_setzero_pd(); 468 469 /* last slice may have padding rows. Don't use vectorization. */ 470 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 471 rows_left = A->rmap->n - 8 * i; 472 for (r = 0; r < rows_left; ++r) { 473 yval = (MatScalar)0; 474 row = 8 * i + r; 475 nnz_in_row = a->rlen[row]; 476 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]]; 477 y[row] = yval; 478 } 479 break; 480 } 481 482 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 483 #pragma novector 484 #pragma unroll(2) 485 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 486 vec_vals = _mm256_loadu_pd(aval); 487 vec_x_tmp = _mm_setzero_pd(); 488 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 489 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 490 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 491 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 492 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 493 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 494 vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y); 495 aval += 4; 496 497 vec_vals = _mm256_loadu_pd(aval); 498 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 499 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 500 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 501 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 502 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 503 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 504 vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2); 505 aval += 4; 506 } 507 508 _mm256_storeu_pd(y + i * 8, vec_y); 509 _mm256_storeu_pd(y + i * 8 + 4, vec_y2); 510 } 511 #else 512 PetscCall(PetscMalloc1(sliceheight, &sum)); 513 for (i = 0; i < totalslices; i++) { /* loop over slices */ 514 for (j = 0; j < sliceheight; j++) { 515 sum[j] = 0.0; 516 for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]]; 517 } 518 if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { /* if last slice has padding rows */ 519 for (j = 0; j < (A->rmap->n % sliceheight); j++) y[sliceheight * i + j] = sum[j]; 520 } else { 521 for (j = 0; j < sliceheight; j++) y[sliceheight * i + j] = sum[j]; 522 } 523 } 524 PetscCall(PetscFree(sum)); 525 #endif 526 527 PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); /* theoretical minimal FLOPs */ 528 PetscCall(VecRestoreArrayRead(xx, &x)); 529 PetscCall(VecRestoreArray(yy, &y)); 530 PetscFunctionReturn(PETSC_SUCCESS); 531 } 532 533 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h> 534 PetscErrorCode MatMultAdd_SeqSELL(Mat A, Vec xx, Vec yy, Vec zz) 535 { 536 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 537 PetscScalar *y, *z; 538 const PetscScalar *x; 539 const MatScalar *aval = a->val; 540 PetscInt totalslices = a->totalslices; 541 const PetscInt *acolidx = a->colidx; 542 PetscInt i, j; 543 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 544 __m512d vec_x, vec_y, vec_vals; 545 __m256i vec_idx; 546 __mmask8 mask = 0; 547 __m512d vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4; 548 __m256i vec_idx2, vec_idx3, vec_idx4; 549 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 550 __m128d vec_x_tmp; 551 __m256d vec_x, vec_y, vec_y2, vec_vals; 552 MatScalar yval; 553 PetscInt r, row, nnz_in_row; 554 #else 555 PetscInt k, sliceheight = a->sliceheight; 556 PetscScalar *sum; 557 #endif 558 559 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 560 #pragma disjoint(*x, *y, *aval) 561 #endif 562 563 PetscFunctionBegin; 564 if (!a->nz) { 565 PetscCall(VecCopy(yy, zz)); 566 PetscFunctionReturn(PETSC_SUCCESS); 567 } 568 PetscCall(VecGetArrayRead(xx, &x)); 569 PetscCall(VecGetArrayPair(yy, zz, &y, &z)); 570 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 571 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 572 for (i = 0; i < totalslices; i++) { /* loop over slices */ 573 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 574 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 575 576 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 577 mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07))); 578 vec_y = _mm512_mask_loadu_pd(vec_y, mask, &y[8 * i]); 579 } else { 580 vec_y = _mm512_loadu_pd(&y[8 * i]); 581 } 582 vec_y2 = _mm512_setzero_pd(); 583 vec_y3 = _mm512_setzero_pd(); 584 vec_y4 = _mm512_setzero_pd(); 585 586 j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */ 587 switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) { 588 case 3: 589 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 590 acolidx += 8; 591 aval += 8; 592 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 593 acolidx += 8; 594 aval += 8; 595 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 596 acolidx += 8; 597 aval += 8; 598 j += 3; 599 break; 600 case 2: 601 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 602 acolidx += 8; 603 aval += 8; 604 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 605 acolidx += 8; 606 aval += 8; 607 j += 2; 608 break; 609 case 1: 610 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 611 acolidx += 8; 612 aval += 8; 613 j += 1; 614 break; 615 } 616 #pragma novector 617 for (; j < (a->sliidx[i + 1] >> 3); j += 4) { 618 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 619 acolidx += 8; 620 aval += 8; 621 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 622 acolidx += 8; 623 aval += 8; 624 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 625 acolidx += 8; 626 aval += 8; 627 AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4); 628 acolidx += 8; 629 aval += 8; 630 } 631 632 vec_y = _mm512_add_pd(vec_y, vec_y2); 633 vec_y = _mm512_add_pd(vec_y, vec_y3); 634 vec_y = _mm512_add_pd(vec_y, vec_y4); 635 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 636 _mm512_mask_storeu_pd(&z[8 * i], mask, vec_y); 637 } else { 638 _mm512_storeu_pd(&z[8 * i], vec_y); 639 } 640 } 641 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 642 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 643 for (i = 0; i < totalslices; i++) { /* loop over full slices */ 644 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 645 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 646 647 /* last slice may have padding rows. Don't use vectorization. */ 648 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 649 for (r = 0; r < (A->rmap->n & 0x07); ++r) { 650 row = 8 * i + r; 651 yval = (MatScalar)0.0; 652 nnz_in_row = a->rlen[row]; 653 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]]; 654 z[row] = y[row] + yval; 655 } 656 break; 657 } 658 659 vec_y = _mm256_loadu_pd(y + 8 * i); 660 vec_y2 = _mm256_loadu_pd(y + 8 * i + 4); 661 662 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 663 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 664 vec_vals = _mm256_loadu_pd(aval); 665 vec_x_tmp = _mm_setzero_pd(); 666 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 667 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 668 vec_x = _mm256_setzero_pd(); 669 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 670 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 671 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 672 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 673 vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y); 674 aval += 4; 675 676 vec_vals = _mm256_loadu_pd(aval); 677 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 678 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 679 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 680 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 681 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 682 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 683 vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2); 684 aval += 4; 685 } 686 687 _mm256_storeu_pd(z + i * 8, vec_y); 688 _mm256_storeu_pd(z + i * 8 + 4, vec_y2); 689 } 690 #else 691 PetscCall(PetscMalloc1(sliceheight, &sum)); 692 for (i = 0; i < totalslices; i++) { /* loop over slices */ 693 for (j = 0; j < sliceheight; j++) { 694 sum[j] = 0.0; 695 for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]]; 696 } 697 if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { 698 for (j = 0; j < (A->rmap->n % sliceheight); j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j]; 699 } else { 700 for (j = 0; j < sliceheight; j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j]; 701 } 702 } 703 PetscCall(PetscFree(sum)); 704 #endif 705 706 PetscCall(PetscLogFlops(2.0 * a->nz)); 707 PetscCall(VecRestoreArrayRead(xx, &x)); 708 PetscCall(VecRestoreArrayPair(yy, zz, &y, &z)); 709 PetscFunctionReturn(PETSC_SUCCESS); 710 } 711 712 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A, Vec xx, Vec zz, Vec yy) 713 { 714 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 715 PetscScalar *y; 716 const PetscScalar *x; 717 const MatScalar *aval = a->val; 718 const PetscInt *acolidx = a->colidx; 719 PetscInt i, j, r, row, nnz_in_row, totalslices = a->totalslices, sliceheight = a->sliceheight; 720 721 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 722 #pragma disjoint(*x, *y, *aval) 723 #endif 724 725 PetscFunctionBegin; 726 if (A->symmetric == PETSC_BOOL3_TRUE) { 727 PetscCall(MatMultAdd_SeqSELL(A, xx, zz, yy)); 728 PetscFunctionReturn(PETSC_SUCCESS); 729 } 730 if (zz != yy) PetscCall(VecCopy(zz, yy)); 731 732 if (a->nz) { 733 PetscCall(VecGetArrayRead(xx, &x)); 734 PetscCall(VecGetArray(yy, &y)); 735 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 736 if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { 737 for (r = 0; r < (A->rmap->n % sliceheight); ++r) { 738 row = sliceheight * i + r; 739 nnz_in_row = a->rlen[row]; 740 for (j = 0; j < nnz_in_row; ++j) y[acolidx[sliceheight * j + r]] += aval[sliceheight * j + r] * x[row]; 741 } 742 break; 743 } 744 for (r = 0; r < sliceheight; ++r) 745 for (j = a->sliidx[i] + r; j < a->sliidx[i + 1]; j += sliceheight) y[acolidx[j]] += aval[j] * x[sliceheight * i + r]; 746 } 747 PetscCall(PetscLogFlops(2.0 * a->nz)); 748 PetscCall(VecRestoreArrayRead(xx, &x)); 749 PetscCall(VecRestoreArray(yy, &y)); 750 } 751 PetscFunctionReturn(PETSC_SUCCESS); 752 } 753 754 PetscErrorCode MatMultTranspose_SeqSELL(Mat A, Vec xx, Vec yy) 755 { 756 PetscFunctionBegin; 757 if (A->symmetric == PETSC_BOOL3_TRUE) { 758 PetscCall(MatMult_SeqSELL(A, xx, yy)); 759 } else { 760 PetscCall(VecSet(yy, 0.0)); 761 PetscCall(MatMultTransposeAdd_SeqSELL(A, xx, yy, yy)); 762 } 763 PetscFunctionReturn(PETSC_SUCCESS); 764 } 765 766 /* 767 Checks for missing diagonals 768 */ 769 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A, PetscBool *missing, PetscInt *d) 770 { 771 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 772 PetscInt *diag, i; 773 774 PetscFunctionBegin; 775 *missing = PETSC_FALSE; 776 if (A->rmap->n > 0 && !(a->colidx)) { 777 *missing = PETSC_TRUE; 778 if (d) *d = 0; 779 PetscCall(PetscInfo(A, "Matrix has no entries therefore is missing diagonal\n")); 780 } else { 781 diag = a->diag; 782 for (i = 0; i < A->rmap->n; i++) { 783 if (diag[i] == -1) { 784 *missing = PETSC_TRUE; 785 if (d) *d = i; 786 PetscCall(PetscInfo(A, "Matrix is missing diagonal number %" PetscInt_FMT "\n", i)); 787 break; 788 } 789 } 790 } 791 PetscFunctionReturn(PETSC_SUCCESS); 792 } 793 794 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A) 795 { 796 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 797 PetscInt i, j, m = A->rmap->n, shift; 798 799 PetscFunctionBegin; 800 if (!a->diag) { 801 PetscCall(PetscMalloc1(m, &a->diag)); 802 a->free_diag = PETSC_TRUE; 803 } 804 for (i = 0; i < m; i++) { /* loop over rows */ 805 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 806 a->diag[i] = -1; 807 for (j = 0; j < a->rlen[i]; j++) { 808 if (a->colidx[shift + a->sliceheight * j] == i) { 809 a->diag[i] = shift + a->sliceheight * j; 810 break; 811 } 812 } 813 } 814 PetscFunctionReturn(PETSC_SUCCESS); 815 } 816 817 /* 818 Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways 819 */ 820 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A, PetscScalar omega, PetscScalar fshift) 821 { 822 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 823 PetscInt i, *diag, m = A->rmap->n; 824 MatScalar *val = a->val; 825 PetscScalar *idiag, *mdiag; 826 827 PetscFunctionBegin; 828 if (a->idiagvalid) PetscFunctionReturn(PETSC_SUCCESS); 829 PetscCall(MatMarkDiagonal_SeqSELL(A)); 830 diag = a->diag; 831 if (!a->idiag) { 832 PetscCall(PetscMalloc3(m, &a->idiag, m, &a->mdiag, m, &a->ssor_work)); 833 val = a->val; 834 } 835 mdiag = a->mdiag; 836 idiag = a->idiag; 837 838 if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) { 839 for (i = 0; i < m; i++) { 840 mdiag[i] = val[diag[i]]; 841 if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */ 842 PetscCheck(PetscRealPart(fshift), PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Zero diagonal on row %" PetscInt_FMT, i); 843 PetscCall(PetscInfo(A, "Zero diagonal on row %" PetscInt_FMT "\n", i)); 844 A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 845 A->factorerror_zeropivot_value = 0.0; 846 A->factorerror_zeropivot_row = i; 847 } 848 idiag[i] = 1.0 / val[diag[i]]; 849 } 850 PetscCall(PetscLogFlops(m)); 851 } else { 852 for (i = 0; i < m; i++) { 853 mdiag[i] = val[diag[i]]; 854 idiag[i] = omega / (fshift + val[diag[i]]); 855 } 856 PetscCall(PetscLogFlops(2.0 * m)); 857 } 858 a->idiagvalid = PETSC_TRUE; 859 PetscFunctionReturn(PETSC_SUCCESS); 860 } 861 862 PetscErrorCode MatZeroEntries_SeqSELL(Mat A) 863 { 864 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 865 866 PetscFunctionBegin; 867 PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices])); 868 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 869 PetscFunctionReturn(PETSC_SUCCESS); 870 } 871 872 PetscErrorCode MatDestroy_SeqSELL(Mat A) 873 { 874 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 875 876 PetscFunctionBegin; 877 #if defined(PETSC_USE_LOG) 878 PetscCall(PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->n, A->cmap->n, a->nz)); 879 #endif 880 PetscCall(MatSeqXSELLFreeSELL(A, &a->val, &a->colidx)); 881 PetscCall(ISDestroy(&a->row)); 882 PetscCall(ISDestroy(&a->col)); 883 PetscCall(PetscFree(a->diag)); 884 PetscCall(PetscFree(a->rlen)); 885 PetscCall(PetscFree(a->sliidx)); 886 PetscCall(PetscFree3(a->idiag, a->mdiag, a->ssor_work)); 887 PetscCall(PetscFree(a->solve_work)); 888 PetscCall(ISDestroy(&a->icol)); 889 PetscCall(PetscFree(a->saved_values)); 890 PetscCall(PetscFree2(a->getrowcols, a->getrowvals)); 891 PetscCall(PetscFree(A->data)); 892 #if defined(PETSC_HAVE_CUDA) 893 PetscCall(PetscFree(a->chunk_slice_map)); 894 #endif 895 896 PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL)); 897 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL)); 898 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL)); 899 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetPreallocation_C", NULL)); 900 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetArray_C", NULL)); 901 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLRestoreArray_C", NULL)); 902 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqaij_C", NULL)); 903 #if defined(PETSC_HAVE_CUDA) 904 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqsellcuda_C", NULL)); 905 #endif 906 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetFillRatio_C", NULL)); 907 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetMaxSliceWidth_C", NULL)); 908 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetAvgSliceWidth_C", NULL)); 909 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetVarSliceSize_C", NULL)); 910 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetSliceHeight_C", NULL)); 911 PetscFunctionReturn(PETSC_SUCCESS); 912 } 913 914 PetscErrorCode MatSetOption_SeqSELL(Mat A, MatOption op, PetscBool flg) 915 { 916 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 917 918 PetscFunctionBegin; 919 switch (op) { 920 case MAT_ROW_ORIENTED: 921 a->roworiented = flg; 922 break; 923 case MAT_KEEP_NONZERO_PATTERN: 924 a->keepnonzeropattern = flg; 925 break; 926 case MAT_NEW_NONZERO_LOCATIONS: 927 a->nonew = (flg ? 0 : 1); 928 break; 929 case MAT_NEW_NONZERO_LOCATION_ERR: 930 a->nonew = (flg ? -1 : 0); 931 break; 932 case MAT_NEW_NONZERO_ALLOCATION_ERR: 933 a->nonew = (flg ? -2 : 0); 934 break; 935 case MAT_UNUSED_NONZERO_LOCATION_ERR: 936 a->nounused = (flg ? -1 : 0); 937 break; 938 case MAT_FORCE_DIAGONAL_ENTRIES: 939 case MAT_IGNORE_OFF_PROC_ENTRIES: 940 case MAT_USE_HASH_TABLE: 941 case MAT_SORTED_FULL: 942 PetscCall(PetscInfo(A, "Option %s ignored\n", MatOptions[op])); 943 break; 944 case MAT_SPD: 945 case MAT_SYMMETRIC: 946 case MAT_STRUCTURALLY_SYMMETRIC: 947 case MAT_HERMITIAN: 948 case MAT_SYMMETRY_ETERNAL: 949 case MAT_STRUCTURAL_SYMMETRY_ETERNAL: 950 case MAT_SPD_ETERNAL: 951 /* These options are handled directly by MatSetOption() */ 952 break; 953 default: 954 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unknown option %d", op); 955 } 956 PetscFunctionReturn(PETSC_SUCCESS); 957 } 958 959 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A, Vec v) 960 { 961 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 962 PetscInt i, j, n, shift; 963 PetscScalar *x, zero = 0.0; 964 965 PetscFunctionBegin; 966 PetscCall(VecGetLocalSize(v, &n)); 967 PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 968 969 if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) { 970 PetscInt *diag = a->diag; 971 PetscCall(VecGetArray(v, &x)); 972 for (i = 0; i < n; i++) x[i] = 1.0 / a->val[diag[i]]; 973 PetscCall(VecRestoreArray(v, &x)); 974 PetscFunctionReturn(PETSC_SUCCESS); 975 } 976 977 PetscCall(VecSet(v, zero)); 978 PetscCall(VecGetArray(v, &x)); 979 for (i = 0; i < n; i++) { /* loop over rows */ 980 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 981 x[i] = 0; 982 for (j = 0; j < a->rlen[i]; j++) { 983 if (a->colidx[shift + a->sliceheight * j] == i) { 984 x[i] = a->val[shift + a->sliceheight * j]; 985 break; 986 } 987 } 988 } 989 PetscCall(VecRestoreArray(v, &x)); 990 PetscFunctionReturn(PETSC_SUCCESS); 991 } 992 993 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A, Vec ll, Vec rr) 994 { 995 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 996 const PetscScalar *l, *r; 997 PetscInt i, j, m, n, row; 998 999 PetscFunctionBegin; 1000 if (ll) { 1001 /* The local size is used so that VecMPI can be passed to this routine 1002 by MatDiagonalScale_MPISELL */ 1003 PetscCall(VecGetLocalSize(ll, &m)); 1004 PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 1005 PetscCall(VecGetArrayRead(ll, &l)); 1006 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 1007 if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */ 1008 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) { 1009 if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= l[a->sliceheight * i + row]; 1010 } 1011 } else { 1012 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) { a->val[j] *= l[a->sliceheight * i + row]; } 1013 } 1014 } 1015 PetscCall(VecRestoreArrayRead(ll, &l)); 1016 PetscCall(PetscLogFlops(a->nz)); 1017 } 1018 if (rr) { 1019 PetscCall(VecGetLocalSize(rr, &n)); 1020 PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 1021 PetscCall(VecGetArrayRead(rr, &r)); 1022 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 1023 if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */ 1024 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) % a->sliceheight)) { 1025 if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= r[a->colidx[j]]; 1026 } 1027 } else { 1028 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j++) a->val[j] *= r[a->colidx[j]]; 1029 } 1030 } 1031 PetscCall(VecRestoreArrayRead(rr, &r)); 1032 PetscCall(PetscLogFlops(a->nz)); 1033 } 1034 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 1035 #if defined(PETSC_HAVE_CUDA) 1036 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU; 1037 #endif 1038 PetscFunctionReturn(PETSC_SUCCESS); 1039 } 1040 1041 PetscErrorCode MatGetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[]) 1042 { 1043 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1044 PetscInt *cp, i, k, low, high, t, row, col, l; 1045 PetscInt shift; 1046 MatScalar *vp; 1047 1048 PetscFunctionBegin; 1049 for (k = 0; k < m; k++) { /* loop over requested rows */ 1050 row = im[k]; 1051 if (row < 0) continue; 1052 PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1); 1053 shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight); /* starting index of the row */ 1054 cp = a->colidx + shift; /* pointer to the row */ 1055 vp = a->val + shift; /* pointer to the row */ 1056 for (l = 0; l < n; l++) { /* loop over requested columns */ 1057 col = in[l]; 1058 if (col < 0) continue; 1059 PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1); 1060 high = a->rlen[row]; 1061 low = 0; /* assume unsorted */ 1062 while (high - low > 5) { 1063 t = (low + high) / 2; 1064 if (*(cp + a->sliceheight * t) > col) high = t; 1065 else low = t; 1066 } 1067 for (i = low; i < high; i++) { 1068 if (*(cp + a->sliceheight * i) > col) break; 1069 if (*(cp + a->sliceheight * i) == col) { 1070 *v++ = *(vp + a->sliceheight * i); 1071 goto finished; 1072 } 1073 } 1074 *v++ = 0.0; 1075 finished:; 1076 } 1077 } 1078 PetscFunctionReturn(PETSC_SUCCESS); 1079 } 1080 1081 PetscErrorCode MatView_SeqSELL_ASCII(Mat A, PetscViewer viewer) 1082 { 1083 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1084 PetscInt i, j, m = A->rmap->n, shift; 1085 const char *name; 1086 PetscViewerFormat format; 1087 1088 PetscFunctionBegin; 1089 PetscCall(PetscViewerGetFormat(viewer, &format)); 1090 if (format == PETSC_VIEWER_ASCII_MATLAB) { 1091 PetscInt nofinalvalue = 0; 1092 /* 1093 if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) { 1094 nofinalvalue = 1; 1095 } 1096 */ 1097 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1098 PetscCall(PetscViewerASCIIPrintf(viewer, "%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n", m, A->cmap->n)); 1099 PetscCall(PetscViewerASCIIPrintf(viewer, "%% Nonzeros = %" PetscInt_FMT " \n", a->nz)); 1100 #if defined(PETSC_USE_COMPLEX) 1101 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",4);\n", a->nz + nofinalvalue)); 1102 #else 1103 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",3);\n", a->nz + nofinalvalue)); 1104 #endif 1105 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = [\n")); 1106 1107 for (i = 0; i < m; i++) { 1108 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1109 for (j = 0; j < a->rlen[i]; j++) { 1110 #if defined(PETSC_USE_COMPLEX) 1111 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1112 #else 1113 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)a->val[shift + a->sliceheight * j])); 1114 #endif 1115 } 1116 } 1117 /* 1118 if (nofinalvalue) { 1119 #if defined(PETSC_USE_COMPLEX) 1120 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n",m,A->cmap->n,0.,0.)); 1121 #else 1122 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n",m,A->cmap->n,0.0)); 1123 #endif 1124 } 1125 */ 1126 PetscCall(PetscObjectGetName((PetscObject)A, &name)); 1127 PetscCall(PetscViewerASCIIPrintf(viewer, "];\n %s = spconvert(zzz);\n", name)); 1128 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1129 } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) { 1130 PetscFunctionReturn(PETSC_SUCCESS); 1131 } else if (format == PETSC_VIEWER_ASCII_COMMON) { 1132 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1133 for (i = 0; i < m; i++) { 1134 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1135 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1136 for (j = 0; j < a->rlen[i]; j++) { 1137 #if defined(PETSC_USE_COMPLEX) 1138 if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) { 1139 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1140 } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) { 1141 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1142 } else if (PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) { 1143 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]))); 1144 } 1145 #else 1146 if (a->val[shift + a->sliceheight * j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j])); 1147 #endif 1148 } 1149 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1150 } 1151 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1152 } else if (format == PETSC_VIEWER_ASCII_DENSE) { 1153 PetscInt cnt = 0, jcnt; 1154 PetscScalar value; 1155 #if defined(PETSC_USE_COMPLEX) 1156 PetscBool realonly = PETSC_TRUE; 1157 for (i = 0; i < a->sliidx[a->totalslices]; i++) { 1158 if (PetscImaginaryPart(a->val[i]) != 0.0) { 1159 realonly = PETSC_FALSE; 1160 break; 1161 } 1162 } 1163 #endif 1164 1165 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1166 for (i = 0; i < m; i++) { 1167 jcnt = 0; 1168 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1169 for (j = 0; j < A->cmap->n; j++) { 1170 if (jcnt < a->rlen[i] && j == a->colidx[shift + a->sliceheight * j]) { 1171 value = a->val[cnt++]; 1172 jcnt++; 1173 } else { 1174 value = 0.0; 1175 } 1176 #if defined(PETSC_USE_COMPLEX) 1177 if (realonly) { 1178 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)PetscRealPart(value))); 1179 } else { 1180 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e+%7.5e i ", (double)PetscRealPart(value), (double)PetscImaginaryPart(value))); 1181 } 1182 #else 1183 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)value)); 1184 #endif 1185 } 1186 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1187 } 1188 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1189 } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) { 1190 PetscInt fshift = 1; 1191 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1192 #if defined(PETSC_USE_COMPLEX) 1193 PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate complex general\n")); 1194 #else 1195 PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate real general\n")); 1196 #endif 1197 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz)); 1198 for (i = 0; i < m; i++) { 1199 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1200 for (j = 0; j < a->rlen[i]; j++) { 1201 #if defined(PETSC_USE_COMPLEX) 1202 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1203 #else 1204 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)a->val[shift + a->sliceheight * j])); 1205 #endif 1206 } 1207 } 1208 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1209 } else if (format == PETSC_VIEWER_NATIVE) { 1210 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 1211 PetscInt row; 1212 PetscCall(PetscViewerASCIIPrintf(viewer, "slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n", i, a->sliidx[i], a->sliidx[i + 1])); 1213 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) { 1214 #if defined(PETSC_USE_COMPLEX) 1215 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1216 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1217 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1218 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), -(double)PetscImaginaryPart(a->val[j]))); 1219 } else { 1220 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]))); 1221 } 1222 #else 1223 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)a->val[j])); 1224 #endif 1225 } 1226 } 1227 } else { 1228 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1229 if (A->factortype) { 1230 for (i = 0; i < m; i++) { 1231 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1232 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1233 /* L part */ 1234 for (j = shift; j < a->diag[i]; j += a->sliceheight) { 1235 #if defined(PETSC_USE_COMPLEX) 1236 if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0) { 1237 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1238 } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0) { 1239 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j])))); 1240 } else { 1241 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j]))); 1242 } 1243 #else 1244 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j])); 1245 #endif 1246 } 1247 /* diagonal */ 1248 j = a->diag[i]; 1249 #if defined(PETSC_USE_COMPLEX) 1250 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1251 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)PetscImaginaryPart(1.0 / a->val[j]))); 1252 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1253 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)(-PetscImaginaryPart(1.0 / a->val[j])))); 1254 } else { 1255 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]))); 1256 } 1257 #else 1258 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)(1.0 / a->val[j]))); 1259 #endif 1260 1261 /* U part */ 1262 for (j = a->diag[i] + 1; j < shift + a->sliceheight * a->rlen[i]; j += a->sliceheight) { 1263 #if defined(PETSC_USE_COMPLEX) 1264 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1265 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1266 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1267 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j])))); 1268 } else { 1269 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j]))); 1270 } 1271 #else 1272 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j])); 1273 #endif 1274 } 1275 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1276 } 1277 } else { 1278 for (i = 0; i < m; i++) { 1279 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1280 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1281 for (j = 0; j < a->rlen[i]; j++) { 1282 #if defined(PETSC_USE_COMPLEX) 1283 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1284 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1285 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1286 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1287 } else { 1288 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]))); 1289 } 1290 #else 1291 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j])); 1292 #endif 1293 } 1294 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1295 } 1296 } 1297 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1298 } 1299 PetscCall(PetscViewerFlush(viewer)); 1300 PetscFunctionReturn(PETSC_SUCCESS); 1301 } 1302 1303 #include <petscdraw.h> 1304 PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw, void *Aa) 1305 { 1306 Mat A = (Mat)Aa; 1307 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1308 PetscInt i, j, m = A->rmap->n, shift; 1309 int color; 1310 PetscReal xl, yl, xr, yr, x_l, x_r, y_l, y_r; 1311 PetscViewer viewer; 1312 PetscViewerFormat format; 1313 1314 PetscFunctionBegin; 1315 PetscCall(PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer)); 1316 PetscCall(PetscViewerGetFormat(viewer, &format)); 1317 PetscCall(PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr)); 1318 1319 /* loop over matrix elements drawing boxes */ 1320 1321 if (format != PETSC_VIEWER_DRAW_CONTOUR) { 1322 PetscDrawCollectiveBegin(draw); 1323 /* Blue for negative, Cyan for zero and Red for positive */ 1324 color = PETSC_DRAW_BLUE; 1325 for (i = 0; i < m; i++) { 1326 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1327 y_l = m - i - 1.0; 1328 y_r = y_l + 1.0; 1329 for (j = 0; j < a->rlen[i]; j++) { 1330 x_l = a->colidx[shift + a->sliceheight * j]; 1331 x_r = x_l + 1.0; 1332 if (PetscRealPart(a->val[shift + a->sliceheight * j]) >= 0.) continue; 1333 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1334 } 1335 } 1336 color = PETSC_DRAW_CYAN; 1337 for (i = 0; i < m; i++) { 1338 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1339 y_l = m - i - 1.0; 1340 y_r = y_l + 1.0; 1341 for (j = 0; j < a->rlen[i]; j++) { 1342 x_l = a->colidx[shift + a->sliceheight * j]; 1343 x_r = x_l + 1.0; 1344 if (a->val[shift + a->sliceheight * j] != 0.) continue; 1345 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1346 } 1347 } 1348 color = PETSC_DRAW_RED; 1349 for (i = 0; i < m; i++) { 1350 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1351 y_l = m - i - 1.0; 1352 y_r = y_l + 1.0; 1353 for (j = 0; j < a->rlen[i]; j++) { 1354 x_l = a->colidx[shift + a->sliceheight * j]; 1355 x_r = x_l + 1.0; 1356 if (PetscRealPart(a->val[shift + a->sliceheight * j]) <= 0.) continue; 1357 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1358 } 1359 } 1360 PetscDrawCollectiveEnd(draw); 1361 } else { 1362 /* use contour shading to indicate magnitude of values */ 1363 /* first determine max of all nonzero values */ 1364 PetscReal minv = 0.0, maxv = 0.0; 1365 PetscInt count = 0; 1366 PetscDraw popup; 1367 for (i = 0; i < a->sliidx[a->totalslices]; i++) { 1368 if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]); 1369 } 1370 if (minv >= maxv) maxv = minv + PETSC_SMALL; 1371 PetscCall(PetscDrawGetPopup(draw, &popup)); 1372 PetscCall(PetscDrawScalePopup(popup, minv, maxv)); 1373 1374 PetscDrawCollectiveBegin(draw); 1375 for (i = 0; i < m; i++) { 1376 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1377 y_l = m - i - 1.0; 1378 y_r = y_l + 1.0; 1379 for (j = 0; j < a->rlen[i]; j++) { 1380 x_l = a->colidx[shift + a->sliceheight * j]; 1381 x_r = x_l + 1.0; 1382 color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]), minv, maxv); 1383 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1384 count++; 1385 } 1386 } 1387 PetscDrawCollectiveEnd(draw); 1388 } 1389 PetscFunctionReturn(PETSC_SUCCESS); 1390 } 1391 1392 #include <petscdraw.h> 1393 PetscErrorCode MatView_SeqSELL_Draw(Mat A, PetscViewer viewer) 1394 { 1395 PetscDraw draw; 1396 PetscReal xr, yr, xl, yl, h, w; 1397 PetscBool isnull; 1398 1399 PetscFunctionBegin; 1400 PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw)); 1401 PetscCall(PetscDrawIsNull(draw, &isnull)); 1402 if (isnull) PetscFunctionReturn(PETSC_SUCCESS); 1403 1404 xr = A->cmap->n; 1405 yr = A->rmap->n; 1406 h = yr / 10.0; 1407 w = xr / 10.0; 1408 xr += w; 1409 yr += h; 1410 xl = -w; 1411 yl = -h; 1412 PetscCall(PetscDrawSetCoordinates(draw, xl, yl, xr, yr)); 1413 PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer)); 1414 PetscCall(PetscDrawZoom(draw, MatView_SeqSELL_Draw_Zoom, A)); 1415 PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL)); 1416 PetscCall(PetscDrawSave(draw)); 1417 PetscFunctionReturn(PETSC_SUCCESS); 1418 } 1419 1420 PetscErrorCode MatView_SeqSELL(Mat A, PetscViewer viewer) 1421 { 1422 PetscBool iascii, isbinary, isdraw; 1423 1424 PetscFunctionBegin; 1425 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii)); 1426 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary)); 1427 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw)); 1428 if (iascii) { 1429 PetscCall(MatView_SeqSELL_ASCII(A, viewer)); 1430 } else if (isbinary) { 1431 /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */ 1432 } else if (isdraw) PetscCall(MatView_SeqSELL_Draw(A, viewer)); 1433 PetscFunctionReturn(PETSC_SUCCESS); 1434 } 1435 1436 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A, MatAssemblyType mode) 1437 { 1438 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1439 PetscInt i, shift, row_in_slice, row, nrow, *cp, lastcol, j, k; 1440 MatScalar *vp; 1441 #if defined(PETSC_HAVE_CUDA) 1442 PetscInt totalchunks = 0; 1443 #endif 1444 1445 PetscFunctionBegin; 1446 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 1447 /* To do: compress out the unused elements */ 1448 PetscCall(MatMarkDiagonal_SeqSELL(A)); 1449 PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n", A->rmap->n, A->cmap->n, a->maxallocmat, a->sliidx[a->totalslices], a->nz, a->sliidx[a->totalslices] - a->nz)); 1450 PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n", a->reallocs)); 1451 PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rlenmax)); 1452 a->nonzerorowcnt = 0; 1453 /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */ 1454 for (i = 0; i < a->totalslices; ++i) { 1455 shift = a->sliidx[i]; /* starting index of the slice */ 1456 cp = a->colidx + shift; /* pointer to the column indices of the slice */ 1457 vp = a->val + shift; /* pointer to the nonzero values of the slice */ 1458 for (row_in_slice = 0; row_in_slice < a->sliceheight; ++row_in_slice) { /* loop over rows in the slice */ 1459 row = a->sliceheight * i + row_in_slice; 1460 nrow = a->rlen[row]; /* number of nonzeros in row */ 1461 /* 1462 Search for the nearest nonzero. Normally setting the index to zero may cause extra communication. 1463 But if the entire slice are empty, it is fine to use 0 since the index will not be loaded. 1464 */ 1465 lastcol = 0; 1466 if (nrow > 0) { /* nonempty row */ 1467 a->nonzerorowcnt++; 1468 lastcol = cp[a->sliceheight * (nrow - 1) + row_in_slice]; /* use the index from the last nonzero at current row */ 1469 } else if (!row_in_slice) { /* first row of the correct slice is empty */ 1470 for (j = 1; j < a->sliceheight; j++) { 1471 if (a->rlen[a->sliceheight * i + j]) { 1472 lastcol = cp[j]; 1473 break; 1474 } 1475 } 1476 } else { 1477 if (a->sliidx[i + 1] != shift) lastcol = cp[row_in_slice - 1]; /* use the index from the previous row */ 1478 } 1479 1480 for (k = nrow; k < (a->sliidx[i + 1] - shift) / a->sliceheight; ++k) { 1481 cp[a->sliceheight * k + row_in_slice] = lastcol; 1482 vp[a->sliceheight * k + row_in_slice] = (MatScalar)0; 1483 } 1484 } 1485 } 1486 1487 A->info.mallocs += a->reallocs; 1488 a->reallocs = 0; 1489 1490 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 1491 #if defined(PETSC_HAVE_CUDA) 1492 if (!a->chunksize && a->totalslices) { 1493 a->chunksize = 64; 1494 while (a->chunksize < 1024 && 2 * a->chunksize <= a->sliidx[a->totalslices] / a->totalslices) a->chunksize *= 2; 1495 totalchunks = 1 + (a->sliidx[a->totalslices] - 1) / a->chunksize; 1496 } 1497 if (totalchunks != a->totalchunks) { 1498 PetscCall(PetscFree(a->chunk_slice_map)); 1499 PetscCall(PetscMalloc1(totalchunks, &a->chunk_slice_map)); 1500 a->totalchunks = totalchunks; 1501 } 1502 j = 0; 1503 for (i = 0; i < totalchunks; i++) { 1504 while (a->sliidx[j + 1] <= i * a->chunksize && j < a->totalslices) j++; 1505 a->chunk_slice_map[i] = j; 1506 } 1507 #endif 1508 PetscFunctionReturn(PETSC_SUCCESS); 1509 } 1510 1511 PetscErrorCode MatGetInfo_SeqSELL(Mat A, MatInfoType flag, MatInfo *info) 1512 { 1513 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1514 1515 PetscFunctionBegin; 1516 info->block_size = 1.0; 1517 info->nz_allocated = a->maxallocmat; 1518 info->nz_used = a->sliidx[a->totalslices]; /* include padding zeros */ 1519 info->nz_unneeded = (a->maxallocmat - a->sliidx[a->totalslices]); 1520 info->assemblies = A->num_ass; 1521 info->mallocs = A->info.mallocs; 1522 info->memory = 0; /* REVIEW ME */ 1523 if (A->factortype) { 1524 info->fill_ratio_given = A->info.fill_ratio_given; 1525 info->fill_ratio_needed = A->info.fill_ratio_needed; 1526 info->factor_mallocs = A->info.factor_mallocs; 1527 } else { 1528 info->fill_ratio_given = 0; 1529 info->fill_ratio_needed = 0; 1530 info->factor_mallocs = 0; 1531 } 1532 PetscFunctionReturn(PETSC_SUCCESS); 1533 } 1534 1535 PetscErrorCode MatSetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is) 1536 { 1537 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1538 PetscInt shift, i, k, l, low, high, t, ii, row, col, nrow; 1539 PetscInt *cp, nonew = a->nonew, lastcol = -1; 1540 MatScalar *vp, value; 1541 #if defined(PETSC_HAVE_CUDA) 1542 PetscBool inserted = PETSC_FALSE; 1543 PetscInt mul = DEVICE_MEM_ALIGN / a->sliceheight; 1544 #endif 1545 1546 PetscFunctionBegin; 1547 for (k = 0; k < m; k++) { /* loop over added rows */ 1548 row = im[k]; 1549 if (row < 0) continue; 1550 PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1); 1551 shift = a->sliidx[row / a->sliceheight] + row % a->sliceheight; /* starting index of the row */ 1552 cp = a->colidx + shift; /* pointer to the row */ 1553 vp = a->val + shift; /* pointer to the row */ 1554 nrow = a->rlen[row]; 1555 low = 0; 1556 high = nrow; 1557 1558 for (l = 0; l < n; l++) { /* loop over added columns */ 1559 col = in[l]; 1560 if (col < 0) continue; 1561 PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1); 1562 if (a->roworiented) { 1563 value = v[l + k * n]; 1564 } else { 1565 value = v[k + l * m]; 1566 } 1567 if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue; 1568 1569 /* search in this row for the specified column, i indicates the column to be set */ 1570 if (col <= lastcol) low = 0; 1571 else high = nrow; 1572 lastcol = col; 1573 while (high - low > 5) { 1574 t = (low + high) / 2; 1575 if (*(cp + a->sliceheight * t) > col) high = t; 1576 else low = t; 1577 } 1578 for (i = low; i < high; i++) { 1579 if (*(cp + a->sliceheight * i) > col) break; 1580 if (*(cp + a->sliceheight * i) == col) { 1581 if (is == ADD_VALUES) *(vp + a->sliceheight * i) += value; 1582 else *(vp + a->sliceheight * i) = value; 1583 #if defined(PETSC_HAVE_CUDA) 1584 inserted = PETSC_TRUE; 1585 #endif 1586 low = i + 1; 1587 goto noinsert; 1588 } 1589 } 1590 if (value == 0.0 && a->ignorezeroentries) goto noinsert; 1591 if (nonew == 1) goto noinsert; 1592 PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col); 1593 #if defined(PETSC_HAVE_CUDA) 1594 MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar, mul); 1595 #else 1596 /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */ 1597 MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar, 1); 1598 #endif 1599 /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */ 1600 for (ii = nrow - 1; ii >= i; ii--) { 1601 *(cp + a->sliceheight * (ii + 1)) = *(cp + a->sliceheight * ii); 1602 *(vp + a->sliceheight * (ii + 1)) = *(vp + a->sliceheight * ii); 1603 } 1604 a->rlen[row]++; 1605 *(cp + a->sliceheight * i) = col; 1606 *(vp + a->sliceheight * i) = value; 1607 a->nz++; 1608 A->nonzerostate++; 1609 #if defined(PETSC_HAVE_CUDA) 1610 inserted = PETSC_TRUE; 1611 #endif 1612 low = i + 1; 1613 high++; 1614 nrow++; 1615 noinsert:; 1616 } 1617 a->rlen[row] = nrow; 1618 } 1619 #if defined(PETSC_HAVE_CUDA) 1620 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && inserted) A->offloadmask = PETSC_OFFLOAD_CPU; 1621 #endif 1622 PetscFunctionReturn(PETSC_SUCCESS); 1623 } 1624 1625 PetscErrorCode MatCopy_SeqSELL(Mat A, Mat B, MatStructure str) 1626 { 1627 PetscFunctionBegin; 1628 /* If the two matrices have the same copy implementation, use fast copy. */ 1629 if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) { 1630 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1631 Mat_SeqSELL *b = (Mat_SeqSELL *)B->data; 1632 1633 PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices], PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Number of nonzeros in two matrices are different"); 1634 PetscCall(PetscArraycpy(b->val, a->val, a->sliidx[a->totalslices])); 1635 } else { 1636 PetscCall(MatCopy_Basic(A, B, str)); 1637 } 1638 PetscFunctionReturn(PETSC_SUCCESS); 1639 } 1640 1641 PetscErrorCode MatSetUp_SeqSELL(Mat A) 1642 { 1643 PetscFunctionBegin; 1644 PetscCall(MatSeqSELLSetPreallocation(A, PETSC_DEFAULT, NULL)); 1645 PetscFunctionReturn(PETSC_SUCCESS); 1646 } 1647 1648 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A, PetscScalar *array[]) 1649 { 1650 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1651 1652 PetscFunctionBegin; 1653 *array = a->val; 1654 PetscFunctionReturn(PETSC_SUCCESS); 1655 } 1656 1657 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A, PetscScalar *array[]) 1658 { 1659 PetscFunctionBegin; 1660 PetscFunctionReturn(PETSC_SUCCESS); 1661 } 1662 1663 PetscErrorCode MatRealPart_SeqSELL(Mat A) 1664 { 1665 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1666 PetscInt i; 1667 MatScalar *aval = a->val; 1668 1669 PetscFunctionBegin; 1670 for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscRealPart(aval[i]); 1671 #if defined(PETSC_HAVE_CUDA) 1672 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU; 1673 #endif 1674 PetscFunctionReturn(PETSC_SUCCESS); 1675 } 1676 1677 PetscErrorCode MatImaginaryPart_SeqSELL(Mat A) 1678 { 1679 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1680 PetscInt i; 1681 MatScalar *aval = a->val; 1682 1683 PetscFunctionBegin; 1684 for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]); 1685 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 1686 #if defined(PETSC_HAVE_CUDA) 1687 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU; 1688 #endif 1689 PetscFunctionReturn(PETSC_SUCCESS); 1690 } 1691 1692 PetscErrorCode MatScale_SeqSELL(Mat inA, PetscScalar alpha) 1693 { 1694 Mat_SeqSELL *a = (Mat_SeqSELL *)inA->data; 1695 MatScalar *aval = a->val; 1696 PetscScalar oalpha = alpha; 1697 PetscBLASInt one = 1, size; 1698 1699 PetscFunctionBegin; 1700 PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices], &size)); 1701 PetscCallBLAS("BLASscal", BLASscal_(&size, &oalpha, aval, &one)); 1702 PetscCall(PetscLogFlops(a->nz)); 1703 PetscCall(MatSeqSELLInvalidateDiagonal(inA)); 1704 #if defined(PETSC_HAVE_CUDA) 1705 if (inA->offloadmask != PETSC_OFFLOAD_UNALLOCATED) inA->offloadmask = PETSC_OFFLOAD_CPU; 1706 #endif 1707 PetscFunctionReturn(PETSC_SUCCESS); 1708 } 1709 1710 PetscErrorCode MatShift_SeqSELL(Mat Y, PetscScalar a) 1711 { 1712 Mat_SeqSELL *y = (Mat_SeqSELL *)Y->data; 1713 1714 PetscFunctionBegin; 1715 if (!Y->preallocated || !y->nz) PetscCall(MatSeqSELLSetPreallocation(Y, 1, NULL)); 1716 PetscCall(MatShift_Basic(Y, a)); 1717 PetscFunctionReturn(PETSC_SUCCESS); 1718 } 1719 1720 PetscErrorCode MatSOR_SeqSELL(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx) 1721 { 1722 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1723 PetscScalar *x, sum, *t; 1724 const MatScalar *idiag = NULL, *mdiag; 1725 const PetscScalar *b, *xb; 1726 PetscInt n, m = A->rmap->n, i, j, shift; 1727 const PetscInt *diag; 1728 1729 PetscFunctionBegin; 1730 its = its * lits; 1731 1732 if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */ 1733 if (!a->idiagvalid) PetscCall(MatInvertDiagonal_SeqSELL(A, omega, fshift)); 1734 a->fshift = fshift; 1735 a->omega = omega; 1736 1737 diag = a->diag; 1738 t = a->ssor_work; 1739 idiag = a->idiag; 1740 mdiag = a->mdiag; 1741 1742 PetscCall(VecGetArray(xx, &x)); 1743 PetscCall(VecGetArrayRead(bb, &b)); 1744 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 1745 PetscCheck(flag != SOR_APPLY_UPPER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_UPPER is not implemented"); 1746 PetscCheck(flag != SOR_APPLY_LOWER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_LOWER is not implemented"); 1747 PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat"); 1748 1749 if (flag & SOR_ZERO_INITIAL_GUESS) { 1750 if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) { 1751 for (i = 0; i < m; i++) { 1752 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1753 sum = b[i]; 1754 n = (diag[i] - shift) / a->sliceheight; 1755 for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]]; 1756 t[i] = sum; 1757 x[i] = sum * idiag[i]; 1758 } 1759 xb = t; 1760 PetscCall(PetscLogFlops(a->nz)); 1761 } else xb = b; 1762 if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1763 for (i = m - 1; i >= 0; i--) { 1764 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1765 sum = xb[i]; 1766 n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1; 1767 for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]]; 1768 if (xb == b) { 1769 x[i] = sum * idiag[i]; 1770 } else { 1771 x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */ 1772 } 1773 } 1774 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */ 1775 } 1776 its--; 1777 } 1778 while (its--) { 1779 if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) { 1780 for (i = 0; i < m; i++) { 1781 /* lower */ 1782 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1783 sum = b[i]; 1784 n = (diag[i] - shift) / a->sliceheight; 1785 for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]]; 1786 t[i] = sum; /* save application of the lower-triangular part */ 1787 /* upper */ 1788 n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1; 1789 for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]]; 1790 x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */ 1791 } 1792 xb = t; 1793 PetscCall(PetscLogFlops(2.0 * a->nz)); 1794 } else xb = b; 1795 if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1796 for (i = m - 1; i >= 0; i--) { 1797 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1798 sum = xb[i]; 1799 if (xb == b) { 1800 /* whole matrix (no checkpointing available) */ 1801 n = a->rlen[i]; 1802 for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]]; 1803 x[i] = (1. - omega) * x[i] + (sum + mdiag[i] * x[i]) * idiag[i]; 1804 } else { /* lower-triangular part has been saved, so only apply upper-triangular */ 1805 n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1; 1806 for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]]; 1807 x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */ 1808 } 1809 } 1810 if (xb == b) { 1811 PetscCall(PetscLogFlops(2.0 * a->nz)); 1812 } else { 1813 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */ 1814 } 1815 } 1816 } 1817 PetscCall(VecRestoreArray(xx, &x)); 1818 PetscCall(VecRestoreArrayRead(bb, &b)); 1819 PetscFunctionReturn(PETSC_SUCCESS); 1820 } 1821 1822 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL, 1823 MatGetRow_SeqSELL, 1824 MatRestoreRow_SeqSELL, 1825 MatMult_SeqSELL, 1826 /* 4*/ MatMultAdd_SeqSELL, 1827 MatMultTranspose_SeqSELL, 1828 MatMultTransposeAdd_SeqSELL, 1829 NULL, 1830 NULL, 1831 NULL, 1832 /* 10*/ NULL, 1833 NULL, 1834 NULL, 1835 MatSOR_SeqSELL, 1836 NULL, 1837 /* 15*/ MatGetInfo_SeqSELL, 1838 MatEqual_SeqSELL, 1839 MatGetDiagonal_SeqSELL, 1840 MatDiagonalScale_SeqSELL, 1841 NULL, 1842 /* 20*/ NULL, 1843 MatAssemblyEnd_SeqSELL, 1844 MatSetOption_SeqSELL, 1845 MatZeroEntries_SeqSELL, 1846 /* 24*/ NULL, 1847 NULL, 1848 NULL, 1849 NULL, 1850 NULL, 1851 /* 29*/ MatSetUp_SeqSELL, 1852 NULL, 1853 NULL, 1854 NULL, 1855 NULL, 1856 /* 34*/ MatDuplicate_SeqSELL, 1857 NULL, 1858 NULL, 1859 NULL, 1860 NULL, 1861 /* 39*/ NULL, 1862 NULL, 1863 NULL, 1864 MatGetValues_SeqSELL, 1865 MatCopy_SeqSELL, 1866 /* 44*/ NULL, 1867 MatScale_SeqSELL, 1868 MatShift_SeqSELL, 1869 NULL, 1870 NULL, 1871 /* 49*/ NULL, 1872 NULL, 1873 NULL, 1874 NULL, 1875 NULL, 1876 /* 54*/ MatFDColoringCreate_SeqXAIJ, 1877 NULL, 1878 NULL, 1879 NULL, 1880 NULL, 1881 /* 59*/ NULL, 1882 MatDestroy_SeqSELL, 1883 MatView_SeqSELL, 1884 NULL, 1885 NULL, 1886 /* 64*/ NULL, 1887 NULL, 1888 NULL, 1889 NULL, 1890 NULL, 1891 /* 69*/ NULL, 1892 NULL, 1893 NULL, 1894 NULL, 1895 NULL, 1896 /* 74*/ NULL, 1897 MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */ 1898 NULL, 1899 NULL, 1900 NULL, 1901 /* 79*/ NULL, 1902 NULL, 1903 NULL, 1904 NULL, 1905 NULL, 1906 /* 84*/ NULL, 1907 NULL, 1908 NULL, 1909 NULL, 1910 NULL, 1911 /* 89*/ NULL, 1912 NULL, 1913 NULL, 1914 NULL, 1915 NULL, 1916 /* 94*/ NULL, 1917 NULL, 1918 NULL, 1919 NULL, 1920 NULL, 1921 /* 99*/ NULL, 1922 NULL, 1923 NULL, 1924 MatConjugate_SeqSELL, 1925 NULL, 1926 /*104*/ NULL, 1927 NULL, 1928 NULL, 1929 NULL, 1930 NULL, 1931 /*109*/ NULL, 1932 NULL, 1933 NULL, 1934 NULL, 1935 MatMissingDiagonal_SeqSELL, 1936 /*114*/ NULL, 1937 NULL, 1938 NULL, 1939 NULL, 1940 NULL, 1941 /*119*/ NULL, 1942 NULL, 1943 NULL, 1944 NULL, 1945 NULL, 1946 /*124*/ NULL, 1947 NULL, 1948 NULL, 1949 NULL, 1950 NULL, 1951 /*129*/ NULL, 1952 NULL, 1953 NULL, 1954 NULL, 1955 NULL, 1956 /*134*/ NULL, 1957 NULL, 1958 NULL, 1959 NULL, 1960 NULL, 1961 /*139*/ NULL, 1962 NULL, 1963 NULL, 1964 MatFDColoringSetUp_SeqXAIJ, 1965 NULL, 1966 /*144*/ NULL, 1967 NULL, 1968 NULL, 1969 NULL, 1970 NULL, 1971 NULL, 1972 /*150*/ NULL, 1973 NULL}; 1974 1975 PetscErrorCode MatStoreValues_SeqSELL(Mat mat) 1976 { 1977 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1978 1979 PetscFunctionBegin; 1980 PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first"); 1981 1982 /* allocate space for values if not already there */ 1983 if (!a->saved_values) PetscCall(PetscMalloc1(a->sliidx[a->totalslices] + 1, &a->saved_values)); 1984 1985 /* copy values over */ 1986 PetscCall(PetscArraycpy(a->saved_values, a->val, a->sliidx[a->totalslices])); 1987 PetscFunctionReturn(PETSC_SUCCESS); 1988 } 1989 1990 PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat) 1991 { 1992 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1993 1994 PetscFunctionBegin; 1995 PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first"); 1996 PetscCheck(a->saved_values, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatStoreValues(A);first"); 1997 PetscCall(PetscArraycpy(a->val, a->saved_values, a->sliidx[a->totalslices])); 1998 PetscFunctionReturn(PETSC_SUCCESS); 1999 } 2000 2001 PetscErrorCode MatSeqSELLGetFillRatio_SeqSELL(Mat mat, PetscReal *ratio) 2002 { 2003 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 2004 2005 PetscFunctionBegin; 2006 if (a->totalslices && a->sliidx[a->totalslices]) { 2007 *ratio = (PetscReal)(a->sliidx[a->totalslices] - a->nz) / a->sliidx[a->totalslices]; 2008 } else { 2009 *ratio = 0.0; 2010 } 2011 PetscFunctionReturn(PETSC_SUCCESS); 2012 } 2013 2014 PetscErrorCode MatSeqSELLGetMaxSliceWidth_SeqSELL(Mat mat, PetscInt *slicewidth) 2015 { 2016 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 2017 PetscInt i, current_slicewidth; 2018 2019 PetscFunctionBegin; 2020 *slicewidth = 0; 2021 for (i = 0; i < a->totalslices; i++) { 2022 current_slicewidth = (a->sliidx[i + 1] - a->sliidx[i]) / a->sliceheight; 2023 if (current_slicewidth > *slicewidth) *slicewidth = current_slicewidth; 2024 } 2025 PetscFunctionReturn(PETSC_SUCCESS); 2026 } 2027 2028 PetscErrorCode MatSeqSELLGetAvgSliceWidth_SeqSELL(Mat mat, PetscReal *slicewidth) 2029 { 2030 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 2031 2032 PetscFunctionBegin; 2033 *slicewidth = 0; 2034 if (a->totalslices) { *slicewidth = (PetscReal)a->sliidx[a->totalslices] / a->sliceheight / a->totalslices; } 2035 PetscFunctionReturn(PETSC_SUCCESS); 2036 } 2037 2038 PetscErrorCode MatSeqSELLGetVarSliceSize_SeqSELL(Mat mat, PetscReal *variance) 2039 { 2040 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 2041 PetscReal mean; 2042 PetscInt i, totalslices = a->totalslices, *sliidx = a->sliidx; 2043 2044 PetscFunctionBegin; 2045 *variance = 0; 2046 if (totalslices) { 2047 mean = (PetscReal)sliidx[totalslices] / totalslices; 2048 for (i = 1; i <= totalslices; i++) { *variance += ((PetscReal)(sliidx[i] - sliidx[i - 1]) - mean) * ((PetscReal)(sliidx[i] - sliidx[i - 1]) - mean) / totalslices; } 2049 } 2050 PetscFunctionReturn(PETSC_SUCCESS); 2051 } 2052 2053 PetscErrorCode MatSeqSELLSetSliceHeight_SeqSELL(Mat A, PetscInt sliceheight) 2054 { 2055 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 2056 2057 PetscFunctionBegin; 2058 if (A->preallocated) PetscFunctionReturn(PETSC_SUCCESS); 2059 PetscCheck(a->sliceheight <= 0 || a->sliceheight == sliceheight, PETSC_COMM_SELF, PETSC_ERR_SUP, "Cannot change slice height %" PetscInt_FMT " to %" PetscInt_FMT, a->sliceheight, sliceheight); 2060 a->sliceheight = sliceheight; 2061 #if defined(PETSC_HAVE_CUDA) 2062 PetscCheck(DEVICE_MEM_ALIGN % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "DEVICE_MEM_ALIGN is not divisible by the slice height %" PetscInt_FMT, sliceheight); 2063 #endif 2064 PetscFunctionReturn(PETSC_SUCCESS); 2065 } 2066 2067 /*@C 2068 MatSeqSELLRestoreArray - returns access to the array where the data for a `MATSEQSELL` matrix is stored obtained by `MatSeqSELLGetArray()` 2069 2070 Not Collective 2071 2072 Input Parameters: 2073 + A - a `MATSEQSELL` matrix 2074 - array - pointer to the data 2075 2076 Level: intermediate 2077 2078 .seealso: `Mat`, `MATSEQSELL`, `MatSeqSELLGetArray()`, `MatSeqSELLRestoreArrayF90()` 2079 @*/ 2080 PetscErrorCode MatSeqSELLRestoreArray(Mat A, PetscScalar **array) 2081 { 2082 PetscFunctionBegin; 2083 PetscUseMethod(A, "MatSeqSELLRestoreArray_C", (Mat, PetscScalar **), (A, array)); 2084 PetscFunctionReturn(PETSC_SUCCESS); 2085 } 2086 2087 /*@C 2088 MatSeqSELLGetFillRatio - returns a ratio that indicates the irregularity of the matrix. 2089 2090 Not Collective 2091 2092 Input Parameter: 2093 . A - a MATSEQSELL matrix 2094 2095 Output Parameter: 2096 . ratio - ratio of number of padded zeros to number of allocated elements 2097 2098 Level: intermediate 2099 @*/ 2100 PetscErrorCode MatSeqSELLGetFillRatio(Mat A, PetscReal *ratio) 2101 { 2102 PetscFunctionBegin; 2103 PetscUseMethod(A, "MatSeqSELLGetFillRatio_C", (Mat, PetscReal *), (A, ratio)); 2104 PetscFunctionReturn(PETSC_SUCCESS); 2105 } 2106 2107 /*@C 2108 MatSeqSELLGetMaxSliceWidth - returns the maximum slice width. 2109 2110 Not Collective 2111 2112 Input Parameter: 2113 . A - a MATSEQSELL matrix 2114 2115 Output Parameter: 2116 . slicewidth - maximum slice width 2117 2118 Level: intermediate 2119 @*/ 2120 PetscErrorCode MatSeqSELLGetMaxSliceWidth(Mat A, PetscInt *slicewidth) 2121 { 2122 PetscFunctionBegin; 2123 PetscUseMethod(A, "MatSeqSELLGetMaxSliceWidth_C", (Mat, PetscInt *), (A, slicewidth)); 2124 PetscFunctionReturn(PETSC_SUCCESS); 2125 } 2126 2127 /*@C 2128 MatSeqSELLGetAvgSliceWidth - returns the average slice width. 2129 2130 Not Collective 2131 2132 Input Parameter: 2133 . A - a MATSEQSELL matrix 2134 2135 Output Parameter: 2136 . slicewidth - average slice width 2137 2138 Level: intermediate 2139 @*/ 2140 PetscErrorCode MatSeqSELLGetAvgSliceWidth(Mat A, PetscReal *slicewidth) 2141 { 2142 PetscFunctionBegin; 2143 PetscUseMethod(A, "MatSeqSELLGetAvgSliceWidth_C", (Mat, PetscReal *), (A, slicewidth)); 2144 PetscFunctionReturn(PETSC_SUCCESS); 2145 } 2146 2147 /*@C 2148 MatSeqSELLSetSliceHeight - sets the slice height. 2149 2150 Not Collective 2151 2152 Input Parameters: 2153 + A - a MATSEQSELL matrix 2154 - sliceheight - slice height 2155 2156 Notes: 2157 You cannot change the slice height once it have been set. 2158 2159 The slice height must be set before MatSetUp() or MatXXXSetPreallocation() is called. 2160 2161 Level: intermediate 2162 @*/ 2163 PetscErrorCode MatSeqSELLSetSliceHeight(Mat A, PetscInt sliceheight) 2164 { 2165 PetscFunctionBegin; 2166 PetscUseMethod(A, "MatSeqSELLSetSliceHeight_C", (Mat, PetscInt), (A, sliceheight)); 2167 PetscFunctionReturn(PETSC_SUCCESS); 2168 } 2169 2170 /*@C 2171 MatSeqSELLGetVarSliceSize - returns the variance of the slice size. 2172 2173 Not Collective 2174 2175 Input Parameter: 2176 . A - a MATSEQSELL matrix 2177 2178 Output Parameter: 2179 . variance - variance of the slice size 2180 2181 Level: intermediate 2182 @*/ 2183 PetscErrorCode MatSeqSELLGetVarSliceSize(Mat A, PetscReal *variance) 2184 { 2185 PetscFunctionBegin; 2186 PetscUseMethod(A, "MatSeqSELLGetVarSliceSize_C", (Mat, PetscReal *), (A, variance)); 2187 PetscFunctionReturn(PETSC_SUCCESS); 2188 } 2189 2190 #if defined(PETSC_HAVE_CUDA) 2191 PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat); 2192 #endif 2193 2194 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B) 2195 { 2196 Mat_SeqSELL *b; 2197 PetscMPIInt size; 2198 2199 PetscFunctionBegin; 2200 PetscCall(PetscCitationsRegister(citation, &cited)); 2201 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size)); 2202 PetscCheck(size <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Comm must be of size 1"); 2203 2204 PetscCall(PetscNew(&b)); 2205 2206 B->data = (void *)b; 2207 B->ops[0] = MatOps_Values; 2208 2209 b->row = NULL; 2210 b->col = NULL; 2211 b->icol = NULL; 2212 b->reallocs = 0; 2213 b->ignorezeroentries = PETSC_FALSE; 2214 b->roworiented = PETSC_TRUE; 2215 b->nonew = 0; 2216 b->diag = NULL; 2217 b->solve_work = NULL; 2218 B->spptr = NULL; 2219 b->saved_values = NULL; 2220 b->idiag = NULL; 2221 b->mdiag = NULL; 2222 b->ssor_work = NULL; 2223 b->omega = 1.0; 2224 b->fshift = 0.0; 2225 b->idiagvalid = PETSC_FALSE; 2226 b->keepnonzeropattern = PETSC_FALSE; 2227 b->sliceheight = 0; 2228 2229 PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELL)); 2230 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetArray_C", MatSeqSELLGetArray_SeqSELL)); 2231 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLRestoreArray_C", MatSeqSELLRestoreArray_SeqSELL)); 2232 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqSELL)); 2233 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqSELL)); 2234 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetPreallocation_C", MatSeqSELLSetPreallocation_SeqSELL)); 2235 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqaij_C", MatConvert_SeqSELL_SeqAIJ)); 2236 #if defined(PETSC_HAVE_CUDA) 2237 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqsellcuda_C", MatConvert_SeqSELL_SeqSELLCUDA)); 2238 #endif 2239 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetFillRatio_C", MatSeqSELLGetFillRatio_SeqSELL)); 2240 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetMaxSliceWidth_C", MatSeqSELLGetMaxSliceWidth_SeqSELL)); 2241 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetAvgSliceWidth_C", MatSeqSELLGetAvgSliceWidth_SeqSELL)); 2242 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetVarSliceSize_C", MatSeqSELLGetVarSliceSize_SeqSELL)); 2243 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetSliceHeight_C", MatSeqSELLSetSliceHeight_SeqSELL)); 2244 2245 PetscObjectOptionsBegin((PetscObject)B); 2246 { 2247 PetscInt newsh = -1; 2248 PetscBool flg; 2249 #if defined(PETSC_HAVE_CUDA) 2250 PetscInt chunksize = 0; 2251 #endif 2252 2253 PetscCall(PetscOptionsInt("-mat_sell_slice_height", "Set the slice height used to store SELL matrix", "MatSELLSetSliceHeight", newsh, &newsh, &flg)); 2254 if (flg) { PetscCall(MatSeqSELLSetSliceHeight(B, newsh)); } 2255 #if defined(PETSC_HAVE_CUDA) 2256 PetscCall(PetscOptionsInt("-mat_sell_chunk_size", "Set the chunksize for load-balanced CUDA kernels. Choices include 64,128,256,512,1024", NULL, chunksize, &chunksize, &flg)); 2257 if (flg) { 2258 PetscCheck(chunksize >= 64 && chunksize <= 1024 && chunksize % 64 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "chunksize must be a number in {64,128,256,512,1024}: value %" PetscInt_FMT, chunksize); 2259 b->chunksize = chunksize; 2260 } 2261 #endif 2262 } 2263 PetscOptionsEnd(); 2264 PetscFunctionReturn(PETSC_SUCCESS); 2265 } 2266 2267 /* 2268 Given a matrix generated with MatGetFactor() duplicates all the information in A into B 2269 */ 2270 PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace) 2271 { 2272 Mat_SeqSELL *c = (Mat_SeqSELL *)C->data, *a = (Mat_SeqSELL *)A->data; 2273 PetscInt i, m = A->rmap->n; 2274 PetscInt totalslices = a->totalslices; 2275 2276 PetscFunctionBegin; 2277 C->factortype = A->factortype; 2278 c->row = NULL; 2279 c->col = NULL; 2280 c->icol = NULL; 2281 c->reallocs = 0; 2282 C->assembled = PETSC_TRUE; 2283 2284 PetscCall(PetscLayoutReference(A->rmap, &C->rmap)); 2285 PetscCall(PetscLayoutReference(A->cmap, &C->cmap)); 2286 2287 PetscCall(PetscMalloc1(a->sliceheight * totalslices, &c->rlen)); 2288 PetscCall(PetscMalloc1(totalslices + 1, &c->sliidx)); 2289 2290 for (i = 0; i < m; i++) c->rlen[i] = a->rlen[i]; 2291 for (i = 0; i < totalslices + 1; i++) c->sliidx[i] = a->sliidx[i]; 2292 2293 /* allocate the matrix space */ 2294 if (mallocmatspace) { 2295 PetscCall(PetscMalloc2(a->maxallocmat, &c->val, a->maxallocmat, &c->colidx)); 2296 2297 c->singlemalloc = PETSC_TRUE; 2298 2299 if (m > 0) { 2300 PetscCall(PetscArraycpy(c->colidx, a->colidx, a->maxallocmat)); 2301 if (cpvalues == MAT_COPY_VALUES) { 2302 PetscCall(PetscArraycpy(c->val, a->val, a->maxallocmat)); 2303 } else { 2304 PetscCall(PetscArrayzero(c->val, a->maxallocmat)); 2305 } 2306 } 2307 } 2308 2309 c->ignorezeroentries = a->ignorezeroentries; 2310 c->roworiented = a->roworiented; 2311 c->nonew = a->nonew; 2312 if (a->diag) { 2313 PetscCall(PetscMalloc1(m, &c->diag)); 2314 for (i = 0; i < m; i++) c->diag[i] = a->diag[i]; 2315 } else c->diag = NULL; 2316 2317 c->solve_work = NULL; 2318 c->saved_values = NULL; 2319 c->idiag = NULL; 2320 c->ssor_work = NULL; 2321 c->keepnonzeropattern = a->keepnonzeropattern; 2322 c->free_val = PETSC_TRUE; 2323 c->free_colidx = PETSC_TRUE; 2324 2325 c->maxallocmat = a->maxallocmat; 2326 c->maxallocrow = a->maxallocrow; 2327 c->rlenmax = a->rlenmax; 2328 c->nz = a->nz; 2329 C->preallocated = PETSC_TRUE; 2330 2331 c->nonzerorowcnt = a->nonzerorowcnt; 2332 C->nonzerostate = A->nonzerostate; 2333 2334 PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist)); 2335 PetscFunctionReturn(PETSC_SUCCESS); 2336 } 2337 2338 PetscErrorCode MatDuplicate_SeqSELL(Mat A, MatDuplicateOption cpvalues, Mat *B) 2339 { 2340 PetscFunctionBegin; 2341 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 2342 PetscCall(MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n)); 2343 if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 2344 PetscCall(MatSetType(*B, ((PetscObject)A)->type_name)); 2345 PetscCall(MatDuplicateNoCreate_SeqSELL(*B, A, cpvalues, PETSC_TRUE)); 2346 PetscFunctionReturn(PETSC_SUCCESS); 2347 } 2348 2349 /*MC 2350 MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices, 2351 based on the sliced Ellpack format 2352 2353 Options Database Key: 2354 . -mat_type seqsell - sets the matrix type to "`MATSEQELL` during a call to `MatSetFromOptions()` 2355 2356 Level: beginner 2357 2358 .seealso: `Mat`, `MatCreateSeqSell()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ` 2359 M*/ 2360 2361 /*MC 2362 MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices. 2363 2364 This matrix type is identical to `MATSEQSELL` when constructed with a single process communicator, 2365 and `MATMPISELL` otherwise. As a result, for single process communicators, 2366 `MatSeqSELLSetPreallocation()` is supported, and similarly `MatMPISELLSetPreallocation()` is supported 2367 for communicators controlling multiple processes. It is recommended that you call both of 2368 the above preallocation routines for simplicity. 2369 2370 Options Database Key: 2371 . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions() 2372 2373 Level: beginner 2374 2375 Notes: 2376 This format is only supported for real scalars, double precision, and 32-bit indices (the defaults). 2377 2378 It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of 2379 non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement. 2380 2381 Developer Notes: 2382 On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance. 2383 2384 The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8 2385 .vb 2386 (2 0 3 4) 2387 Consider the matrix A = (5 0 6 0) 2388 (0 0 7 8) 2389 (0 0 9 9) 2390 2391 symbolically the Ellpack format can be written as 2392 2393 (2 3 4 |) (0 2 3 |) 2394 v = (5 6 0 |) colidx = (0 2 2 |) 2395 -------- --------- 2396 (7 8 |) (2 3 |) 2397 (9 9 |) (2 3 |) 2398 2399 The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case). 2400 Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and 2401 zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice. 2402 2403 The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9) and for colidx is (0 0 2 2 3 2 2 2 3 3) 2404 2405 .ve 2406 2407 See MatMult_SeqSELL() for how this format is used with the SIMD operations to achieve high performance. 2408 2409 References: 2410 . * - Hong Zhang, Richard T. Mills, Karl Rupp, and Barry F. Smith, Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}, 2411 Proceedings of the 47th International Conference on Parallel Processing, 2018. 2412 2413 .seealso: `Mat`, `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSell()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ` 2414 M*/ 2415 2416 /*@C 2417 MatCreateSeqSELL - Creates a sparse matrix in `MATSEQSELL` format. 2418 2419 Collective 2420 2421 Input Parameters: 2422 + comm - MPI communicator, set to `PETSC_COMM_SELF` 2423 . m - number of rows 2424 . n - number of columns 2425 . rlenmax - maximum number of nonzeros in a row, ignored if `rlen` is provided 2426 - rlen - array containing the number of nonzeros in the various rows (possibly different for each row) or NULL 2427 2428 Output Parameter: 2429 . A - the matrix 2430 2431 Level: intermediate 2432 2433 Notes: 2434 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 2435 MatXXXXSetPreallocation() paradigm instead of this routine directly. 2436 [MatXXXXSetPreallocation() is, for example, `MatSeqSELLSetPreallocation()`] 2437 2438 Specify the preallocated storage with either `rlenmax` or `rlen` (not both). 2439 Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory 2440 allocation. 2441 2442 .seealso: `Mat`, `MATSEQSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATSEQSELL`, `MATMPISELL` 2443 @*/ 2444 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt rlenmax, const PetscInt rlen[], Mat *A) 2445 { 2446 PetscFunctionBegin; 2447 PetscCall(MatCreate(comm, A)); 2448 PetscCall(MatSetSizes(*A, m, n, m, n)); 2449 PetscCall(MatSetType(*A, MATSEQSELL)); 2450 PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A, rlenmax, rlen)); 2451 PetscFunctionReturn(PETSC_SUCCESS); 2452 } 2453 2454 PetscErrorCode MatEqual_SeqSELL(Mat A, Mat B, PetscBool *flg) 2455 { 2456 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data, *b = (Mat_SeqSELL *)B->data; 2457 PetscInt totalslices = a->totalslices; 2458 2459 PetscFunctionBegin; 2460 /* If the matrix dimensions are not equal,or no of nonzeros */ 2461 if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) || (a->nz != b->nz) || (a->rlenmax != b->rlenmax)) { 2462 *flg = PETSC_FALSE; 2463 PetscFunctionReturn(PETSC_SUCCESS); 2464 } 2465 /* if the a->colidx are the same */ 2466 PetscCall(PetscArraycmp(a->colidx, b->colidx, a->sliidx[totalslices], flg)); 2467 if (!*flg) PetscFunctionReturn(PETSC_SUCCESS); 2468 /* if a->val are the same */ 2469 PetscCall(PetscArraycmp(a->val, b->val, a->sliidx[totalslices], flg)); 2470 PetscFunctionReturn(PETSC_SUCCESS); 2471 } 2472 2473 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A) 2474 { 2475 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 2476 2477 PetscFunctionBegin; 2478 a->idiagvalid = PETSC_FALSE; 2479 PetscFunctionReturn(PETSC_SUCCESS); 2480 } 2481 2482 PetscErrorCode MatConjugate_SeqSELL(Mat A) 2483 { 2484 #if defined(PETSC_USE_COMPLEX) 2485 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 2486 PetscInt i; 2487 PetscScalar *val = a->val; 2488 2489 PetscFunctionBegin; 2490 for (i = 0; i < a->sliidx[a->totalslices]; i++) { val[i] = PetscConj(val[i]); } 2491 #if defined(PETSC_HAVE_CUDA) 2492 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU; 2493 #endif 2494 #else 2495 PetscFunctionBegin; 2496 #endif 2497 PetscFunctionReturn(PETSC_SUCCESS); 2498 } 2499