1 2 /* 3 Defines the basic matrix operations for the SELL matrix storage format. 4 */ 5 #include <../src/mat/impls/sell/seq/sell.h> /*I "petscmat.h" I*/ 6 #include <petscblaslapack.h> 7 #include <petsc/private/kernels/blocktranspose.h> 8 9 static PetscBool cited = PETSC_FALSE; 10 static const char citation[] = 11 "@inproceedings{ZhangELLPACK2018,\n" 12 " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n" 13 " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n" 14 " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n" 15 " year = 2018\n" 16 "}\n"; 17 18 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 19 20 #include <immintrin.h> 21 22 #if !defined(_MM_SCALE_8) 23 #define _MM_SCALE_8 8 24 #endif 25 26 #if defined(__AVX512F__) 27 /* these do not work 28 vec_idx = _mm512_loadunpackhi_epi32(vec_idx,acolidx); 29 vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval); 30 */ 31 #define AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y) \ 32 /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \ 33 vec_idx = _mm256_loadu_si256((__m256i const*)acolidx); \ 34 vec_vals = _mm512_loadu_pd(aval); \ 35 vec_x = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8); \ 36 vec_y = _mm512_fmadd_pd(vec_x,vec_vals,vec_y) 37 #elif defined(__AVX2__) && defined(__FMA__) 38 #define AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y) \ 39 vec_vals = _mm256_loadu_pd(aval); \ 40 vec_idx = _mm_loadu_si128((__m128i const*)acolidx); /* SSE2 */ \ 41 vec_x = _mm256_i32gather_pd(x,vec_idx,_MM_SCALE_8); \ 42 vec_y = _mm256_fmadd_pd(vec_x,vec_vals,vec_y) 43 #endif 44 #endif /* PETSC_HAVE_IMMINTRIN_H */ 45 46 /*@C 47 MatSeqSELLSetPreallocation - For good matrix assembly performance 48 the user should preallocate the matrix storage by setting the parameter nz 49 (or the array nnz). By setting these parameters accurately, performance 50 during matrix assembly can be increased significantly. 51 52 Collective 53 54 Input Parameters: 55 + B - The matrix 56 . nz - number of nonzeros per row (same for all rows) 57 - nnz - array containing the number of nonzeros in the various rows 58 (possibly different for each row) or NULL 59 60 Notes: 61 If nnz is given then nz is ignored. 62 63 Specify the preallocated storage with either nz or nnz (not both). 64 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 65 allocation. For large problems you MUST preallocate memory or you 66 will get TERRIBLE performance, see the users' manual chapter on matrices. 67 68 You can call MatGetInfo() to get information on how effective the preallocation was; 69 for example the fields mallocs,nz_allocated,nz_used,nz_unneeded; 70 You can also run with the option -info and look for messages with the string 71 malloc in them to see if additional memory allocation was needed. 72 73 Developers: Use nz of MAT_SKIP_ALLOCATION to not allocate any space for the matrix 74 entries or columns indices. 75 76 The maximum number of nonzeos in any row should be as accurate as possible. 77 If it is underestimated, you will get bad performance due to reallocation 78 (MatSeqXSELLReallocateSELL). 79 80 Level: intermediate 81 82 .seealso: `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()` 83 84 @*/ 85 PetscErrorCode MatSeqSELLSetPreallocation(Mat B,PetscInt rlenmax,const PetscInt rlen[]) 86 { 87 PetscFunctionBegin; 88 PetscValidHeaderSpecific(B,MAT_CLASSID,1); 89 PetscValidType(B,1); 90 PetscTryMethod(B,"MatSeqSELLSetPreallocation_C",(Mat,PetscInt,const PetscInt[]),(B,rlenmax,rlen)); 91 PetscFunctionReturn(0); 92 } 93 94 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B,PetscInt maxallocrow,const PetscInt rlen[]) 95 { 96 Mat_SeqSELL *b; 97 PetscInt i,j,totalslices; 98 PetscBool skipallocation=PETSC_FALSE,realalloc=PETSC_FALSE; 99 100 PetscFunctionBegin; 101 if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE; 102 if (maxallocrow == MAT_SKIP_ALLOCATION) { 103 skipallocation = PETSC_TRUE; 104 maxallocrow = 0; 105 } 106 107 PetscCall(PetscLayoutSetUp(B->rmap)); 108 PetscCall(PetscLayoutSetUp(B->cmap)); 109 110 /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */ 111 if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5; 112 PetscCheck(maxallocrow >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"maxallocrow cannot be less than 0: value %" PetscInt_FMT,maxallocrow); 113 if (rlen) { 114 for (i=0; i<B->rmap->n; i++) { 115 PetscCheck(rlen[i] >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,rlen[i]); 116 PetscCheck(rlen[i] <= B->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT,i,rlen[i],B->cmap->n); 117 } 118 } 119 120 B->preallocated = PETSC_TRUE; 121 122 b = (Mat_SeqSELL*)B->data; 123 124 totalslices = PetscCeilInt(B->rmap->n,8); 125 b->totalslices = totalslices; 126 if (!skipallocation) { 127 if (B->rmap->n & 0x07) PetscCall(PetscInfo(B,"Padding rows to the SEQSELL matrix because the number of rows is not the multiple of 8 (value %" PetscInt_FMT ")\n",B->rmap->n)); 128 129 if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */ 130 PetscCall(PetscMalloc1(totalslices+1,&b->sliidx)); 131 PetscCall(PetscLogObjectMemory((PetscObject)B,(totalslices+1)*sizeof(PetscInt))); 132 } 133 if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */ 134 if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10; 135 else if (maxallocrow < 0) maxallocrow = 1; 136 for (i=0; i<=totalslices; i++) b->sliidx[i] = i*8*maxallocrow; 137 } else { 138 maxallocrow = 0; 139 b->sliidx[0] = 0; 140 for (i=1; i<totalslices; i++) { 141 b->sliidx[i] = 0; 142 for (j=0;j<8;j++) { 143 b->sliidx[i] = PetscMax(b->sliidx[i],rlen[8*(i-1)+j]); 144 } 145 maxallocrow = PetscMax(b->sliidx[i],maxallocrow); 146 PetscCall(PetscIntSumError(b->sliidx[i-1],8*b->sliidx[i],&b->sliidx[i])); 147 } 148 /* last slice */ 149 b->sliidx[totalslices] = 0; 150 for (j=(totalslices-1)*8;j<B->rmap->n;j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices],rlen[j]); 151 maxallocrow = PetscMax(b->sliidx[totalslices],maxallocrow); 152 b->sliidx[totalslices] = b->sliidx[totalslices-1] + 8*b->sliidx[totalslices]; 153 } 154 155 /* allocate space for val, colidx, rlen */ 156 /* FIXME: should B's old memory be unlogged? */ 157 PetscCall(MatSeqXSELLFreeSELL(B,&b->val,&b->colidx)); 158 /* FIXME: assuming an element of the bit array takes 8 bits */ 159 PetscCall(PetscMalloc2(b->sliidx[totalslices],&b->val,b->sliidx[totalslices],&b->colidx)); 160 PetscCall(PetscLogObjectMemory((PetscObject)B,b->sliidx[totalslices]*(sizeof(PetscScalar)+sizeof(PetscInt)))); 161 /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */ 162 PetscCall(PetscCalloc1(8*totalslices,&b->rlen)); 163 PetscCall(PetscLogObjectMemory((PetscObject)B,8*totalslices*sizeof(PetscInt))); 164 165 b->singlemalloc = PETSC_TRUE; 166 b->free_val = PETSC_TRUE; 167 b->free_colidx = PETSC_TRUE; 168 } else { 169 b->free_val = PETSC_FALSE; 170 b->free_colidx = PETSC_FALSE; 171 } 172 173 b->nz = 0; 174 b->maxallocrow = maxallocrow; 175 b->rlenmax = maxallocrow; 176 b->maxallocmat = b->sliidx[totalslices]; 177 B->info.nz_unneeded = (double)b->maxallocmat; 178 if (realalloc) { 179 PetscCall(MatSetOption(B,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE)); 180 } 181 PetscFunctionReturn(0); 182 } 183 184 PetscErrorCode MatGetRow_SeqSELL(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v) 185 { 186 Mat_SeqSELL *a = (Mat_SeqSELL*)A->data; 187 PetscInt shift; 188 189 PetscFunctionBegin; 190 PetscCheck(row >= 0 && row < A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row %" PetscInt_FMT " out of range",row); 191 if (nz) *nz = a->rlen[row]; 192 shift = a->sliidx[row>>3]+(row&0x07); 193 if (!a->getrowcols) { 194 195 PetscCall(PetscMalloc2(a->rlenmax,&a->getrowcols,a->rlenmax,&a->getrowvals)); 196 } 197 if (idx) { 198 PetscInt j; 199 for (j=0; j<a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift+8*j]; 200 *idx = a->getrowcols; 201 } 202 if (v) { 203 PetscInt j; 204 for (j=0; j<a->rlen[row]; j++) a->getrowvals[j] = a->val[shift+8*j]; 205 *v = a->getrowvals; 206 } 207 PetscFunctionReturn(0); 208 } 209 210 PetscErrorCode MatRestoreRow_SeqSELL(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v) 211 { 212 PetscFunctionBegin; 213 PetscFunctionReturn(0); 214 } 215 216 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype,MatReuse reuse,Mat *newmat) 217 { 218 Mat B; 219 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 220 PetscInt i; 221 222 PetscFunctionBegin; 223 if (reuse == MAT_REUSE_MATRIX) { 224 B = *newmat; 225 PetscCall(MatZeroEntries(B)); 226 } else { 227 PetscCall(MatCreate(PetscObjectComm((PetscObject)A),&B)); 228 PetscCall(MatSetSizes(B,A->rmap->n,A->cmap->n,A->rmap->N,A->cmap->N)); 229 PetscCall(MatSetType(B,MATSEQAIJ)); 230 PetscCall(MatSeqAIJSetPreallocation(B,0,a->rlen)); 231 } 232 233 for (i=0; i<A->rmap->n; i++) { 234 PetscInt nz = 0,*cols = NULL; 235 PetscScalar *vals = NULL; 236 237 PetscCall(MatGetRow_SeqSELL(A,i,&nz,&cols,&vals)); 238 PetscCall(MatSetValues(B,1,&i,nz,cols,vals,INSERT_VALUES)); 239 PetscCall(MatRestoreRow_SeqSELL(A,i,&nz,&cols,&vals)); 240 } 241 242 PetscCall(MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY)); 243 PetscCall(MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY)); 244 B->rmap->bs = A->rmap->bs; 245 246 if (reuse == MAT_INPLACE_MATRIX) { 247 PetscCall(MatHeaderReplace(A,&B)); 248 } else { 249 *newmat = B; 250 } 251 PetscFunctionReturn(0); 252 } 253 254 #include <../src/mat/impls/aij/seq/aij.h> 255 256 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A,MatType newtype,MatReuse reuse,Mat *newmat) 257 { 258 Mat B; 259 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 260 PetscInt *ai=a->i,m=A->rmap->N,n=A->cmap->N,i,*rowlengths,row,ncols; 261 const PetscInt *cols; 262 const PetscScalar *vals; 263 264 PetscFunctionBegin; 265 266 if (reuse == MAT_REUSE_MATRIX) { 267 B = *newmat; 268 } else { 269 if (PetscDefined(USE_DEBUG) || !a->ilen) { 270 PetscCall(PetscMalloc1(m,&rowlengths)); 271 for (i=0; i<m; i++) { 272 rowlengths[i] = ai[i+1] - ai[i]; 273 } 274 } 275 if (PetscDefined(USE_DEBUG) && a->ilen) { 276 PetscBool eq; 277 PetscCall(PetscMemcmp(rowlengths,a->ilen,m*sizeof(PetscInt),&eq)); 278 PetscCheck(eq,PETSC_COMM_SELF,PETSC_ERR_PLIB,"SeqAIJ ilen array incorrect"); 279 PetscCall(PetscFree(rowlengths)); 280 rowlengths = a->ilen; 281 } else if (a->ilen) rowlengths = a->ilen; 282 PetscCall(MatCreate(PetscObjectComm((PetscObject)A),&B)); 283 PetscCall(MatSetSizes(B,m,n,m,n)); 284 PetscCall(MatSetType(B,MATSEQSELL)); 285 PetscCall(MatSeqSELLSetPreallocation(B,0,rowlengths)); 286 if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths)); 287 } 288 289 for (row=0; row<m; row++) { 290 PetscCall(MatGetRow_SeqAIJ(A,row,&ncols,(PetscInt**)&cols,(PetscScalar**)&vals)); 291 PetscCall(MatSetValues_SeqSELL(B,1,&row,ncols,cols,vals,INSERT_VALUES)); 292 PetscCall(MatRestoreRow_SeqAIJ(A,row,&ncols,(PetscInt**)&cols,(PetscScalar**)&vals)); 293 } 294 PetscCall(MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY)); 295 PetscCall(MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY)); 296 B->rmap->bs = A->rmap->bs; 297 298 if (reuse == MAT_INPLACE_MATRIX) { 299 PetscCall(MatHeaderReplace(A,&B)); 300 } else { 301 *newmat = B; 302 } 303 PetscFunctionReturn(0); 304 } 305 306 PetscErrorCode MatMult_SeqSELL(Mat A,Vec xx,Vec yy) 307 { 308 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 309 PetscScalar *y; 310 const PetscScalar *x; 311 const MatScalar *aval=a->val; 312 PetscInt totalslices=a->totalslices; 313 const PetscInt *acolidx=a->colidx; 314 PetscInt i,j; 315 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 316 __m512d vec_x,vec_y,vec_vals; 317 __m256i vec_idx; 318 __mmask8 mask; 319 __m512d vec_x2,vec_y2,vec_vals2,vec_x3,vec_y3,vec_vals3,vec_x4,vec_y4,vec_vals4; 320 __m256i vec_idx2,vec_idx3,vec_idx4; 321 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 322 __m128i vec_idx; 323 __m256d vec_x,vec_y,vec_y2,vec_vals; 324 MatScalar yval; 325 PetscInt r,rows_left,row,nnz_in_row; 326 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 327 __m128d vec_x_tmp; 328 __m256d vec_x,vec_y,vec_y2,vec_vals; 329 MatScalar yval; 330 PetscInt r,rows_left,row,nnz_in_row; 331 #else 332 PetscScalar sum[8]; 333 #endif 334 335 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 336 #pragma disjoint(*x,*y,*aval) 337 #endif 338 339 PetscFunctionBegin; 340 PetscCall(VecGetArrayRead(xx,&x)); 341 PetscCall(VecGetArray(yy,&y)); 342 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 343 for (i=0; i<totalslices; i++) { /* loop over slices */ 344 PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 345 PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 346 347 vec_y = _mm512_setzero_pd(); 348 vec_y2 = _mm512_setzero_pd(); 349 vec_y3 = _mm512_setzero_pd(); 350 vec_y4 = _mm512_setzero_pd(); 351 352 j = a->sliidx[i]>>3; /* 8 bytes are read at each time, corresponding to a slice columnn */ 353 switch ((a->sliidx[i+1]-a->sliidx[i])/8 & 3) { 354 case 3: 355 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 356 acolidx += 8; aval += 8; 357 AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2); 358 acolidx += 8; aval += 8; 359 AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3); 360 acolidx += 8; aval += 8; 361 j += 3; 362 break; 363 case 2: 364 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 365 acolidx += 8; aval += 8; 366 AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2); 367 acolidx += 8; aval += 8; 368 j += 2; 369 break; 370 case 1: 371 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 372 acolidx += 8; aval += 8; 373 j += 1; 374 break; 375 } 376 #pragma novector 377 for (; j<(a->sliidx[i+1]>>3); j+=4) { 378 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 379 acolidx += 8; aval += 8; 380 AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2); 381 acolidx += 8; aval += 8; 382 AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3); 383 acolidx += 8; aval += 8; 384 AVX512_Mult_Private(vec_idx4,vec_x4,vec_vals4,vec_y4); 385 acolidx += 8; aval += 8; 386 } 387 388 vec_y = _mm512_add_pd(vec_y,vec_y2); 389 vec_y = _mm512_add_pd(vec_y,vec_y3); 390 vec_y = _mm512_add_pd(vec_y,vec_y4); 391 if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 392 mask = (__mmask8)(0xff >> (8-(A->rmap->n & 0x07))); 393 _mm512_mask_storeu_pd(&y[8*i],mask,vec_y); 394 } else { 395 _mm512_storeu_pd(&y[8*i],vec_y); 396 } 397 } 398 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 399 for (i=0; i<totalslices; i++) { /* loop over full slices */ 400 PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 401 PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 402 403 /* last slice may have padding rows. Don't use vectorization. */ 404 if (i == totalslices-1 && (A->rmap->n & 0x07)) { 405 rows_left = A->rmap->n - 8*i; 406 for (r=0; r<rows_left; ++r) { 407 yval = (MatScalar)0; 408 row = 8*i + r; 409 nnz_in_row = a->rlen[row]; 410 for (j=0; j<nnz_in_row; ++j) yval += aval[8*j+r] * x[acolidx[8*j+r]]; 411 y[row] = yval; 412 } 413 break; 414 } 415 416 vec_y = _mm256_setzero_pd(); 417 vec_y2 = _mm256_setzero_pd(); 418 419 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 420 #pragma novector 421 #pragma unroll(2) 422 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) { 423 AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 424 aval += 4; acolidx += 4; 425 AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y2); 426 aval += 4; acolidx += 4; 427 } 428 429 _mm256_storeu_pd(y+i*8,vec_y); 430 _mm256_storeu_pd(y+i*8+4,vec_y2); 431 } 432 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 433 for (i=0; i<totalslices; i++) { /* loop over full slices */ 434 PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 435 PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 436 437 vec_y = _mm256_setzero_pd(); 438 vec_y2 = _mm256_setzero_pd(); 439 440 /* last slice may have padding rows. Don't use vectorization. */ 441 if (i == totalslices-1 && (A->rmap->n & 0x07)) { 442 rows_left = A->rmap->n - 8*i; 443 for (r=0; r<rows_left; ++r) { 444 yval = (MatScalar)0; 445 row = 8*i + r; 446 nnz_in_row = a->rlen[row]; 447 for (j=0; j<nnz_in_row; ++j) yval += aval[8*j + r] * x[acolidx[8*j + r]]; 448 y[row] = yval; 449 } 450 break; 451 } 452 453 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 454 #pragma novector 455 #pragma unroll(2) 456 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) { 457 vec_vals = _mm256_loadu_pd(aval); 458 vec_x_tmp = _mm_setzero_pd(); 459 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 460 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 461 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,0); 462 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 463 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 464 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,1); 465 vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y); 466 aval += 4; 467 468 vec_vals = _mm256_loadu_pd(aval); 469 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 470 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 471 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,0); 472 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 473 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 474 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,1); 475 vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y2); 476 aval += 4; 477 } 478 479 _mm256_storeu_pd(y + i*8, vec_y); 480 _mm256_storeu_pd(y + i*8 + 4, vec_y2); 481 } 482 #else 483 for (i=0; i<totalslices; i++) { /* loop over slices */ 484 for (j=0; j<8; j++) sum[j] = 0.0; 485 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) { 486 sum[0] += aval[j] * x[acolidx[j]]; 487 sum[1] += aval[j+1] * x[acolidx[j+1]]; 488 sum[2] += aval[j+2] * x[acolidx[j+2]]; 489 sum[3] += aval[j+3] * x[acolidx[j+3]]; 490 sum[4] += aval[j+4] * x[acolidx[j+4]]; 491 sum[5] += aval[j+5] * x[acolidx[j+5]]; 492 sum[6] += aval[j+6] * x[acolidx[j+6]]; 493 sum[7] += aval[j+7] * x[acolidx[j+7]]; 494 } 495 if (i == totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */ 496 for (j=0; j<(A->rmap->n & 0x07); j++) y[8*i+j] = sum[j]; 497 } else { 498 for (j=0; j<8; j++) y[8*i+j] = sum[j]; 499 } 500 } 501 #endif 502 503 PetscCall(PetscLogFlops(2.0*a->nz-a->nonzerorowcnt)); /* theoretical minimal FLOPs */ 504 PetscCall(VecRestoreArrayRead(xx,&x)); 505 PetscCall(VecRestoreArray(yy,&y)); 506 PetscFunctionReturn(0); 507 } 508 509 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h> 510 PetscErrorCode MatMultAdd_SeqSELL(Mat A,Vec xx,Vec yy,Vec zz) 511 { 512 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 513 PetscScalar *y,*z; 514 const PetscScalar *x; 515 const MatScalar *aval=a->val; 516 PetscInt totalslices=a->totalslices; 517 const PetscInt *acolidx=a->colidx; 518 PetscInt i,j; 519 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 520 __m512d vec_x,vec_y,vec_vals; 521 __m256i vec_idx; 522 __mmask8 mask; 523 __m512d vec_x2,vec_y2,vec_vals2,vec_x3,vec_y3,vec_vals3,vec_x4,vec_y4,vec_vals4; 524 __m256i vec_idx2,vec_idx3,vec_idx4; 525 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 526 __m128d vec_x_tmp; 527 __m256d vec_x,vec_y,vec_y2,vec_vals; 528 MatScalar yval; 529 PetscInt r,row,nnz_in_row; 530 #else 531 PetscScalar sum[8]; 532 #endif 533 534 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 535 #pragma disjoint(*x,*y,*aval) 536 #endif 537 538 PetscFunctionBegin; 539 PetscCall(VecGetArrayRead(xx,&x)); 540 PetscCall(VecGetArrayPair(yy,zz,&y,&z)); 541 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 542 for (i=0; i<totalslices; i++) { /* loop over slices */ 543 PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 544 PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 545 546 if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 547 mask = (__mmask8)(0xff >> (8-(A->rmap->n & 0x07))); 548 vec_y = _mm512_mask_loadu_pd(vec_y,mask,&y[8*i]); 549 } else { 550 vec_y = _mm512_loadu_pd(&y[8*i]); 551 } 552 vec_y2 = _mm512_setzero_pd(); 553 vec_y3 = _mm512_setzero_pd(); 554 vec_y4 = _mm512_setzero_pd(); 555 556 j = a->sliidx[i]>>3; /* 8 bytes are read at each time, corresponding to a slice columnn */ 557 switch ((a->sliidx[i+1]-a->sliidx[i])/8 & 3) { 558 case 3: 559 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 560 acolidx += 8; aval += 8; 561 AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2); 562 acolidx += 8; aval += 8; 563 AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3); 564 acolidx += 8; aval += 8; 565 j += 3; 566 break; 567 case 2: 568 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 569 acolidx += 8; aval += 8; 570 AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2); 571 acolidx += 8; aval += 8; 572 j += 2; 573 break; 574 case 1: 575 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 576 acolidx += 8; aval += 8; 577 j += 1; 578 break; 579 } 580 #pragma novector 581 for (; j<(a->sliidx[i+1]>>3); j+=4) { 582 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 583 acolidx += 8; aval += 8; 584 AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2); 585 acolidx += 8; aval += 8; 586 AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3); 587 acolidx += 8; aval += 8; 588 AVX512_Mult_Private(vec_idx4,vec_x4,vec_vals4,vec_y4); 589 acolidx += 8; aval += 8; 590 } 591 592 vec_y = _mm512_add_pd(vec_y,vec_y2); 593 vec_y = _mm512_add_pd(vec_y,vec_y3); 594 vec_y = _mm512_add_pd(vec_y,vec_y4); 595 if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 596 _mm512_mask_storeu_pd(&z[8*i],mask,vec_y); 597 } else { 598 _mm512_storeu_pd(&z[8*i],vec_y); 599 } 600 } 601 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 602 for (i=0; i<totalslices; i++) { /* loop over full slices */ 603 PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 604 PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 605 606 /* last slice may have padding rows. Don't use vectorization. */ 607 if (i == totalslices-1 && (A->rmap->n & 0x07)) { 608 for (r=0; r<(A->rmap->n & 0x07); ++r) { 609 row = 8*i + r; 610 yval = (MatScalar)0.0; 611 nnz_in_row = a->rlen[row]; 612 for (j=0; j<nnz_in_row; ++j) yval += aval[8*j+r] * x[acolidx[8*j+r]]; 613 z[row] = y[row] + yval; 614 } 615 break; 616 } 617 618 vec_y = _mm256_loadu_pd(y+8*i); 619 vec_y2 = _mm256_loadu_pd(y+8*i+4); 620 621 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 622 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) { 623 vec_vals = _mm256_loadu_pd(aval); 624 vec_x_tmp = _mm_setzero_pd(); 625 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 626 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 627 vec_x = _mm256_setzero_pd(); 628 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,0); 629 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 630 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 631 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,1); 632 vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y); 633 aval += 4; 634 635 vec_vals = _mm256_loadu_pd(aval); 636 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 637 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 638 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,0); 639 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 640 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 641 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,1); 642 vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y2); 643 aval += 4; 644 } 645 646 _mm256_storeu_pd(z+i*8,vec_y); 647 _mm256_storeu_pd(z+i*8+4,vec_y2); 648 } 649 #else 650 for (i=0; i<totalslices; i++) { /* loop over slices */ 651 for (j=0; j<8; j++) sum[j] = 0.0; 652 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) { 653 sum[0] += aval[j] * x[acolidx[j]]; 654 sum[1] += aval[j+1] * x[acolidx[j+1]]; 655 sum[2] += aval[j+2] * x[acolidx[j+2]]; 656 sum[3] += aval[j+3] * x[acolidx[j+3]]; 657 sum[4] += aval[j+4] * x[acolidx[j+4]]; 658 sum[5] += aval[j+5] * x[acolidx[j+5]]; 659 sum[6] += aval[j+6] * x[acolidx[j+6]]; 660 sum[7] += aval[j+7] * x[acolidx[j+7]]; 661 } 662 if (i == totalslices-1 && (A->rmap->n & 0x07)) { 663 for (j=0; j<(A->rmap->n & 0x07); j++) z[8*i+j] = y[8*i+j] + sum[j]; 664 } else { 665 for (j=0; j<8; j++) z[8*i+j] = y[8*i+j] + sum[j]; 666 } 667 } 668 #endif 669 670 PetscCall(PetscLogFlops(2.0*a->nz)); 671 PetscCall(VecRestoreArrayRead(xx,&x)); 672 PetscCall(VecRestoreArrayPair(yy,zz,&y,&z)); 673 PetscFunctionReturn(0); 674 } 675 676 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A,Vec xx,Vec zz,Vec yy) 677 { 678 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 679 PetscScalar *y; 680 const PetscScalar *x; 681 const MatScalar *aval=a->val; 682 const PetscInt *acolidx=a->colidx; 683 PetscInt i,j,r,row,nnz_in_row,totalslices=a->totalslices; 684 685 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 686 #pragma disjoint(*x,*y,*aval) 687 #endif 688 689 PetscFunctionBegin; 690 if (A->symmetric) { 691 PetscCall(MatMultAdd_SeqSELL(A,xx,zz,yy)); 692 PetscFunctionReturn(0); 693 } 694 if (zz != yy) PetscCall(VecCopy(zz,yy)); 695 PetscCall(VecGetArrayRead(xx,&x)); 696 PetscCall(VecGetArray(yy,&y)); 697 for (i=0; i<a->totalslices; i++) { /* loop over slices */ 698 if (i == totalslices-1 && (A->rmap->n & 0x07)) { 699 for (r=0; r<(A->rmap->n & 0x07); ++r) { 700 row = 8*i + r; 701 nnz_in_row = a->rlen[row]; 702 for (j=0; j<nnz_in_row; ++j) y[acolidx[8*j+r]] += aval[8*j+r] * x[row]; 703 } 704 break; 705 } 706 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) { 707 y[acolidx[j]] += aval[j] * x[8*i]; 708 y[acolidx[j+1]] += aval[j+1] * x[8*i+1]; 709 y[acolidx[j+2]] += aval[j+2] * x[8*i+2]; 710 y[acolidx[j+3]] += aval[j+3] * x[8*i+3]; 711 y[acolidx[j+4]] += aval[j+4] * x[8*i+4]; 712 y[acolidx[j+5]] += aval[j+5] * x[8*i+5]; 713 y[acolidx[j+6]] += aval[j+6] * x[8*i+6]; 714 y[acolidx[j+7]] += aval[j+7] * x[8*i+7]; 715 } 716 } 717 PetscCall(PetscLogFlops(2.0*a->sliidx[a->totalslices])); 718 PetscCall(VecRestoreArrayRead(xx,&x)); 719 PetscCall(VecRestoreArray(yy,&y)); 720 PetscFunctionReturn(0); 721 } 722 723 PetscErrorCode MatMultTranspose_SeqSELL(Mat A,Vec xx,Vec yy) 724 { 725 PetscFunctionBegin; 726 if (A->symmetric) { 727 PetscCall(MatMult_SeqSELL(A,xx,yy)); 728 } else { 729 PetscCall(VecSet(yy,0.0)); 730 PetscCall(MatMultTransposeAdd_SeqSELL(A,xx,yy,yy)); 731 } 732 PetscFunctionReturn(0); 733 } 734 735 /* 736 Checks for missing diagonals 737 */ 738 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A,PetscBool *missing,PetscInt *d) 739 { 740 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 741 PetscInt *diag,i; 742 743 PetscFunctionBegin; 744 *missing = PETSC_FALSE; 745 if (A->rmap->n > 0 && !(a->colidx)) { 746 *missing = PETSC_TRUE; 747 if (d) *d = 0; 748 PetscCall(PetscInfo(A,"Matrix has no entries therefore is missing diagonal\n")); 749 } else { 750 diag = a->diag; 751 for (i=0; i<A->rmap->n; i++) { 752 if (diag[i] == -1) { 753 *missing = PETSC_TRUE; 754 if (d) *d = i; 755 PetscCall(PetscInfo(A,"Matrix is missing diagonal number %" PetscInt_FMT "\n",i)); 756 break; 757 } 758 } 759 } 760 PetscFunctionReturn(0); 761 } 762 763 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A) 764 { 765 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 766 PetscInt i,j,m=A->rmap->n,shift; 767 768 PetscFunctionBegin; 769 if (!a->diag) { 770 PetscCall(PetscMalloc1(m,&a->diag)); 771 PetscCall(PetscLogObjectMemory((PetscObject)A,m*sizeof(PetscInt))); 772 a->free_diag = PETSC_TRUE; 773 } 774 for (i=0; i<m; i++) { /* loop over rows */ 775 shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */ 776 a->diag[i] = -1; 777 for (j=0; j<a->rlen[i]; j++) { 778 if (a->colidx[shift+j*8] == i) { 779 a->diag[i] = shift+j*8; 780 break; 781 } 782 } 783 } 784 PetscFunctionReturn(0); 785 } 786 787 /* 788 Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways 789 */ 790 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A,PetscScalar omega,PetscScalar fshift) 791 { 792 Mat_SeqSELL *a=(Mat_SeqSELL*) A->data; 793 PetscInt i,*diag,m = A->rmap->n; 794 MatScalar *val = a->val; 795 PetscScalar *idiag,*mdiag; 796 797 PetscFunctionBegin; 798 if (a->idiagvalid) PetscFunctionReturn(0); 799 PetscCall(MatMarkDiagonal_SeqSELL(A)); 800 diag = a->diag; 801 if (!a->idiag) { 802 PetscCall(PetscMalloc3(m,&a->idiag,m,&a->mdiag,m,&a->ssor_work)); 803 PetscCall(PetscLogObjectMemory((PetscObject)A, 3*m*sizeof(PetscScalar))); 804 val = a->val; 805 } 806 mdiag = a->mdiag; 807 idiag = a->idiag; 808 809 if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) { 810 for (i=0; i<m; i++) { 811 mdiag[i] = val[diag[i]]; 812 if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */ 813 if (PetscRealPart(fshift)) { 814 PetscCall(PetscInfo(A,"Zero diagonal on row %" PetscInt_FMT "\n",i)); 815 A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 816 A->factorerror_zeropivot_value = 0.0; 817 A->factorerror_zeropivot_row = i; 818 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Zero diagonal on row %" PetscInt_FMT,i); 819 } 820 idiag[i] = 1.0/val[diag[i]]; 821 } 822 PetscCall(PetscLogFlops(m)); 823 } else { 824 for (i=0; i<m; i++) { 825 mdiag[i] = val[diag[i]]; 826 idiag[i] = omega/(fshift + val[diag[i]]); 827 } 828 PetscCall(PetscLogFlops(2.0*m)); 829 } 830 a->idiagvalid = PETSC_TRUE; 831 PetscFunctionReturn(0); 832 } 833 834 PetscErrorCode MatZeroEntries_SeqSELL(Mat A) 835 { 836 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 837 838 PetscFunctionBegin; 839 PetscCall(PetscArrayzero(a->val,a->sliidx[a->totalslices])); 840 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 841 PetscFunctionReturn(0); 842 } 843 844 PetscErrorCode MatDestroy_SeqSELL(Mat A) 845 { 846 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 847 848 PetscFunctionBegin; 849 #if defined(PETSC_USE_LOG) 850 PetscLogObjectState((PetscObject)A,"Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT,A->rmap->n,A->cmap->n,a->nz); 851 #endif 852 PetscCall(MatSeqXSELLFreeSELL(A,&a->val,&a->colidx)); 853 PetscCall(ISDestroy(&a->row)); 854 PetscCall(ISDestroy(&a->col)); 855 PetscCall(PetscFree(a->diag)); 856 PetscCall(PetscFree(a->rlen)); 857 PetscCall(PetscFree(a->sliidx)); 858 PetscCall(PetscFree3(a->idiag,a->mdiag,a->ssor_work)); 859 PetscCall(PetscFree(a->solve_work)); 860 PetscCall(ISDestroy(&a->icol)); 861 PetscCall(PetscFree(a->saved_values)); 862 PetscCall(PetscFree2(a->getrowcols,a->getrowvals)); 863 864 PetscCall(PetscFree(A->data)); 865 866 PetscCall(PetscObjectChangeTypeName((PetscObject)A,NULL)); 867 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatStoreValues_C",NULL)); 868 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatRetrieveValues_C",NULL)); 869 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqSELLSetPreallocation_C",NULL)); 870 PetscFunctionReturn(0); 871 } 872 873 PetscErrorCode MatSetOption_SeqSELL(Mat A,MatOption op,PetscBool flg) 874 { 875 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 876 877 PetscFunctionBegin; 878 switch (op) { 879 case MAT_ROW_ORIENTED: 880 a->roworiented = flg; 881 break; 882 case MAT_KEEP_NONZERO_PATTERN: 883 a->keepnonzeropattern = flg; 884 break; 885 case MAT_NEW_NONZERO_LOCATIONS: 886 a->nonew = (flg ? 0 : 1); 887 break; 888 case MAT_NEW_NONZERO_LOCATION_ERR: 889 a->nonew = (flg ? -1 : 0); 890 break; 891 case MAT_NEW_NONZERO_ALLOCATION_ERR: 892 a->nonew = (flg ? -2 : 0); 893 break; 894 case MAT_UNUSED_NONZERO_LOCATION_ERR: 895 a->nounused = (flg ? -1 : 0); 896 break; 897 case MAT_FORCE_DIAGONAL_ENTRIES: 898 case MAT_IGNORE_OFF_PROC_ENTRIES: 899 case MAT_USE_HASH_TABLE: 900 case MAT_SORTED_FULL: 901 PetscCall(PetscInfo(A,"Option %s ignored\n",MatOptions[op])); 902 break; 903 case MAT_SPD: 904 case MAT_SYMMETRIC: 905 case MAT_STRUCTURALLY_SYMMETRIC: 906 case MAT_HERMITIAN: 907 case MAT_SYMMETRY_ETERNAL: 908 /* These options are handled directly by MatSetOption() */ 909 break; 910 default: 911 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unknown option %d",op); 912 } 913 PetscFunctionReturn(0); 914 } 915 916 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A,Vec v) 917 { 918 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 919 PetscInt i,j,n,shift; 920 PetscScalar *x,zero=0.0; 921 922 PetscFunctionBegin; 923 PetscCall(VecGetLocalSize(v,&n)); 924 PetscCheck(n == A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 925 926 if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) { 927 PetscInt *diag=a->diag; 928 PetscCall(VecGetArray(v,&x)); 929 for (i=0; i<n; i++) x[i] = 1.0/a->val[diag[i]]; 930 PetscCall(VecRestoreArray(v,&x)); 931 PetscFunctionReturn(0); 932 } 933 934 PetscCall(VecSet(v,zero)); 935 PetscCall(VecGetArray(v,&x)); 936 for (i=0; i<n; i++) { /* loop over rows */ 937 shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */ 938 x[i] = 0; 939 for (j=0; j<a->rlen[i]; j++) { 940 if (a->colidx[shift+j*8] == i) { 941 x[i] = a->val[shift+j*8]; 942 break; 943 } 944 } 945 } 946 PetscCall(VecRestoreArray(v,&x)); 947 PetscFunctionReturn(0); 948 } 949 950 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A,Vec ll,Vec rr) 951 { 952 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 953 const PetscScalar *l,*r; 954 PetscInt i,j,m,n,row; 955 956 PetscFunctionBegin; 957 if (ll) { 958 /* The local size is used so that VecMPI can be passed to this routine 959 by MatDiagonalScale_MPISELL */ 960 PetscCall(VecGetLocalSize(ll,&m)); 961 PetscCheck(m == A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 962 PetscCall(VecGetArrayRead(ll,&l)); 963 for (i=0; i<a->totalslices; i++) { /* loop over slices */ 964 if (i == a->totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */ 965 for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) { 966 if (row < (A->rmap->n & 0x07)) a->val[j] *= l[8*i+row]; 967 } 968 } else { 969 for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) { 970 a->val[j] *= l[8*i+row]; 971 } 972 } 973 } 974 PetscCall(VecRestoreArrayRead(ll,&l)); 975 PetscCall(PetscLogFlops(a->nz)); 976 } 977 if (rr) { 978 PetscCall(VecGetLocalSize(rr,&n)); 979 PetscCheck(n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length"); 980 PetscCall(VecGetArrayRead(rr,&r)); 981 for (i=0; i<a->totalslices; i++) { /* loop over slices */ 982 if (i == a->totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */ 983 for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) { 984 if (row < (A->rmap->n & 0x07)) a->val[j] *= r[a->colidx[j]]; 985 } 986 } else { 987 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j++) { 988 a->val[j] *= r[a->colidx[j]]; 989 } 990 } 991 } 992 PetscCall(VecRestoreArrayRead(rr,&r)); 993 PetscCall(PetscLogFlops(a->nz)); 994 } 995 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 996 PetscFunctionReturn(0); 997 } 998 999 PetscErrorCode MatGetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],PetscScalar v[]) 1000 { 1001 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1002 PetscInt *cp,i,k,low,high,t,row,col,l; 1003 PetscInt shift; 1004 MatScalar *vp; 1005 1006 PetscFunctionBegin; 1007 for (k=0; k<m; k++) { /* loop over requested rows */ 1008 row = im[k]; 1009 if (row<0) continue; 1010 PetscCheck(row < A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT,row,A->rmap->n-1); 1011 shift = a->sliidx[row>>3]+(row&0x07); /* starting index of the row */ 1012 cp = a->colidx+shift; /* pointer to the row */ 1013 vp = a->val+shift; /* pointer to the row */ 1014 for (l=0; l<n; l++) { /* loop over requested columns */ 1015 col = in[l]; 1016 if (col<0) continue; 1017 PetscCheck(col < A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT,col,A->cmap->n-1); 1018 high = a->rlen[row]; low = 0; /* assume unsorted */ 1019 while (high-low > 5) { 1020 t = (low+high)/2; 1021 if (*(cp+t*8) > col) high = t; 1022 else low = t; 1023 } 1024 for (i=low; i<high; i++) { 1025 if (*(cp+8*i) > col) break; 1026 if (*(cp+8*i) == col) { 1027 *v++ = *(vp+8*i); 1028 goto finished; 1029 } 1030 } 1031 *v++ = 0.0; 1032 finished:; 1033 } 1034 } 1035 PetscFunctionReturn(0); 1036 } 1037 1038 PetscErrorCode MatView_SeqSELL_ASCII(Mat A,PetscViewer viewer) 1039 { 1040 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1041 PetscInt i,j,m=A->rmap->n,shift; 1042 const char *name; 1043 PetscViewerFormat format; 1044 1045 PetscFunctionBegin; 1046 PetscCall(PetscViewerGetFormat(viewer,&format)); 1047 if (format == PETSC_VIEWER_ASCII_MATLAB) { 1048 PetscInt nofinalvalue = 0; 1049 /* 1050 if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) { 1051 nofinalvalue = 1; 1052 } 1053 */ 1054 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE)); 1055 PetscCall(PetscViewerASCIIPrintf(viewer,"%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n",m,A->cmap->n)); 1056 PetscCall(PetscViewerASCIIPrintf(viewer,"%% Nonzeros = %" PetscInt_FMT " \n",a->nz)); 1057 #if defined(PETSC_USE_COMPLEX) 1058 PetscCall(PetscViewerASCIIPrintf(viewer,"zzz = zeros(%" PetscInt_FMT ",4);\n",a->nz+nofinalvalue)); 1059 #else 1060 PetscCall(PetscViewerASCIIPrintf(viewer,"zzz = zeros(%" PetscInt_FMT ",3);\n",a->nz+nofinalvalue)); 1061 #endif 1062 PetscCall(PetscViewerASCIIPrintf(viewer,"zzz = [\n")); 1063 1064 for (i=0; i<m; i++) { 1065 shift = a->sliidx[i>>3]+(i&0x07); 1066 for (j=0; j<a->rlen[i]; j++) { 1067 #if defined(PETSC_USE_COMPLEX) 1068 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n",i+1,a->colidx[shift+8*j]+1,(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]))); 1069 #else 1070 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n",i+1,a->colidx[shift+8*j]+1,(double)a->val[shift+8*j])); 1071 #endif 1072 } 1073 } 1074 /* 1075 if (nofinalvalue) { 1076 #if defined(PETSC_USE_COMPLEX) 1077 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n",m,A->cmap->n,0.,0.)); 1078 #else 1079 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n",m,A->cmap->n,0.0)); 1080 #endif 1081 } 1082 */ 1083 PetscCall(PetscObjectGetName((PetscObject)A,&name)); 1084 PetscCall(PetscViewerASCIIPrintf(viewer,"];\n %s = spconvert(zzz);\n",name)); 1085 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE)); 1086 } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) { 1087 PetscFunctionReturn(0); 1088 } else if (format == PETSC_VIEWER_ASCII_COMMON) { 1089 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE)); 1090 for (i=0; i<m; i++) { 1091 PetscCall(PetscViewerASCIIPrintf(viewer,"row %" PetscInt_FMT ":",i)); 1092 shift = a->sliidx[i>>3]+(i&0x07); 1093 for (j=0; j<a->rlen[i]; j++) { 1094 #if defined(PETSC_USE_COMPLEX) 1095 if (PetscImaginaryPart(a->val[shift+8*j]) > 0.0 && PetscRealPart(a->val[shift+8*j]) != 0.0) { 1096 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]))); 1097 } else if (PetscImaginaryPart(a->val[shift+8*j]) < 0.0 && PetscRealPart(a->val[shift+8*j]) != 0.0) { 1098 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)-PetscImaginaryPart(a->val[shift+8*j]))); 1099 } else if (PetscRealPart(a->val[shift+8*j]) != 0.0) { 1100 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]))); 1101 } 1102 #else 1103 if (a->val[shift+8*j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)a->val[shift+8*j])); 1104 #endif 1105 } 1106 PetscCall(PetscViewerASCIIPrintf(viewer,"\n")); 1107 } 1108 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE)); 1109 } else if (format == PETSC_VIEWER_ASCII_DENSE) { 1110 PetscInt cnt=0,jcnt; 1111 PetscScalar value; 1112 #if defined(PETSC_USE_COMPLEX) 1113 PetscBool realonly=PETSC_TRUE; 1114 for (i=0; i<a->sliidx[a->totalslices]; i++) { 1115 if (PetscImaginaryPart(a->val[i]) != 0.0) { 1116 realonly = PETSC_FALSE; 1117 break; 1118 } 1119 } 1120 #endif 1121 1122 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE)); 1123 for (i=0; i<m; i++) { 1124 jcnt = 0; 1125 shift = a->sliidx[i>>3]+(i&0x07); 1126 for (j=0; j<A->cmap->n; j++) { 1127 if (jcnt < a->rlen[i] && j == a->colidx[shift+8*j]) { 1128 value = a->val[cnt++]; 1129 jcnt++; 1130 } else { 1131 value = 0.0; 1132 } 1133 #if defined(PETSC_USE_COMPLEX) 1134 if (realonly) { 1135 PetscCall(PetscViewerASCIIPrintf(viewer," %7.5e ",(double)PetscRealPart(value))); 1136 } else { 1137 PetscCall(PetscViewerASCIIPrintf(viewer," %7.5e+%7.5e i ",(double)PetscRealPart(value),(double)PetscImaginaryPart(value))); 1138 } 1139 #else 1140 PetscCall(PetscViewerASCIIPrintf(viewer," %7.5e ",(double)value)); 1141 #endif 1142 } 1143 PetscCall(PetscViewerASCIIPrintf(viewer,"\n")); 1144 } 1145 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE)); 1146 } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) { 1147 PetscInt fshift=1; 1148 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE)); 1149 #if defined(PETSC_USE_COMPLEX) 1150 PetscCall(PetscViewerASCIIPrintf(viewer,"%%%%MatrixMarket matrix coordinate complex general\n")); 1151 #else 1152 PetscCall(PetscViewerASCIIPrintf(viewer,"%%%%MatrixMarket matrix coordinate real general\n")); 1153 #endif 1154 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz)); 1155 for (i=0; i<m; i++) { 1156 shift = a->sliidx[i>>3]+(i&0x07); 1157 for (j=0; j<a->rlen[i]; j++) { 1158 #if defined(PETSC_USE_COMPLEX) 1159 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n",i+fshift,a->colidx[shift+8*j]+fshift,(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]))); 1160 #else 1161 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %g\n",i+fshift,a->colidx[shift+8*j]+fshift,(double)a->val[shift+8*j])); 1162 #endif 1163 } 1164 } 1165 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE)); 1166 } else if (format == PETSC_VIEWER_NATIVE) { 1167 for (i=0; i<a->totalslices; i++) { /* loop over slices */ 1168 PetscInt row; 1169 PetscCall(PetscViewerASCIIPrintf(viewer,"slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n",i,a->sliidx[i],a->sliidx[i+1])); 1170 for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) { 1171 #if defined(PETSC_USE_COMPLEX) 1172 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1173 PetscCall(PetscViewerASCIIPrintf(viewer," %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]))); 1174 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1175 PetscCall(PetscViewerASCIIPrintf(viewer," %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]),-(double)PetscImaginaryPart(a->val[j]))); 1176 } else { 1177 PetscCall(PetscViewerASCIIPrintf(viewer," %" PetscInt_FMT " %" PetscInt_FMT " %g\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]))); 1178 } 1179 #else 1180 PetscCall(PetscViewerASCIIPrintf(viewer," %" PetscInt_FMT " %" PetscInt_FMT " %g\n",8*i+row,a->colidx[j],(double)a->val[j])); 1181 #endif 1182 } 1183 } 1184 } else { 1185 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE)); 1186 if (A->factortype) { 1187 for (i=0; i<m; i++) { 1188 shift = a->sliidx[i>>3]+(i&0x07); 1189 PetscCall(PetscViewerASCIIPrintf(viewer,"row %" PetscInt_FMT ":",i)); 1190 /* L part */ 1191 for (j=shift; j<a->diag[i]; j+=8) { 1192 #if defined(PETSC_USE_COMPLEX) 1193 if (PetscImaginaryPart(a->val[shift+8*j]) > 0.0) { 1194 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]))); 1195 } else if (PetscImaginaryPart(a->val[shift+8*j]) < 0.0) { 1196 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)(-PetscImaginaryPart(a->val[j])))); 1197 } else { 1198 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)PetscRealPart(a->val[j]))); 1199 } 1200 #else 1201 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)a->val[j])); 1202 #endif 1203 } 1204 /* diagonal */ 1205 j = a->diag[i]; 1206 #if defined(PETSC_USE_COMPLEX) 1207 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1208 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]),(double)PetscImaginaryPart(1.0/a->val[j]))); 1209 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1210 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]),(double)(-PetscImaginaryPart(1.0/a->val[j])))); 1211 } else { 1212 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]))); 1213 } 1214 #else 1215 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)(1.0/a->val[j]))); 1216 #endif 1217 1218 /* U part */ 1219 for (j=a->diag[i]+1; j<shift+8*a->rlen[i]; j+=8) { 1220 #if defined(PETSC_USE_COMPLEX) 1221 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1222 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]))); 1223 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1224 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)(-PetscImaginaryPart(a->val[j])))); 1225 } else { 1226 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)PetscRealPart(a->val[j]))); 1227 } 1228 #else 1229 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)a->val[j])); 1230 #endif 1231 } 1232 PetscCall(PetscViewerASCIIPrintf(viewer,"\n")); 1233 } 1234 } else { 1235 for (i=0; i<m; i++) { 1236 shift = a->sliidx[i>>3]+(i&0x07); 1237 PetscCall(PetscViewerASCIIPrintf(viewer,"row %" PetscInt_FMT ":",i)); 1238 for (j=0; j<a->rlen[i]; j++) { 1239 #if defined(PETSC_USE_COMPLEX) 1240 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1241 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]))); 1242 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1243 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)-PetscImaginaryPart(a->val[shift+8*j]))); 1244 } else { 1245 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]))); 1246 } 1247 #else 1248 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)a->val[shift+8*j])); 1249 #endif 1250 } 1251 PetscCall(PetscViewerASCIIPrintf(viewer,"\n")); 1252 } 1253 } 1254 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE)); 1255 } 1256 PetscCall(PetscViewerFlush(viewer)); 1257 PetscFunctionReturn(0); 1258 } 1259 1260 #include <petscdraw.h> 1261 PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw,void *Aa) 1262 { 1263 Mat A=(Mat)Aa; 1264 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1265 PetscInt i,j,m=A->rmap->n,shift; 1266 int color; 1267 PetscReal xl,yl,xr,yr,x_l,x_r,y_l,y_r; 1268 PetscViewer viewer; 1269 PetscViewerFormat format; 1270 1271 PetscFunctionBegin; 1272 PetscCall(PetscObjectQuery((PetscObject)A,"Zoomviewer",(PetscObject*)&viewer)); 1273 PetscCall(PetscViewerGetFormat(viewer,&format)); 1274 PetscCall(PetscDrawGetCoordinates(draw,&xl,&yl,&xr,&yr)); 1275 1276 /* loop over matrix elements drawing boxes */ 1277 1278 if (format != PETSC_VIEWER_DRAW_CONTOUR) { 1279 PetscDrawCollectiveBegin(draw); 1280 /* Blue for negative, Cyan for zero and Red for positive */ 1281 color = PETSC_DRAW_BLUE; 1282 for (i=0; i<m; i++) { 1283 shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */ 1284 y_l = m - i - 1.0; y_r = y_l + 1.0; 1285 for (j=0; j<a->rlen[i]; j++) { 1286 x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0; 1287 if (PetscRealPart(a->val[shift+8*j]) >= 0.) continue; 1288 PetscCall(PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color)); 1289 } 1290 } 1291 color = PETSC_DRAW_CYAN; 1292 for (i=0; i<m; i++) { 1293 shift = a->sliidx[i>>3]+(i&0x07); 1294 y_l = m - i - 1.0; y_r = y_l + 1.0; 1295 for (j=0; j<a->rlen[i]; j++) { 1296 x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0; 1297 if (a->val[shift+8*j] != 0.) continue; 1298 PetscCall(PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color)); 1299 } 1300 } 1301 color = PETSC_DRAW_RED; 1302 for (i=0; i<m; i++) { 1303 shift = a->sliidx[i>>3]+(i&0x07); 1304 y_l = m - i - 1.0; y_r = y_l + 1.0; 1305 for (j=0; j<a->rlen[i]; j++) { 1306 x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0; 1307 if (PetscRealPart(a->val[shift+8*j]) <= 0.) continue; 1308 PetscCall(PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color)); 1309 } 1310 } 1311 PetscDrawCollectiveEnd(draw); 1312 } else { 1313 /* use contour shading to indicate magnitude of values */ 1314 /* first determine max of all nonzero values */ 1315 PetscReal minv=0.0,maxv=0.0; 1316 PetscInt count=0; 1317 PetscDraw popup; 1318 for (i=0; i<a->sliidx[a->totalslices]; i++) { 1319 if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]); 1320 } 1321 if (minv >= maxv) maxv = minv + PETSC_SMALL; 1322 PetscCall(PetscDrawGetPopup(draw,&popup)); 1323 PetscCall(PetscDrawScalePopup(popup,minv,maxv)); 1324 1325 PetscDrawCollectiveBegin(draw); 1326 for (i=0; i<m; i++) { 1327 shift = a->sliidx[i>>3]+(i&0x07); 1328 y_l = m - i - 1.0; 1329 y_r = y_l + 1.0; 1330 for (j=0; j<a->rlen[i]; j++) { 1331 x_l = a->colidx[shift+j*8]; 1332 x_r = x_l + 1.0; 1333 color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]),minv,maxv); 1334 PetscCall(PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color)); 1335 count++; 1336 } 1337 } 1338 PetscDrawCollectiveEnd(draw); 1339 } 1340 PetscFunctionReturn(0); 1341 } 1342 1343 #include <petscdraw.h> 1344 PetscErrorCode MatView_SeqSELL_Draw(Mat A,PetscViewer viewer) 1345 { 1346 PetscDraw draw; 1347 PetscReal xr,yr,xl,yl,h,w; 1348 PetscBool isnull; 1349 1350 PetscFunctionBegin; 1351 PetscCall(PetscViewerDrawGetDraw(viewer,0,&draw)); 1352 PetscCall(PetscDrawIsNull(draw,&isnull)); 1353 if (isnull) PetscFunctionReturn(0); 1354 1355 xr = A->cmap->n; yr = A->rmap->n; h = yr/10.0; w = xr/10.0; 1356 xr += w; yr += h; xl = -w; yl = -h; 1357 PetscCall(PetscDrawSetCoordinates(draw,xl,yl,xr,yr)); 1358 PetscCall(PetscObjectCompose((PetscObject)A,"Zoomviewer",(PetscObject)viewer)); 1359 PetscCall(PetscDrawZoom(draw,MatView_SeqSELL_Draw_Zoom,A)); 1360 PetscCall(PetscObjectCompose((PetscObject)A,"Zoomviewer",NULL)); 1361 PetscCall(PetscDrawSave(draw)); 1362 PetscFunctionReturn(0); 1363 } 1364 1365 PetscErrorCode MatView_SeqSELL(Mat A,PetscViewer viewer) 1366 { 1367 PetscBool iascii,isbinary,isdraw; 1368 1369 PetscFunctionBegin; 1370 PetscCall(PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii)); 1371 PetscCall(PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERBINARY,&isbinary)); 1372 PetscCall(PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERDRAW,&isdraw)); 1373 if (iascii) { 1374 PetscCall(MatView_SeqSELL_ASCII(A,viewer)); 1375 } else if (isbinary) { 1376 /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */ 1377 } else if (isdraw) { 1378 PetscCall(MatView_SeqSELL_Draw(A,viewer)); 1379 } 1380 PetscFunctionReturn(0); 1381 } 1382 1383 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A,MatAssemblyType mode) 1384 { 1385 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1386 PetscInt i,shift,row_in_slice,row,nrow,*cp,lastcol,j,k; 1387 MatScalar *vp; 1388 1389 PetscFunctionBegin; 1390 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 1391 /* To do: compress out the unused elements */ 1392 PetscCall(MatMarkDiagonal_SeqSELL(A)); 1393 PetscCall(PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n",A->rmap->n,A->cmap->n,a->maxallocmat,a->sliidx[a->totalslices],a->nz,a->sliidx[a->totalslices]-a->nz)); 1394 PetscCall(PetscInfo(A,"Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n",a->reallocs)); 1395 PetscCall(PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rlenmax)); 1396 /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */ 1397 for (i=0; i<a->totalslices; ++i) { 1398 shift = a->sliidx[i]; /* starting index of the slice */ 1399 cp = a->colidx+shift; /* pointer to the column indices of the slice */ 1400 vp = a->val+shift; /* pointer to the nonzero values of the slice */ 1401 for (row_in_slice=0; row_in_slice<8; ++row_in_slice) { /* loop over rows in the slice */ 1402 row = 8*i + row_in_slice; 1403 nrow = a->rlen[row]; /* number of nonzeros in row */ 1404 /* 1405 Search for the nearest nonzero. Normally setting the index to zero may cause extra communication. 1406 But if the entire slice are empty, it is fine to use 0 since the index will not be loaded. 1407 */ 1408 lastcol = 0; 1409 if (nrow>0) { /* nonempty row */ 1410 lastcol = cp[8*(nrow-1)+row_in_slice]; /* use the index from the last nonzero at current row */ 1411 } else if (!row_in_slice) { /* first row of the currect slice is empty */ 1412 for (j=1;j<8;j++) { 1413 if (a->rlen[8*i+j]) { 1414 lastcol = cp[j]; 1415 break; 1416 } 1417 } 1418 } else { 1419 if (a->sliidx[i+1] != shift) lastcol = cp[row_in_slice-1]; /* use the index from the previous row */ 1420 } 1421 1422 for (k=nrow; k<(a->sliidx[i+1]-shift)/8; ++k) { 1423 cp[8*k+row_in_slice] = lastcol; 1424 vp[8*k+row_in_slice] = (MatScalar)0; 1425 } 1426 } 1427 } 1428 1429 A->info.mallocs += a->reallocs; 1430 a->reallocs = 0; 1431 1432 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 1433 PetscFunctionReturn(0); 1434 } 1435 1436 PetscErrorCode MatGetInfo_SeqSELL(Mat A,MatInfoType flag,MatInfo *info) 1437 { 1438 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1439 1440 PetscFunctionBegin; 1441 info->block_size = 1.0; 1442 info->nz_allocated = a->maxallocmat; 1443 info->nz_used = a->sliidx[a->totalslices]; /* include padding zeros */ 1444 info->nz_unneeded = (a->maxallocmat-a->sliidx[a->totalslices]); 1445 info->assemblies = A->num_ass; 1446 info->mallocs = A->info.mallocs; 1447 info->memory = ((PetscObject)A)->mem; 1448 if (A->factortype) { 1449 info->fill_ratio_given = A->info.fill_ratio_given; 1450 info->fill_ratio_needed = A->info.fill_ratio_needed; 1451 info->factor_mallocs = A->info.factor_mallocs; 1452 } else { 1453 info->fill_ratio_given = 0; 1454 info->fill_ratio_needed = 0; 1455 info->factor_mallocs = 0; 1456 } 1457 PetscFunctionReturn(0); 1458 } 1459 1460 PetscErrorCode MatSetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is) 1461 { 1462 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1463 PetscInt shift,i,k,l,low,high,t,ii,row,col,nrow; 1464 PetscInt *cp,nonew=a->nonew,lastcol=-1; 1465 MatScalar *vp,value; 1466 1467 PetscFunctionBegin; 1468 for (k=0; k<m; k++) { /* loop over added rows */ 1469 row = im[k]; 1470 if (row < 0) continue; 1471 PetscCheck(row < A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT,row,A->rmap->n-1); 1472 shift = a->sliidx[row>>3]+(row&0x07); /* starting index of the row */ 1473 cp = a->colidx+shift; /* pointer to the row */ 1474 vp = a->val+shift; /* pointer to the row */ 1475 nrow = a->rlen[row]; 1476 low = 0; 1477 high = nrow; 1478 1479 for (l=0; l<n; l++) { /* loop over added columns */ 1480 col = in[l]; 1481 if (col<0) continue; 1482 PetscCheck(col < A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT,col,A->cmap->n-1); 1483 if (a->roworiented) { 1484 value = v[l+k*n]; 1485 } else { 1486 value = v[k+l*m]; 1487 } 1488 if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue; 1489 1490 /* search in this row for the specified column, i indicates the column to be set */ 1491 if (col <= lastcol) low = 0; 1492 else high = nrow; 1493 lastcol = col; 1494 while (high-low > 5) { 1495 t = (low+high)/2; 1496 if (*(cp+t*8) > col) high = t; 1497 else low = t; 1498 } 1499 for (i=low; i<high; i++) { 1500 if (*(cp+i*8) > col) break; 1501 if (*(cp+i*8) == col) { 1502 if (is == ADD_VALUES) *(vp+i*8) += value; 1503 else *(vp+i*8) = value; 1504 low = i + 1; 1505 goto noinsert; 1506 } 1507 } 1508 if (value == 0.0 && a->ignorezeroentries) goto noinsert; 1509 if (nonew == 1) goto noinsert; 1510 PetscCheck(nonew != -1,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col); 1511 /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */ 1512 MatSeqXSELLReallocateSELL(A,A->rmap->n,1,nrow,a->sliidx,row/8,row,col,a->colidx,a->val,cp,vp,nonew,MatScalar); 1513 /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */ 1514 for (ii=nrow-1; ii>=i; ii--) { 1515 *(cp+(ii+1)*8) = *(cp+ii*8); 1516 *(vp+(ii+1)*8) = *(vp+ii*8); 1517 } 1518 a->rlen[row]++; 1519 *(cp+i*8) = col; 1520 *(vp+i*8) = value; 1521 a->nz++; 1522 A->nonzerostate++; 1523 low = i+1; high++; nrow++; 1524 noinsert:; 1525 } 1526 a->rlen[row] = nrow; 1527 } 1528 PetscFunctionReturn(0); 1529 } 1530 1531 PetscErrorCode MatCopy_SeqSELL(Mat A,Mat B,MatStructure str) 1532 { 1533 PetscFunctionBegin; 1534 /* If the two matrices have the same copy implementation, use fast copy. */ 1535 if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) { 1536 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1537 Mat_SeqSELL *b=(Mat_SeqSELL*)B->data; 1538 1539 PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices],PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Number of nonzeros in two matrices are different"); 1540 PetscCall(PetscArraycpy(b->val,a->val,a->sliidx[a->totalslices])); 1541 } else { 1542 PetscCall(MatCopy_Basic(A,B,str)); 1543 } 1544 PetscFunctionReturn(0); 1545 } 1546 1547 PetscErrorCode MatSetUp_SeqSELL(Mat A) 1548 { 1549 PetscFunctionBegin; 1550 PetscCall(MatSeqSELLSetPreallocation(A,PETSC_DEFAULT,NULL)); 1551 PetscFunctionReturn(0); 1552 } 1553 1554 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A,PetscScalar *array[]) 1555 { 1556 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1557 1558 PetscFunctionBegin; 1559 *array = a->val; 1560 PetscFunctionReturn(0); 1561 } 1562 1563 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A,PetscScalar *array[]) 1564 { 1565 PetscFunctionBegin; 1566 PetscFunctionReturn(0); 1567 } 1568 1569 PetscErrorCode MatRealPart_SeqSELL(Mat A) 1570 { 1571 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1572 PetscInt i; 1573 MatScalar *aval=a->val; 1574 1575 PetscFunctionBegin; 1576 for (i=0; i<a->sliidx[a->totalslices]; i++) aval[i]=PetscRealPart(aval[i]); 1577 PetscFunctionReturn(0); 1578 } 1579 1580 PetscErrorCode MatImaginaryPart_SeqSELL(Mat A) 1581 { 1582 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1583 PetscInt i; 1584 MatScalar *aval=a->val; 1585 1586 PetscFunctionBegin; 1587 for (i=0; i<a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]); 1588 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 1589 PetscFunctionReturn(0); 1590 } 1591 1592 PetscErrorCode MatScale_SeqSELL(Mat inA,PetscScalar alpha) 1593 { 1594 Mat_SeqSELL *a=(Mat_SeqSELL*)inA->data; 1595 MatScalar *aval=a->val; 1596 PetscScalar oalpha=alpha; 1597 PetscBLASInt one=1,size; 1598 1599 PetscFunctionBegin; 1600 PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices],&size)); 1601 PetscStackCallBLAS("BLASscal",BLASscal_(&size,&oalpha,aval,&one)); 1602 PetscCall(PetscLogFlops(a->nz)); 1603 PetscCall(MatSeqSELLInvalidateDiagonal(inA)); 1604 PetscFunctionReturn(0); 1605 } 1606 1607 PetscErrorCode MatShift_SeqSELL(Mat Y,PetscScalar a) 1608 { 1609 Mat_SeqSELL *y=(Mat_SeqSELL*)Y->data; 1610 1611 PetscFunctionBegin; 1612 if (!Y->preallocated || !y->nz) { 1613 PetscCall(MatSeqSELLSetPreallocation(Y,1,NULL)); 1614 } 1615 PetscCall(MatShift_Basic(Y,a)); 1616 PetscFunctionReturn(0); 1617 } 1618 1619 PetscErrorCode MatSOR_SeqSELL(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 1620 { 1621 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1622 PetscScalar *x,sum,*t; 1623 const MatScalar *idiag=NULL,*mdiag; 1624 const PetscScalar *b,*xb; 1625 PetscInt n,m=A->rmap->n,i,j,shift; 1626 const PetscInt *diag; 1627 1628 PetscFunctionBegin; 1629 its = its*lits; 1630 1631 if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */ 1632 if (!a->idiagvalid) PetscCall(MatInvertDiagonal_SeqSELL(A,omega,fshift)); 1633 a->fshift = fshift; 1634 a->omega = omega; 1635 1636 diag = a->diag; 1637 t = a->ssor_work; 1638 idiag = a->idiag; 1639 mdiag = a->mdiag; 1640 1641 PetscCall(VecGetArray(xx,&x)); 1642 PetscCall(VecGetArrayRead(bb,&b)); 1643 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 1644 PetscCheck(flag != SOR_APPLY_UPPER,PETSC_COMM_SELF,PETSC_ERR_SUP,"SOR_APPLY_UPPER is not implemented"); 1645 PetscCheck(flag != SOR_APPLY_LOWER,PETSC_COMM_SELF,PETSC_ERR_SUP,"SOR_APPLY_LOWER is not implemented"); 1646 PetscCheck(!(flag & SOR_EISENSTAT),PETSC_COMM_SELF,PETSC_ERR_SUP,"No support yet for Eisenstat"); 1647 1648 if (flag & SOR_ZERO_INITIAL_GUESS) { 1649 if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) { 1650 for (i=0; i<m; i++) { 1651 shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */ 1652 sum = b[i]; 1653 n = (diag[i]-shift)/8; 1654 for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]]; 1655 t[i] = sum; 1656 x[i] = sum*idiag[i]; 1657 } 1658 xb = t; 1659 PetscCall(PetscLogFlops(a->nz)); 1660 } else xb = b; 1661 if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1662 for (i=m-1; i>=0; i--) { 1663 shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */ 1664 sum = xb[i]; 1665 n = a->rlen[i]-(diag[i]-shift)/8-1; 1666 for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]]; 1667 if (xb == b) { 1668 x[i] = sum*idiag[i]; 1669 } else { 1670 x[i] = (1.-omega)*x[i]+sum*idiag[i]; /* omega in idiag */ 1671 } 1672 } 1673 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */ 1674 } 1675 its--; 1676 } 1677 while (its--) { 1678 if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) { 1679 for (i=0; i<m; i++) { 1680 /* lower */ 1681 shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */ 1682 sum = b[i]; 1683 n = (diag[i]-shift)/8; 1684 for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]]; 1685 t[i] = sum; /* save application of the lower-triangular part */ 1686 /* upper */ 1687 n = a->rlen[i]-(diag[i]-shift)/8-1; 1688 for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]]; 1689 x[i] = (1.-omega)*x[i]+sum*idiag[i]; /* omega in idiag */ 1690 } 1691 xb = t; 1692 PetscCall(PetscLogFlops(2.0*a->nz)); 1693 } else xb = b; 1694 if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1695 for (i=m-1; i>=0; i--) { 1696 shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */ 1697 sum = xb[i]; 1698 if (xb == b) { 1699 /* whole matrix (no checkpointing available) */ 1700 n = a->rlen[i]; 1701 for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]]; 1702 x[i] = (1.-omega)*x[i]+(sum+mdiag[i]*x[i])*idiag[i]; 1703 } else { /* lower-triangular part has been saved, so only apply upper-triangular */ 1704 n = a->rlen[i]-(diag[i]-shift)/8-1; 1705 for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]]; 1706 x[i] = (1.-omega)*x[i]+sum*idiag[i]; /* omega in idiag */ 1707 } 1708 } 1709 if (xb == b) { 1710 PetscCall(PetscLogFlops(2.0*a->nz)); 1711 } else { 1712 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */ 1713 } 1714 } 1715 } 1716 PetscCall(VecRestoreArray(xx,&x)); 1717 PetscCall(VecRestoreArrayRead(bb,&b)); 1718 PetscFunctionReturn(0); 1719 } 1720 1721 /* -------------------------------------------------------------------*/ 1722 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL, 1723 MatGetRow_SeqSELL, 1724 MatRestoreRow_SeqSELL, 1725 MatMult_SeqSELL, 1726 /* 4*/ MatMultAdd_SeqSELL, 1727 MatMultTranspose_SeqSELL, 1728 MatMultTransposeAdd_SeqSELL, 1729 NULL, 1730 NULL, 1731 NULL, 1732 /* 10*/ NULL, 1733 NULL, 1734 NULL, 1735 MatSOR_SeqSELL, 1736 NULL, 1737 /* 15*/ MatGetInfo_SeqSELL, 1738 MatEqual_SeqSELL, 1739 MatGetDiagonal_SeqSELL, 1740 MatDiagonalScale_SeqSELL, 1741 NULL, 1742 /* 20*/ NULL, 1743 MatAssemblyEnd_SeqSELL, 1744 MatSetOption_SeqSELL, 1745 MatZeroEntries_SeqSELL, 1746 /* 24*/ NULL, 1747 NULL, 1748 NULL, 1749 NULL, 1750 NULL, 1751 /* 29*/ MatSetUp_SeqSELL, 1752 NULL, 1753 NULL, 1754 NULL, 1755 NULL, 1756 /* 34*/ MatDuplicate_SeqSELL, 1757 NULL, 1758 NULL, 1759 NULL, 1760 NULL, 1761 /* 39*/ NULL, 1762 NULL, 1763 NULL, 1764 MatGetValues_SeqSELL, 1765 MatCopy_SeqSELL, 1766 /* 44*/ NULL, 1767 MatScale_SeqSELL, 1768 MatShift_SeqSELL, 1769 NULL, 1770 NULL, 1771 /* 49*/ NULL, 1772 NULL, 1773 NULL, 1774 NULL, 1775 NULL, 1776 /* 54*/ MatFDColoringCreate_SeqXAIJ, 1777 NULL, 1778 NULL, 1779 NULL, 1780 NULL, 1781 /* 59*/ NULL, 1782 MatDestroy_SeqSELL, 1783 MatView_SeqSELL, 1784 NULL, 1785 NULL, 1786 /* 64*/ NULL, 1787 NULL, 1788 NULL, 1789 NULL, 1790 NULL, 1791 /* 69*/ NULL, 1792 NULL, 1793 NULL, 1794 NULL, 1795 NULL, 1796 /* 74*/ NULL, 1797 MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */ 1798 NULL, 1799 NULL, 1800 NULL, 1801 /* 79*/ NULL, 1802 NULL, 1803 NULL, 1804 NULL, 1805 NULL, 1806 /* 84*/ NULL, 1807 NULL, 1808 NULL, 1809 NULL, 1810 NULL, 1811 /* 89*/ NULL, 1812 NULL, 1813 NULL, 1814 NULL, 1815 NULL, 1816 /* 94*/ NULL, 1817 NULL, 1818 NULL, 1819 NULL, 1820 NULL, 1821 /* 99*/ NULL, 1822 NULL, 1823 NULL, 1824 MatConjugate_SeqSELL, 1825 NULL, 1826 /*104*/ NULL, 1827 NULL, 1828 NULL, 1829 NULL, 1830 NULL, 1831 /*109*/ NULL, 1832 NULL, 1833 NULL, 1834 NULL, 1835 MatMissingDiagonal_SeqSELL, 1836 /*114*/ NULL, 1837 NULL, 1838 NULL, 1839 NULL, 1840 NULL, 1841 /*119*/ NULL, 1842 NULL, 1843 NULL, 1844 NULL, 1845 NULL, 1846 /*124*/ NULL, 1847 NULL, 1848 NULL, 1849 NULL, 1850 NULL, 1851 /*129*/ NULL, 1852 NULL, 1853 NULL, 1854 NULL, 1855 NULL, 1856 /*134*/ NULL, 1857 NULL, 1858 NULL, 1859 NULL, 1860 NULL, 1861 /*139*/ NULL, 1862 NULL, 1863 NULL, 1864 MatFDColoringSetUp_SeqXAIJ, 1865 NULL, 1866 /*144*/ NULL, 1867 NULL, 1868 NULL, 1869 NULL 1870 }; 1871 1872 PetscErrorCode MatStoreValues_SeqSELL(Mat mat) 1873 { 1874 Mat_SeqSELL *a=(Mat_SeqSELL*)mat->data; 1875 1876 PetscFunctionBegin; 1877 PetscCheck(a->nonew,PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first"); 1878 1879 /* allocate space for values if not already there */ 1880 if (!a->saved_values) { 1881 PetscCall(PetscMalloc1(a->sliidx[a->totalslices]+1,&a->saved_values)); 1882 PetscCall(PetscLogObjectMemory((PetscObject)mat,(a->sliidx[a->totalslices]+1)*sizeof(PetscScalar))); 1883 } 1884 1885 /* copy values over */ 1886 PetscCall(PetscArraycpy(a->saved_values,a->val,a->sliidx[a->totalslices])); 1887 PetscFunctionReturn(0); 1888 } 1889 1890 PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat) 1891 { 1892 Mat_SeqSELL *a=(Mat_SeqSELL*)mat->data; 1893 1894 PetscFunctionBegin; 1895 PetscCheck(a->nonew,PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first"); 1896 PetscCheck(a->saved_values,PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatStoreValues(A);first"); 1897 PetscCall(PetscArraycpy(a->val,a->saved_values,a->sliidx[a->totalslices])); 1898 PetscFunctionReturn(0); 1899 } 1900 1901 /*@C 1902 MatSeqSELLRestoreArray - returns access to the array where the data for a MATSEQSELL matrix is stored obtained by MatSeqSELLGetArray() 1903 1904 Not Collective 1905 1906 Input Parameters: 1907 . mat - a MATSEQSELL matrix 1908 . array - pointer to the data 1909 1910 Level: intermediate 1911 1912 .seealso: `MatSeqSELLGetArray()`, `MatSeqSELLRestoreArrayF90()` 1913 @*/ 1914 PetscErrorCode MatSeqSELLRestoreArray(Mat A,PetscScalar **array) 1915 { 1916 PetscFunctionBegin; 1917 PetscUseMethod(A,"MatSeqSELLRestoreArray_C",(Mat,PetscScalar**),(A,array)); 1918 PetscFunctionReturn(0); 1919 } 1920 1921 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B) 1922 { 1923 Mat_SeqSELL *b; 1924 PetscMPIInt size; 1925 1926 PetscFunctionBegin; 1927 PetscCall(PetscCitationsRegister(citation,&cited)); 1928 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B),&size)); 1929 PetscCheck(size <= 1,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Comm must be of size 1"); 1930 1931 PetscCall(PetscNewLog(B,&b)); 1932 1933 B->data = (void*)b; 1934 1935 PetscCall(PetscMemcpy(B->ops,&MatOps_Values,sizeof(struct _MatOps))); 1936 1937 b->row = NULL; 1938 b->col = NULL; 1939 b->icol = NULL; 1940 b->reallocs = 0; 1941 b->ignorezeroentries = PETSC_FALSE; 1942 b->roworiented = PETSC_TRUE; 1943 b->nonew = 0; 1944 b->diag = NULL; 1945 b->solve_work = NULL; 1946 B->spptr = NULL; 1947 b->saved_values = NULL; 1948 b->idiag = NULL; 1949 b->mdiag = NULL; 1950 b->ssor_work = NULL; 1951 b->omega = 1.0; 1952 b->fshift = 0.0; 1953 b->idiagvalid = PETSC_FALSE; 1954 b->keepnonzeropattern = PETSC_FALSE; 1955 1956 PetscCall(PetscObjectChangeTypeName((PetscObject)B,MATSEQSELL)); 1957 PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLGetArray_C",MatSeqSELLGetArray_SeqSELL)); 1958 PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLRestoreArray_C",MatSeqSELLRestoreArray_SeqSELL)); 1959 PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatStoreValues_C",MatStoreValues_SeqSELL)); 1960 PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatRetrieveValues_C",MatRetrieveValues_SeqSELL)); 1961 PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLSetPreallocation_C",MatSeqSELLSetPreallocation_SeqSELL)); 1962 PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqsell_seqaij_C",MatConvert_SeqSELL_SeqAIJ)); 1963 PetscFunctionReturn(0); 1964 } 1965 1966 /* 1967 Given a matrix generated with MatGetFactor() duplicates all the information in A into B 1968 */ 1969 PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C,Mat A,MatDuplicateOption cpvalues,PetscBool mallocmatspace) 1970 { 1971 Mat_SeqSELL *c = (Mat_SeqSELL*)C->data,*a = (Mat_SeqSELL*)A->data; 1972 PetscInt i,m=A->rmap->n; 1973 PetscInt totalslices=a->totalslices; 1974 1975 PetscFunctionBegin; 1976 C->factortype = A->factortype; 1977 c->row = NULL; 1978 c->col = NULL; 1979 c->icol = NULL; 1980 c->reallocs = 0; 1981 C->assembled = PETSC_TRUE; 1982 1983 PetscCall(PetscLayoutReference(A->rmap,&C->rmap)); 1984 PetscCall(PetscLayoutReference(A->cmap,&C->cmap)); 1985 1986 PetscCall(PetscMalloc1(8*totalslices,&c->rlen)); 1987 PetscCall(PetscLogObjectMemory((PetscObject)C,m*sizeof(PetscInt))); 1988 PetscCall(PetscMalloc1(totalslices+1,&c->sliidx)); 1989 PetscCall(PetscLogObjectMemory((PetscObject)C, (totalslices+1)*sizeof(PetscInt))); 1990 1991 for (i=0; i<m; i++) c->rlen[i] = a->rlen[i]; 1992 for (i=0; i<totalslices+1; i++) c->sliidx[i] = a->sliidx[i]; 1993 1994 /* allocate the matrix space */ 1995 if (mallocmatspace) { 1996 PetscCall(PetscMalloc2(a->maxallocmat,&c->val,a->maxallocmat,&c->colidx)); 1997 PetscCall(PetscLogObjectMemory((PetscObject)C,a->maxallocmat*(sizeof(PetscScalar)+sizeof(PetscInt)))); 1998 1999 c->singlemalloc = PETSC_TRUE; 2000 2001 if (m > 0) { 2002 PetscCall(PetscArraycpy(c->colidx,a->colidx,a->maxallocmat)); 2003 if (cpvalues == MAT_COPY_VALUES) { 2004 PetscCall(PetscArraycpy(c->val,a->val,a->maxallocmat)); 2005 } else { 2006 PetscCall(PetscArrayzero(c->val,a->maxallocmat)); 2007 } 2008 } 2009 } 2010 2011 c->ignorezeroentries = a->ignorezeroentries; 2012 c->roworiented = a->roworiented; 2013 c->nonew = a->nonew; 2014 if (a->diag) { 2015 PetscCall(PetscMalloc1(m,&c->diag)); 2016 PetscCall(PetscLogObjectMemory((PetscObject)C,m*sizeof(PetscInt))); 2017 for (i=0; i<m; i++) { 2018 c->diag[i] = a->diag[i]; 2019 } 2020 } else c->diag = NULL; 2021 2022 c->solve_work = NULL; 2023 c->saved_values = NULL; 2024 c->idiag = NULL; 2025 c->ssor_work = NULL; 2026 c->keepnonzeropattern = a->keepnonzeropattern; 2027 c->free_val = PETSC_TRUE; 2028 c->free_colidx = PETSC_TRUE; 2029 2030 c->maxallocmat = a->maxallocmat; 2031 c->maxallocrow = a->maxallocrow; 2032 c->rlenmax = a->rlenmax; 2033 c->nz = a->nz; 2034 C->preallocated = PETSC_TRUE; 2035 2036 c->nonzerorowcnt = a->nonzerorowcnt; 2037 C->nonzerostate = A->nonzerostate; 2038 2039 PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist,&((PetscObject)C)->qlist)); 2040 PetscFunctionReturn(0); 2041 } 2042 2043 PetscErrorCode MatDuplicate_SeqSELL(Mat A,MatDuplicateOption cpvalues,Mat *B) 2044 { 2045 PetscFunctionBegin; 2046 PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B)); 2047 PetscCall(MatSetSizes(*B,A->rmap->n,A->cmap->n,A->rmap->n,A->cmap->n)); 2048 if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) { 2049 PetscCall(MatSetBlockSizesFromMats(*B,A,A)); 2050 } 2051 PetscCall(MatSetType(*B,((PetscObject)A)->type_name)); 2052 PetscCall(MatDuplicateNoCreate_SeqSELL(*B,A,cpvalues,PETSC_TRUE)); 2053 PetscFunctionReturn(0); 2054 } 2055 2056 /*MC 2057 MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices, 2058 based on the sliced Ellpack format 2059 2060 Options Database Keys: 2061 . -mat_type seqsell - sets the matrix type to "seqsell" during a call to MatSetFromOptions() 2062 2063 Level: beginner 2064 2065 .seealso: `MatCreateSeqSell()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ` 2066 M*/ 2067 2068 /*MC 2069 MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices. 2070 2071 This matrix type is identical to MATSEQSELL when constructed with a single process communicator, 2072 and MATMPISELL otherwise. As a result, for single process communicators, 2073 MatSeqSELLSetPreallocation() is supported, and similarly MatMPISELLSetPreallocation() is supported 2074 for communicators controlling multiple processes. It is recommended that you call both of 2075 the above preallocation routines for simplicity. 2076 2077 Options Database Keys: 2078 . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions() 2079 2080 Level: beginner 2081 2082 Notes: 2083 This format is only supported for real scalars, double precision, and 32 bit indices (the defaults). 2084 2085 It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of 2086 non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement. 2087 2088 Developer Notes: 2089 On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance. 2090 2091 The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8 2092 .vb 2093 (2 0 3 4) 2094 Consider the matrix A = (5 0 6 0) 2095 (0 0 7 8) 2096 (0 0 9 9) 2097 2098 symbolically the Ellpack format can be written as 2099 2100 (2 3 4 |) (0 2 3 |) 2101 v = (5 6 0 |) colidx = (0 2 2 |) 2102 -------- --------- 2103 (7 8 |) (2 3 |) 2104 (9 9 |) (2 3 |) 2105 2106 The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case). 2107 Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and 2108 zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice. 2109 2110 The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9) and for colidx is (0 0 2 2 3 2 2 2 3 3) 2111 2112 .ve 2113 2114 See MatMult_SeqSELL() for how this format is used with the SIMD operations to achieve high performance. 2115 2116 References: 2117 . * - Hong Zhang, Richard T. Mills, Karl Rupp, and Barry F. Smith, Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}, 2118 Proceedings of the 47th International Conference on Parallel Processing, 2018. 2119 2120 .seealso: `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSell()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ` 2121 M*/ 2122 2123 /*@C 2124 MatCreateSeqSELL - Creates a sparse matrix in SELL format. 2125 2126 Collective on comm 2127 2128 Input Parameters: 2129 + comm - MPI communicator, set to PETSC_COMM_SELF 2130 . m - number of rows 2131 . n - number of columns 2132 . rlenmax - maximum number of nonzeros in a row 2133 - rlen - array containing the number of nonzeros in the various rows 2134 (possibly different for each row) or NULL 2135 2136 Output Parameter: 2137 . A - the matrix 2138 2139 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 2140 MatXXXXSetPreallocation() paradigm instead of this routine directly. 2141 [MatXXXXSetPreallocation() is, for example, MatSeqSELLSetPreallocation] 2142 2143 Notes: 2144 If nnz is given then nz is ignored 2145 2146 Specify the preallocated storage with either rlenmax or rlen (not both). 2147 Set rlenmax=PETSC_DEFAULT and rlen=NULL for PETSc to control dynamic memory 2148 allocation. For large problems you MUST preallocate memory or you 2149 will get TERRIBLE performance, see the users' manual chapter on matrices. 2150 2151 Level: intermediate 2152 2153 .seealso: `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATSEQSELL`, `MATMPISELL` 2154 2155 @*/ 2156 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt maxallocrow,const PetscInt rlen[],Mat *A) 2157 { 2158 PetscFunctionBegin; 2159 PetscCall(MatCreate(comm,A)); 2160 PetscCall(MatSetSizes(*A,m,n,m,n)); 2161 PetscCall(MatSetType(*A,MATSEQSELL)); 2162 PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A,maxallocrow,rlen)); 2163 PetscFunctionReturn(0); 2164 } 2165 2166 PetscErrorCode MatEqual_SeqSELL(Mat A,Mat B,PetscBool * flg) 2167 { 2168 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data,*b=(Mat_SeqSELL*)B->data; 2169 PetscInt totalslices=a->totalslices; 2170 2171 PetscFunctionBegin; 2172 /* If the matrix dimensions are not equal,or no of nonzeros */ 2173 if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) ||(a->nz != b->nz) || (a->rlenmax != b->rlenmax)) { 2174 *flg = PETSC_FALSE; 2175 PetscFunctionReturn(0); 2176 } 2177 /* if the a->colidx are the same */ 2178 PetscCall(PetscArraycmp(a->colidx,b->colidx,a->sliidx[totalslices],flg)); 2179 if (!*flg) PetscFunctionReturn(0); 2180 /* if a->val are the same */ 2181 PetscCall(PetscArraycmp(a->val,b->val,a->sliidx[totalslices],flg)); 2182 PetscFunctionReturn(0); 2183 } 2184 2185 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A) 2186 { 2187 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 2188 2189 PetscFunctionBegin; 2190 a->idiagvalid = PETSC_FALSE; 2191 PetscFunctionReturn(0); 2192 } 2193 2194 PetscErrorCode MatConjugate_SeqSELL(Mat A) 2195 { 2196 #if defined(PETSC_USE_COMPLEX) 2197 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 2198 PetscInt i; 2199 PetscScalar *val = a->val; 2200 2201 PetscFunctionBegin; 2202 for (i=0; i<a->sliidx[a->totalslices]; i++) { 2203 val[i] = PetscConj(val[i]); 2204 } 2205 #else 2206 PetscFunctionBegin; 2207 #endif 2208 PetscFunctionReturn(0); 2209 } 2210