1 2 /* 3 Defines the basic matrix operations for the SELL matrix storage format. 4 */ 5 #include <../src/mat/impls/sell/seq/sell.h> /*I "petscmat.h" I*/ 6 #include <petscblaslapack.h> 7 #include <petsc/private/kernels/blocktranspose.h> 8 9 static PetscBool cited = PETSC_FALSE; 10 static const char citation[] = 11 "@inproceedings{ZhangELLPACK2018,\n" 12 " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n" 13 " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n" 14 " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n" 15 " year = 2018\n" 16 "}\n"; 17 18 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 19 20 #include <immintrin.h> 21 22 #if !defined(_MM_SCALE_8) 23 #define _MM_SCALE_8 8 24 #endif 25 26 #if defined(__AVX512F__) 27 /* these do not work 28 vec_idx = _mm512_loadunpackhi_epi32(vec_idx,acolidx); 29 vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval); 30 */ 31 #define AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y) \ 32 /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \ 33 vec_idx = _mm256_loadu_si256((__m256i const*)acolidx); \ 34 vec_vals = _mm512_loadu_pd(aval); \ 35 vec_x = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8); \ 36 vec_y = _mm512_fmadd_pd(vec_x,vec_vals,vec_y) 37 #elif defined(__AVX2__) && defined(__FMA__) 38 #define AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y) \ 39 vec_vals = _mm256_loadu_pd(aval); \ 40 vec_idx = _mm_loadu_si128((__m128i const*)acolidx); /* SSE2 */ \ 41 vec_x = _mm256_i32gather_pd(x,vec_idx,_MM_SCALE_8); \ 42 vec_y = _mm256_fmadd_pd(vec_x,vec_vals,vec_y) 43 #endif 44 #endif /* PETSC_HAVE_IMMINTRIN_H */ 45 46 /*@C 47 MatSeqSELLSetPreallocation - For good matrix assembly performance 48 the user should preallocate the matrix storage by setting the parameter nz 49 (or the array nnz). By setting these parameters accurately, performance 50 during matrix assembly can be increased significantly. 51 52 Collective 53 54 Input Parameters: 55 + B - The matrix 56 . nz - number of nonzeros per row (same for all rows) 57 - nnz - array containing the number of nonzeros in the various rows 58 (possibly different for each row) or NULL 59 60 Notes: 61 If nnz is given then nz is ignored. 62 63 Specify the preallocated storage with either nz or nnz (not both). 64 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 65 allocation. For large problems you MUST preallocate memory or you 66 will get TERRIBLE performance, see the users' manual chapter on matrices. 67 68 You can call MatGetInfo() to get information on how effective the preallocation was; 69 for example the fields mallocs,nz_allocated,nz_used,nz_unneeded; 70 You can also run with the option -info and look for messages with the string 71 malloc in them to see if additional memory allocation was needed. 72 73 Developers: Use nz of MAT_SKIP_ALLOCATION to not allocate any space for the matrix 74 entries or columns indices. 75 76 The maximum number of nonzeos in any row should be as accurate as possible. 77 If it is underestimated, you will get bad performance due to reallocation 78 (MatSeqXSELLReallocateSELL). 79 80 Level: intermediate 81 82 .seealso: `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()` 83 84 @*/ 85 PetscErrorCode MatSeqSELLSetPreallocation(Mat B,PetscInt rlenmax,const PetscInt rlen[]) 86 { 87 PetscFunctionBegin; 88 PetscValidHeaderSpecific(B,MAT_CLASSID,1); 89 PetscValidType(B,1); 90 PetscTryMethod(B,"MatSeqSELLSetPreallocation_C",(Mat,PetscInt,const PetscInt[]),(B,rlenmax,rlen)); 91 PetscFunctionReturn(0); 92 } 93 94 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B,PetscInt maxallocrow,const PetscInt rlen[]) 95 { 96 Mat_SeqSELL *b; 97 PetscInt i,j,totalslices; 98 PetscBool skipallocation=PETSC_FALSE,realalloc=PETSC_FALSE; 99 100 PetscFunctionBegin; 101 if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE; 102 if (maxallocrow == MAT_SKIP_ALLOCATION) { 103 skipallocation = PETSC_TRUE; 104 maxallocrow = 0; 105 } 106 107 PetscCall(PetscLayoutSetUp(B->rmap)); 108 PetscCall(PetscLayoutSetUp(B->cmap)); 109 110 /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */ 111 if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5; 112 PetscCheck(maxallocrow >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"maxallocrow cannot be less than 0: value %" PetscInt_FMT,maxallocrow); 113 if (rlen) { 114 for (i=0; i<B->rmap->n; i++) { 115 PetscCheck(rlen[i] >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,rlen[i]); 116 PetscCheck(rlen[i] <= B->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT,i,rlen[i],B->cmap->n); 117 } 118 } 119 120 B->preallocated = PETSC_TRUE; 121 122 b = (Mat_SeqSELL*)B->data; 123 124 totalslices = PetscCeilInt(B->rmap->n,8); 125 b->totalslices = totalslices; 126 if (!skipallocation) { 127 if (B->rmap->n & 0x07) PetscCall(PetscInfo(B,"Padding rows to the SEQSELL matrix because the number of rows is not the multiple of 8 (value %" PetscInt_FMT ")\n",B->rmap->n)); 128 129 if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */ 130 PetscCall(PetscMalloc1(totalslices+1,&b->sliidx)); 131 PetscCall(PetscLogObjectMemory((PetscObject)B,(totalslices+1)*sizeof(PetscInt))); 132 } 133 if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */ 134 if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10; 135 else if (maxallocrow < 0) maxallocrow = 1; 136 for (i=0; i<=totalslices; i++) b->sliidx[i] = i*8*maxallocrow; 137 } else { 138 maxallocrow = 0; 139 b->sliidx[0] = 0; 140 for (i=1; i<totalslices; i++) { 141 b->sliidx[i] = 0; 142 for (j=0;j<8;j++) { 143 b->sliidx[i] = PetscMax(b->sliidx[i],rlen[8*(i-1)+j]); 144 } 145 maxallocrow = PetscMax(b->sliidx[i],maxallocrow); 146 PetscCall(PetscIntSumError(b->sliidx[i-1],8*b->sliidx[i],&b->sliidx[i])); 147 } 148 /* last slice */ 149 b->sliidx[totalslices] = 0; 150 for (j=(totalslices-1)*8;j<B->rmap->n;j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices],rlen[j]); 151 maxallocrow = PetscMax(b->sliidx[totalslices],maxallocrow); 152 b->sliidx[totalslices] = b->sliidx[totalslices-1] + 8*b->sliidx[totalslices]; 153 } 154 155 /* allocate space for val, colidx, rlen */ 156 /* FIXME: should B's old memory be unlogged? */ 157 PetscCall(MatSeqXSELLFreeSELL(B,&b->val,&b->colidx)); 158 /* FIXME: assuming an element of the bit array takes 8 bits */ 159 PetscCall(PetscMalloc2(b->sliidx[totalslices],&b->val,b->sliidx[totalslices],&b->colidx)); 160 PetscCall(PetscLogObjectMemory((PetscObject)B,b->sliidx[totalslices]*(sizeof(PetscScalar)+sizeof(PetscInt)))); 161 /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */ 162 PetscCall(PetscCalloc1(8*totalslices,&b->rlen)); 163 PetscCall(PetscLogObjectMemory((PetscObject)B,8*totalslices*sizeof(PetscInt))); 164 165 b->singlemalloc = PETSC_TRUE; 166 b->free_val = PETSC_TRUE; 167 b->free_colidx = PETSC_TRUE; 168 } else { 169 b->free_val = PETSC_FALSE; 170 b->free_colidx = PETSC_FALSE; 171 } 172 173 b->nz = 0; 174 b->maxallocrow = maxallocrow; 175 b->rlenmax = maxallocrow; 176 b->maxallocmat = b->sliidx[totalslices]; 177 B->info.nz_unneeded = (double)b->maxallocmat; 178 if (realalloc) PetscCall(MatSetOption(B,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE)); 179 PetscFunctionReturn(0); 180 } 181 182 PetscErrorCode MatGetRow_SeqSELL(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v) 183 { 184 Mat_SeqSELL *a = (Mat_SeqSELL*)A->data; 185 PetscInt shift; 186 187 PetscFunctionBegin; 188 PetscCheck(row >= 0 && row < A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row %" PetscInt_FMT " out of range",row); 189 if (nz) *nz = a->rlen[row]; 190 shift = a->sliidx[row>>3]+(row&0x07); 191 if (!a->getrowcols) { 192 193 PetscCall(PetscMalloc2(a->rlenmax,&a->getrowcols,a->rlenmax,&a->getrowvals)); 194 } 195 if (idx) { 196 PetscInt j; 197 for (j=0; j<a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift+8*j]; 198 *idx = a->getrowcols; 199 } 200 if (v) { 201 PetscInt j; 202 for (j=0; j<a->rlen[row]; j++) a->getrowvals[j] = a->val[shift+8*j]; 203 *v = a->getrowvals; 204 } 205 PetscFunctionReturn(0); 206 } 207 208 PetscErrorCode MatRestoreRow_SeqSELL(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v) 209 { 210 PetscFunctionBegin; 211 PetscFunctionReturn(0); 212 } 213 214 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype,MatReuse reuse,Mat *newmat) 215 { 216 Mat B; 217 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 218 PetscInt i; 219 220 PetscFunctionBegin; 221 if (reuse == MAT_REUSE_MATRIX) { 222 B = *newmat; 223 PetscCall(MatZeroEntries(B)); 224 } else { 225 PetscCall(MatCreate(PetscObjectComm((PetscObject)A),&B)); 226 PetscCall(MatSetSizes(B,A->rmap->n,A->cmap->n,A->rmap->N,A->cmap->N)); 227 PetscCall(MatSetType(B,MATSEQAIJ)); 228 PetscCall(MatSeqAIJSetPreallocation(B,0,a->rlen)); 229 } 230 231 for (i=0; i<A->rmap->n; i++) { 232 PetscInt nz = 0,*cols = NULL; 233 PetscScalar *vals = NULL; 234 235 PetscCall(MatGetRow_SeqSELL(A,i,&nz,&cols,&vals)); 236 PetscCall(MatSetValues(B,1,&i,nz,cols,vals,INSERT_VALUES)); 237 PetscCall(MatRestoreRow_SeqSELL(A,i,&nz,&cols,&vals)); 238 } 239 240 PetscCall(MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY)); 241 PetscCall(MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY)); 242 B->rmap->bs = A->rmap->bs; 243 244 if (reuse == MAT_INPLACE_MATRIX) { 245 PetscCall(MatHeaderReplace(A,&B)); 246 } else { 247 *newmat = B; 248 } 249 PetscFunctionReturn(0); 250 } 251 252 #include <../src/mat/impls/aij/seq/aij.h> 253 254 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A,MatType newtype,MatReuse reuse,Mat *newmat) 255 { 256 Mat B; 257 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 258 PetscInt *ai=a->i,m=A->rmap->N,n=A->cmap->N,i,*rowlengths,row,ncols; 259 const PetscInt *cols; 260 const PetscScalar *vals; 261 262 PetscFunctionBegin; 263 264 if (reuse == MAT_REUSE_MATRIX) { 265 B = *newmat; 266 } else { 267 if (PetscDefined(USE_DEBUG) || !a->ilen) { 268 PetscCall(PetscMalloc1(m,&rowlengths)); 269 for (i=0; i<m; i++) { 270 rowlengths[i] = ai[i+1] - ai[i]; 271 } 272 } 273 if (PetscDefined(USE_DEBUG) && a->ilen) { 274 PetscBool eq; 275 PetscCall(PetscMemcmp(rowlengths,a->ilen,m*sizeof(PetscInt),&eq)); 276 PetscCheck(eq,PETSC_COMM_SELF,PETSC_ERR_PLIB,"SeqAIJ ilen array incorrect"); 277 PetscCall(PetscFree(rowlengths)); 278 rowlengths = a->ilen; 279 } else if (a->ilen) rowlengths = a->ilen; 280 PetscCall(MatCreate(PetscObjectComm((PetscObject)A),&B)); 281 PetscCall(MatSetSizes(B,m,n,m,n)); 282 PetscCall(MatSetType(B,MATSEQSELL)); 283 PetscCall(MatSeqSELLSetPreallocation(B,0,rowlengths)); 284 if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths)); 285 } 286 287 for (row=0; row<m; row++) { 288 PetscCall(MatGetRow_SeqAIJ(A,row,&ncols,(PetscInt**)&cols,(PetscScalar**)&vals)); 289 PetscCall(MatSetValues_SeqSELL(B,1,&row,ncols,cols,vals,INSERT_VALUES)); 290 PetscCall(MatRestoreRow_SeqAIJ(A,row,&ncols,(PetscInt**)&cols,(PetscScalar**)&vals)); 291 } 292 PetscCall(MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY)); 293 PetscCall(MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY)); 294 B->rmap->bs = A->rmap->bs; 295 296 if (reuse == MAT_INPLACE_MATRIX) { 297 PetscCall(MatHeaderReplace(A,&B)); 298 } else { 299 *newmat = B; 300 } 301 PetscFunctionReturn(0); 302 } 303 304 PetscErrorCode MatMult_SeqSELL(Mat A,Vec xx,Vec yy) 305 { 306 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 307 PetscScalar *y; 308 const PetscScalar *x; 309 const MatScalar *aval=a->val; 310 PetscInt totalslices=a->totalslices; 311 const PetscInt *acolidx=a->colidx; 312 PetscInt i,j; 313 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 314 __m512d vec_x,vec_y,vec_vals; 315 __m256i vec_idx; 316 __mmask8 mask; 317 __m512d vec_x2,vec_y2,vec_vals2,vec_x3,vec_y3,vec_vals3,vec_x4,vec_y4,vec_vals4; 318 __m256i vec_idx2,vec_idx3,vec_idx4; 319 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 320 __m128i vec_idx; 321 __m256d vec_x,vec_y,vec_y2,vec_vals; 322 MatScalar yval; 323 PetscInt r,rows_left,row,nnz_in_row; 324 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 325 __m128d vec_x_tmp; 326 __m256d vec_x,vec_y,vec_y2,vec_vals; 327 MatScalar yval; 328 PetscInt r,rows_left,row,nnz_in_row; 329 #else 330 PetscScalar sum[8]; 331 #endif 332 333 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 334 #pragma disjoint(*x,*y,*aval) 335 #endif 336 337 PetscFunctionBegin; 338 PetscCall(VecGetArrayRead(xx,&x)); 339 PetscCall(VecGetArray(yy,&y)); 340 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 341 for (i=0; i<totalslices; i++) { /* loop over slices */ 342 PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 343 PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 344 345 vec_y = _mm512_setzero_pd(); 346 vec_y2 = _mm512_setzero_pd(); 347 vec_y3 = _mm512_setzero_pd(); 348 vec_y4 = _mm512_setzero_pd(); 349 350 j = a->sliidx[i]>>3; /* 8 bytes are read at each time, corresponding to a slice columnn */ 351 switch ((a->sliidx[i+1]-a->sliidx[i])/8 & 3) { 352 case 3: 353 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 354 acolidx += 8; aval += 8; 355 AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2); 356 acolidx += 8; aval += 8; 357 AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3); 358 acolidx += 8; aval += 8; 359 j += 3; 360 break; 361 case 2: 362 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 363 acolidx += 8; aval += 8; 364 AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2); 365 acolidx += 8; aval += 8; 366 j += 2; 367 break; 368 case 1: 369 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 370 acolidx += 8; aval += 8; 371 j += 1; 372 break; 373 } 374 #pragma novector 375 for (; j<(a->sliidx[i+1]>>3); j+=4) { 376 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 377 acolidx += 8; aval += 8; 378 AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2); 379 acolidx += 8; aval += 8; 380 AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3); 381 acolidx += 8; aval += 8; 382 AVX512_Mult_Private(vec_idx4,vec_x4,vec_vals4,vec_y4); 383 acolidx += 8; aval += 8; 384 } 385 386 vec_y = _mm512_add_pd(vec_y,vec_y2); 387 vec_y = _mm512_add_pd(vec_y,vec_y3); 388 vec_y = _mm512_add_pd(vec_y,vec_y4); 389 if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 390 mask = (__mmask8)(0xff >> (8-(A->rmap->n & 0x07))); 391 _mm512_mask_storeu_pd(&y[8*i],mask,vec_y); 392 } else { 393 _mm512_storeu_pd(&y[8*i],vec_y); 394 } 395 } 396 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 397 for (i=0; i<totalslices; i++) { /* loop over full slices */ 398 PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 399 PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 400 401 /* last slice may have padding rows. Don't use vectorization. */ 402 if (i == totalslices-1 && (A->rmap->n & 0x07)) { 403 rows_left = A->rmap->n - 8*i; 404 for (r=0; r<rows_left; ++r) { 405 yval = (MatScalar)0; 406 row = 8*i + r; 407 nnz_in_row = a->rlen[row]; 408 for (j=0; j<nnz_in_row; ++j) yval += aval[8*j+r] * x[acolidx[8*j+r]]; 409 y[row] = yval; 410 } 411 break; 412 } 413 414 vec_y = _mm256_setzero_pd(); 415 vec_y2 = _mm256_setzero_pd(); 416 417 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 418 #pragma novector 419 #pragma unroll(2) 420 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) { 421 AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 422 aval += 4; acolidx += 4; 423 AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y2); 424 aval += 4; acolidx += 4; 425 } 426 427 _mm256_storeu_pd(y+i*8,vec_y); 428 _mm256_storeu_pd(y+i*8+4,vec_y2); 429 } 430 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 431 for (i=0; i<totalslices; i++) { /* loop over full slices */ 432 PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 433 PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 434 435 vec_y = _mm256_setzero_pd(); 436 vec_y2 = _mm256_setzero_pd(); 437 438 /* last slice may have padding rows. Don't use vectorization. */ 439 if (i == totalslices-1 && (A->rmap->n & 0x07)) { 440 rows_left = A->rmap->n - 8*i; 441 for (r=0; r<rows_left; ++r) { 442 yval = (MatScalar)0; 443 row = 8*i + r; 444 nnz_in_row = a->rlen[row]; 445 for (j=0; j<nnz_in_row; ++j) yval += aval[8*j + r] * x[acolidx[8*j + r]]; 446 y[row] = yval; 447 } 448 break; 449 } 450 451 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 452 #pragma novector 453 #pragma unroll(2) 454 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) { 455 vec_vals = _mm256_loadu_pd(aval); 456 vec_x_tmp = _mm_setzero_pd(); 457 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 458 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 459 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,0); 460 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 461 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 462 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,1); 463 vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y); 464 aval += 4; 465 466 vec_vals = _mm256_loadu_pd(aval); 467 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 468 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 469 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,0); 470 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 471 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 472 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,1); 473 vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y2); 474 aval += 4; 475 } 476 477 _mm256_storeu_pd(y + i*8, vec_y); 478 _mm256_storeu_pd(y + i*8 + 4, vec_y2); 479 } 480 #else 481 for (i=0; i<totalslices; i++) { /* loop over slices */ 482 for (j=0; j<8; j++) sum[j] = 0.0; 483 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) { 484 sum[0] += aval[j] * x[acolidx[j]]; 485 sum[1] += aval[j+1] * x[acolidx[j+1]]; 486 sum[2] += aval[j+2] * x[acolidx[j+2]]; 487 sum[3] += aval[j+3] * x[acolidx[j+3]]; 488 sum[4] += aval[j+4] * x[acolidx[j+4]]; 489 sum[5] += aval[j+5] * x[acolidx[j+5]]; 490 sum[6] += aval[j+6] * x[acolidx[j+6]]; 491 sum[7] += aval[j+7] * x[acolidx[j+7]]; 492 } 493 if (i == totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */ 494 for (j=0; j<(A->rmap->n & 0x07); j++) y[8*i+j] = sum[j]; 495 } else { 496 for (j=0; j<8; j++) y[8*i+j] = sum[j]; 497 } 498 } 499 #endif 500 501 PetscCall(PetscLogFlops(2.0*a->nz-a->nonzerorowcnt)); /* theoretical minimal FLOPs */ 502 PetscCall(VecRestoreArrayRead(xx,&x)); 503 PetscCall(VecRestoreArray(yy,&y)); 504 PetscFunctionReturn(0); 505 } 506 507 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h> 508 PetscErrorCode MatMultAdd_SeqSELL(Mat A,Vec xx,Vec yy,Vec zz) 509 { 510 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 511 PetscScalar *y,*z; 512 const PetscScalar *x; 513 const MatScalar *aval=a->val; 514 PetscInt totalslices=a->totalslices; 515 const PetscInt *acolidx=a->colidx; 516 PetscInt i,j; 517 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 518 __m512d vec_x,vec_y,vec_vals; 519 __m256i vec_idx; 520 __mmask8 mask; 521 __m512d vec_x2,vec_y2,vec_vals2,vec_x3,vec_y3,vec_vals3,vec_x4,vec_y4,vec_vals4; 522 __m256i vec_idx2,vec_idx3,vec_idx4; 523 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 524 __m128d vec_x_tmp; 525 __m256d vec_x,vec_y,vec_y2,vec_vals; 526 MatScalar yval; 527 PetscInt r,row,nnz_in_row; 528 #else 529 PetscScalar sum[8]; 530 #endif 531 532 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 533 #pragma disjoint(*x,*y,*aval) 534 #endif 535 536 PetscFunctionBegin; 537 PetscCall(VecGetArrayRead(xx,&x)); 538 PetscCall(VecGetArrayPair(yy,zz,&y,&z)); 539 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 540 for (i=0; i<totalslices; i++) { /* loop over slices */ 541 PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 542 PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 543 544 if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 545 mask = (__mmask8)(0xff >> (8-(A->rmap->n & 0x07))); 546 vec_y = _mm512_mask_loadu_pd(vec_y,mask,&y[8*i]); 547 } else { 548 vec_y = _mm512_loadu_pd(&y[8*i]); 549 } 550 vec_y2 = _mm512_setzero_pd(); 551 vec_y3 = _mm512_setzero_pd(); 552 vec_y4 = _mm512_setzero_pd(); 553 554 j = a->sliidx[i]>>3; /* 8 bytes are read at each time, corresponding to a slice columnn */ 555 switch ((a->sliidx[i+1]-a->sliidx[i])/8 & 3) { 556 case 3: 557 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 558 acolidx += 8; aval += 8; 559 AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2); 560 acolidx += 8; aval += 8; 561 AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3); 562 acolidx += 8; aval += 8; 563 j += 3; 564 break; 565 case 2: 566 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 567 acolidx += 8; aval += 8; 568 AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2); 569 acolidx += 8; aval += 8; 570 j += 2; 571 break; 572 case 1: 573 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 574 acolidx += 8; aval += 8; 575 j += 1; 576 break; 577 } 578 #pragma novector 579 for (; j<(a->sliidx[i+1]>>3); j+=4) { 580 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 581 acolidx += 8; aval += 8; 582 AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2); 583 acolidx += 8; aval += 8; 584 AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3); 585 acolidx += 8; aval += 8; 586 AVX512_Mult_Private(vec_idx4,vec_x4,vec_vals4,vec_y4); 587 acolidx += 8; aval += 8; 588 } 589 590 vec_y = _mm512_add_pd(vec_y,vec_y2); 591 vec_y = _mm512_add_pd(vec_y,vec_y3); 592 vec_y = _mm512_add_pd(vec_y,vec_y4); 593 if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 594 _mm512_mask_storeu_pd(&z[8*i],mask,vec_y); 595 } else { 596 _mm512_storeu_pd(&z[8*i],vec_y); 597 } 598 } 599 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 600 for (i=0; i<totalslices; i++) { /* loop over full slices */ 601 PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 602 PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 603 604 /* last slice may have padding rows. Don't use vectorization. */ 605 if (i == totalslices-1 && (A->rmap->n & 0x07)) { 606 for (r=0; r<(A->rmap->n & 0x07); ++r) { 607 row = 8*i + r; 608 yval = (MatScalar)0.0; 609 nnz_in_row = a->rlen[row]; 610 for (j=0; j<nnz_in_row; ++j) yval += aval[8*j+r] * x[acolidx[8*j+r]]; 611 z[row] = y[row] + yval; 612 } 613 break; 614 } 615 616 vec_y = _mm256_loadu_pd(y+8*i); 617 vec_y2 = _mm256_loadu_pd(y+8*i+4); 618 619 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 620 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) { 621 vec_vals = _mm256_loadu_pd(aval); 622 vec_x_tmp = _mm_setzero_pd(); 623 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 624 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 625 vec_x = _mm256_setzero_pd(); 626 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,0); 627 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 628 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 629 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,1); 630 vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y); 631 aval += 4; 632 633 vec_vals = _mm256_loadu_pd(aval); 634 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 635 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 636 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,0); 637 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 638 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 639 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,1); 640 vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y2); 641 aval += 4; 642 } 643 644 _mm256_storeu_pd(z+i*8,vec_y); 645 _mm256_storeu_pd(z+i*8+4,vec_y2); 646 } 647 #else 648 for (i=0; i<totalslices; i++) { /* loop over slices */ 649 for (j=0; j<8; j++) sum[j] = 0.0; 650 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) { 651 sum[0] += aval[j] * x[acolidx[j]]; 652 sum[1] += aval[j+1] * x[acolidx[j+1]]; 653 sum[2] += aval[j+2] * x[acolidx[j+2]]; 654 sum[3] += aval[j+3] * x[acolidx[j+3]]; 655 sum[4] += aval[j+4] * x[acolidx[j+4]]; 656 sum[5] += aval[j+5] * x[acolidx[j+5]]; 657 sum[6] += aval[j+6] * x[acolidx[j+6]]; 658 sum[7] += aval[j+7] * x[acolidx[j+7]]; 659 } 660 if (i == totalslices-1 && (A->rmap->n & 0x07)) { 661 for (j=0; j<(A->rmap->n & 0x07); j++) z[8*i+j] = y[8*i+j] + sum[j]; 662 } else { 663 for (j=0; j<8; j++) z[8*i+j] = y[8*i+j] + sum[j]; 664 } 665 } 666 #endif 667 668 PetscCall(PetscLogFlops(2.0*a->nz)); 669 PetscCall(VecRestoreArrayRead(xx,&x)); 670 PetscCall(VecRestoreArrayPair(yy,zz,&y,&z)); 671 PetscFunctionReturn(0); 672 } 673 674 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A,Vec xx,Vec zz,Vec yy) 675 { 676 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 677 PetscScalar *y; 678 const PetscScalar *x; 679 const MatScalar *aval=a->val; 680 const PetscInt *acolidx=a->colidx; 681 PetscInt i,j,r,row,nnz_in_row,totalslices=a->totalslices; 682 683 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 684 #pragma disjoint(*x,*y,*aval) 685 #endif 686 687 PetscFunctionBegin; 688 if (A->symmetric == PETSC_BOOL3_TRUE) { 689 PetscCall(MatMultAdd_SeqSELL(A,xx,zz,yy)); 690 PetscFunctionReturn(0); 691 } 692 if (zz != yy) PetscCall(VecCopy(zz,yy)); 693 PetscCall(VecGetArrayRead(xx,&x)); 694 PetscCall(VecGetArray(yy,&y)); 695 for (i=0; i<a->totalslices; i++) { /* loop over slices */ 696 if (i == totalslices-1 && (A->rmap->n & 0x07)) { 697 for (r=0; r<(A->rmap->n & 0x07); ++r) { 698 row = 8*i + r; 699 nnz_in_row = a->rlen[row]; 700 for (j=0; j<nnz_in_row; ++j) y[acolidx[8*j+r]] += aval[8*j+r] * x[row]; 701 } 702 break; 703 } 704 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) { 705 y[acolidx[j]] += aval[j] * x[8*i]; 706 y[acolidx[j+1]] += aval[j+1] * x[8*i+1]; 707 y[acolidx[j+2]] += aval[j+2] * x[8*i+2]; 708 y[acolidx[j+3]] += aval[j+3] * x[8*i+3]; 709 y[acolidx[j+4]] += aval[j+4] * x[8*i+4]; 710 y[acolidx[j+5]] += aval[j+5] * x[8*i+5]; 711 y[acolidx[j+6]] += aval[j+6] * x[8*i+6]; 712 y[acolidx[j+7]] += aval[j+7] * x[8*i+7]; 713 } 714 } 715 PetscCall(PetscLogFlops(2.0*a->sliidx[a->totalslices])); 716 PetscCall(VecRestoreArrayRead(xx,&x)); 717 PetscCall(VecRestoreArray(yy,&y)); 718 PetscFunctionReturn(0); 719 } 720 721 PetscErrorCode MatMultTranspose_SeqSELL(Mat A,Vec xx,Vec yy) 722 { 723 PetscFunctionBegin; 724 if (A->symmetric == PETSC_BOOL3_TRUE) { 725 PetscCall(MatMult_SeqSELL(A,xx,yy)); 726 } else { 727 PetscCall(VecSet(yy,0.0)); 728 PetscCall(MatMultTransposeAdd_SeqSELL(A,xx,yy,yy)); 729 } 730 PetscFunctionReturn(0); 731 } 732 733 /* 734 Checks for missing diagonals 735 */ 736 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A,PetscBool *missing,PetscInt *d) 737 { 738 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 739 PetscInt *diag,i; 740 741 PetscFunctionBegin; 742 *missing = PETSC_FALSE; 743 if (A->rmap->n > 0 && !(a->colidx)) { 744 *missing = PETSC_TRUE; 745 if (d) *d = 0; 746 PetscCall(PetscInfo(A,"Matrix has no entries therefore is missing diagonal\n")); 747 } else { 748 diag = a->diag; 749 for (i=0; i<A->rmap->n; i++) { 750 if (diag[i] == -1) { 751 *missing = PETSC_TRUE; 752 if (d) *d = i; 753 PetscCall(PetscInfo(A,"Matrix is missing diagonal number %" PetscInt_FMT "\n",i)); 754 break; 755 } 756 } 757 } 758 PetscFunctionReturn(0); 759 } 760 761 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A) 762 { 763 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 764 PetscInt i,j,m=A->rmap->n,shift; 765 766 PetscFunctionBegin; 767 if (!a->diag) { 768 PetscCall(PetscMalloc1(m,&a->diag)); 769 PetscCall(PetscLogObjectMemory((PetscObject)A,m*sizeof(PetscInt))); 770 a->free_diag = PETSC_TRUE; 771 } 772 for (i=0; i<m; i++) { /* loop over rows */ 773 shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */ 774 a->diag[i] = -1; 775 for (j=0; j<a->rlen[i]; j++) { 776 if (a->colidx[shift+j*8] == i) { 777 a->diag[i] = shift+j*8; 778 break; 779 } 780 } 781 } 782 PetscFunctionReturn(0); 783 } 784 785 /* 786 Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways 787 */ 788 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A,PetscScalar omega,PetscScalar fshift) 789 { 790 Mat_SeqSELL *a=(Mat_SeqSELL*) A->data; 791 PetscInt i,*diag,m = A->rmap->n; 792 MatScalar *val = a->val; 793 PetscScalar *idiag,*mdiag; 794 795 PetscFunctionBegin; 796 if (a->idiagvalid) PetscFunctionReturn(0); 797 PetscCall(MatMarkDiagonal_SeqSELL(A)); 798 diag = a->diag; 799 if (!a->idiag) { 800 PetscCall(PetscMalloc3(m,&a->idiag,m,&a->mdiag,m,&a->ssor_work)); 801 PetscCall(PetscLogObjectMemory((PetscObject)A, 3*m*sizeof(PetscScalar))); 802 val = a->val; 803 } 804 mdiag = a->mdiag; 805 idiag = a->idiag; 806 807 if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) { 808 for (i=0; i<m; i++) { 809 mdiag[i] = val[diag[i]]; 810 if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */ 811 if (PetscRealPart(fshift)) { 812 PetscCall(PetscInfo(A,"Zero diagonal on row %" PetscInt_FMT "\n",i)); 813 A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 814 A->factorerror_zeropivot_value = 0.0; 815 A->factorerror_zeropivot_row = i; 816 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Zero diagonal on row %" PetscInt_FMT,i); 817 } 818 idiag[i] = 1.0/val[diag[i]]; 819 } 820 PetscCall(PetscLogFlops(m)); 821 } else { 822 for (i=0; i<m; i++) { 823 mdiag[i] = val[diag[i]]; 824 idiag[i] = omega/(fshift + val[diag[i]]); 825 } 826 PetscCall(PetscLogFlops(2.0*m)); 827 } 828 a->idiagvalid = PETSC_TRUE; 829 PetscFunctionReturn(0); 830 } 831 832 PetscErrorCode MatZeroEntries_SeqSELL(Mat A) 833 { 834 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 835 836 PetscFunctionBegin; 837 PetscCall(PetscArrayzero(a->val,a->sliidx[a->totalslices])); 838 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 839 PetscFunctionReturn(0); 840 } 841 842 PetscErrorCode MatDestroy_SeqSELL(Mat A) 843 { 844 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 845 846 PetscFunctionBegin; 847 #if defined(PETSC_USE_LOG) 848 PetscLogObjectState((PetscObject)A,"Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT,A->rmap->n,A->cmap->n,a->nz); 849 #endif 850 PetscCall(MatSeqXSELLFreeSELL(A,&a->val,&a->colidx)); 851 PetscCall(ISDestroy(&a->row)); 852 PetscCall(ISDestroy(&a->col)); 853 PetscCall(PetscFree(a->diag)); 854 PetscCall(PetscFree(a->rlen)); 855 PetscCall(PetscFree(a->sliidx)); 856 PetscCall(PetscFree3(a->idiag,a->mdiag,a->ssor_work)); 857 PetscCall(PetscFree(a->solve_work)); 858 PetscCall(ISDestroy(&a->icol)); 859 PetscCall(PetscFree(a->saved_values)); 860 PetscCall(PetscFree2(a->getrowcols,a->getrowvals)); 861 862 PetscCall(PetscFree(A->data)); 863 864 PetscCall(PetscObjectChangeTypeName((PetscObject)A,NULL)); 865 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatStoreValues_C",NULL)); 866 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatRetrieveValues_C",NULL)); 867 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqSELLSetPreallocation_C",NULL)); 868 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqSELLGetArray_C",NULL)); 869 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqSELLRestoreArray_C",NULL)); 870 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqsell_seqaij_C",NULL)); 871 PetscFunctionReturn(0); 872 } 873 874 PetscErrorCode MatSetOption_SeqSELL(Mat A,MatOption op,PetscBool flg) 875 { 876 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 877 878 PetscFunctionBegin; 879 switch (op) { 880 case MAT_ROW_ORIENTED: 881 a->roworiented = flg; 882 break; 883 case MAT_KEEP_NONZERO_PATTERN: 884 a->keepnonzeropattern = flg; 885 break; 886 case MAT_NEW_NONZERO_LOCATIONS: 887 a->nonew = (flg ? 0 : 1); 888 break; 889 case MAT_NEW_NONZERO_LOCATION_ERR: 890 a->nonew = (flg ? -1 : 0); 891 break; 892 case MAT_NEW_NONZERO_ALLOCATION_ERR: 893 a->nonew = (flg ? -2 : 0); 894 break; 895 case MAT_UNUSED_NONZERO_LOCATION_ERR: 896 a->nounused = (flg ? -1 : 0); 897 break; 898 case MAT_FORCE_DIAGONAL_ENTRIES: 899 case MAT_IGNORE_OFF_PROC_ENTRIES: 900 case MAT_USE_HASH_TABLE: 901 case MAT_SORTED_FULL: 902 PetscCall(PetscInfo(A,"Option %s ignored\n",MatOptions[op])); 903 break; 904 case MAT_SPD: 905 case MAT_SYMMETRIC: 906 case MAT_STRUCTURALLY_SYMMETRIC: 907 case MAT_HERMITIAN: 908 case MAT_SYMMETRY_ETERNAL: 909 case MAT_STRUCTURAL_SYMMETRY_ETERNAL: 910 case MAT_SPD_ETERNAL: 911 /* These options are handled directly by MatSetOption() */ 912 break; 913 default: 914 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unknown option %d",op); 915 } 916 PetscFunctionReturn(0); 917 } 918 919 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A,Vec v) 920 { 921 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 922 PetscInt i,j,n,shift; 923 PetscScalar *x,zero=0.0; 924 925 PetscFunctionBegin; 926 PetscCall(VecGetLocalSize(v,&n)); 927 PetscCheck(n == A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 928 929 if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) { 930 PetscInt *diag=a->diag; 931 PetscCall(VecGetArray(v,&x)); 932 for (i=0; i<n; i++) x[i] = 1.0/a->val[diag[i]]; 933 PetscCall(VecRestoreArray(v,&x)); 934 PetscFunctionReturn(0); 935 } 936 937 PetscCall(VecSet(v,zero)); 938 PetscCall(VecGetArray(v,&x)); 939 for (i=0; i<n; i++) { /* loop over rows */ 940 shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */ 941 x[i] = 0; 942 for (j=0; j<a->rlen[i]; j++) { 943 if (a->colidx[shift+j*8] == i) { 944 x[i] = a->val[shift+j*8]; 945 break; 946 } 947 } 948 } 949 PetscCall(VecRestoreArray(v,&x)); 950 PetscFunctionReturn(0); 951 } 952 953 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A,Vec ll,Vec rr) 954 { 955 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 956 const PetscScalar *l,*r; 957 PetscInt i,j,m,n,row; 958 959 PetscFunctionBegin; 960 if (ll) { 961 /* The local size is used so that VecMPI can be passed to this routine 962 by MatDiagonalScale_MPISELL */ 963 PetscCall(VecGetLocalSize(ll,&m)); 964 PetscCheck(m == A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 965 PetscCall(VecGetArrayRead(ll,&l)); 966 for (i=0; i<a->totalslices; i++) { /* loop over slices */ 967 if (i == a->totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */ 968 for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) { 969 if (row < (A->rmap->n & 0x07)) a->val[j] *= l[8*i+row]; 970 } 971 } else { 972 for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) { 973 a->val[j] *= l[8*i+row]; 974 } 975 } 976 } 977 PetscCall(VecRestoreArrayRead(ll,&l)); 978 PetscCall(PetscLogFlops(a->nz)); 979 } 980 if (rr) { 981 PetscCall(VecGetLocalSize(rr,&n)); 982 PetscCheck(n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length"); 983 PetscCall(VecGetArrayRead(rr,&r)); 984 for (i=0; i<a->totalslices; i++) { /* loop over slices */ 985 if (i == a->totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */ 986 for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) { 987 if (row < (A->rmap->n & 0x07)) a->val[j] *= r[a->colidx[j]]; 988 } 989 } else { 990 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j++) { 991 a->val[j] *= r[a->colidx[j]]; 992 } 993 } 994 } 995 PetscCall(VecRestoreArrayRead(rr,&r)); 996 PetscCall(PetscLogFlops(a->nz)); 997 } 998 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 999 PetscFunctionReturn(0); 1000 } 1001 1002 PetscErrorCode MatGetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],PetscScalar v[]) 1003 { 1004 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1005 PetscInt *cp,i,k,low,high,t,row,col,l; 1006 PetscInt shift; 1007 MatScalar *vp; 1008 1009 PetscFunctionBegin; 1010 for (k=0; k<m; k++) { /* loop over requested rows */ 1011 row = im[k]; 1012 if (row<0) continue; 1013 PetscCheck(row < A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT,row,A->rmap->n-1); 1014 shift = a->sliidx[row>>3]+(row&0x07); /* starting index of the row */ 1015 cp = a->colidx+shift; /* pointer to the row */ 1016 vp = a->val+shift; /* pointer to the row */ 1017 for (l=0; l<n; l++) { /* loop over requested columns */ 1018 col = in[l]; 1019 if (col<0) continue; 1020 PetscCheck(col < A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT,col,A->cmap->n-1); 1021 high = a->rlen[row]; low = 0; /* assume unsorted */ 1022 while (high-low > 5) { 1023 t = (low+high)/2; 1024 if (*(cp+t*8) > col) high = t; 1025 else low = t; 1026 } 1027 for (i=low; i<high; i++) { 1028 if (*(cp+8*i) > col) break; 1029 if (*(cp+8*i) == col) { 1030 *v++ = *(vp+8*i); 1031 goto finished; 1032 } 1033 } 1034 *v++ = 0.0; 1035 finished:; 1036 } 1037 } 1038 PetscFunctionReturn(0); 1039 } 1040 1041 PetscErrorCode MatView_SeqSELL_ASCII(Mat A,PetscViewer viewer) 1042 { 1043 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1044 PetscInt i,j,m=A->rmap->n,shift; 1045 const char *name; 1046 PetscViewerFormat format; 1047 1048 PetscFunctionBegin; 1049 PetscCall(PetscViewerGetFormat(viewer,&format)); 1050 if (format == PETSC_VIEWER_ASCII_MATLAB) { 1051 PetscInt nofinalvalue = 0; 1052 /* 1053 if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) { 1054 nofinalvalue = 1; 1055 } 1056 */ 1057 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE)); 1058 PetscCall(PetscViewerASCIIPrintf(viewer,"%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n",m,A->cmap->n)); 1059 PetscCall(PetscViewerASCIIPrintf(viewer,"%% Nonzeros = %" PetscInt_FMT " \n",a->nz)); 1060 #if defined(PETSC_USE_COMPLEX) 1061 PetscCall(PetscViewerASCIIPrintf(viewer,"zzz = zeros(%" PetscInt_FMT ",4);\n",a->nz+nofinalvalue)); 1062 #else 1063 PetscCall(PetscViewerASCIIPrintf(viewer,"zzz = zeros(%" PetscInt_FMT ",3);\n",a->nz+nofinalvalue)); 1064 #endif 1065 PetscCall(PetscViewerASCIIPrintf(viewer,"zzz = [\n")); 1066 1067 for (i=0; i<m; i++) { 1068 shift = a->sliidx[i>>3]+(i&0x07); 1069 for (j=0; j<a->rlen[i]; j++) { 1070 #if defined(PETSC_USE_COMPLEX) 1071 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n",i+1,a->colidx[shift+8*j]+1,(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]))); 1072 #else 1073 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n",i+1,a->colidx[shift+8*j]+1,(double)a->val[shift+8*j])); 1074 #endif 1075 } 1076 } 1077 /* 1078 if (nofinalvalue) { 1079 #if defined(PETSC_USE_COMPLEX) 1080 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n",m,A->cmap->n,0.,0.)); 1081 #else 1082 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n",m,A->cmap->n,0.0)); 1083 #endif 1084 } 1085 */ 1086 PetscCall(PetscObjectGetName((PetscObject)A,&name)); 1087 PetscCall(PetscViewerASCIIPrintf(viewer,"];\n %s = spconvert(zzz);\n",name)); 1088 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE)); 1089 } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) { 1090 PetscFunctionReturn(0); 1091 } else if (format == PETSC_VIEWER_ASCII_COMMON) { 1092 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE)); 1093 for (i=0; i<m; i++) { 1094 PetscCall(PetscViewerASCIIPrintf(viewer,"row %" PetscInt_FMT ":",i)); 1095 shift = a->sliidx[i>>3]+(i&0x07); 1096 for (j=0; j<a->rlen[i]; j++) { 1097 #if defined(PETSC_USE_COMPLEX) 1098 if (PetscImaginaryPart(a->val[shift+8*j]) > 0.0 && PetscRealPart(a->val[shift+8*j]) != 0.0) { 1099 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]))); 1100 } else if (PetscImaginaryPart(a->val[shift+8*j]) < 0.0 && PetscRealPart(a->val[shift+8*j]) != 0.0) { 1101 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)-PetscImaginaryPart(a->val[shift+8*j]))); 1102 } else if (PetscRealPart(a->val[shift+8*j]) != 0.0) { 1103 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]))); 1104 } 1105 #else 1106 if (a->val[shift+8*j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)a->val[shift+8*j])); 1107 #endif 1108 } 1109 PetscCall(PetscViewerASCIIPrintf(viewer,"\n")); 1110 } 1111 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE)); 1112 } else if (format == PETSC_VIEWER_ASCII_DENSE) { 1113 PetscInt cnt=0,jcnt; 1114 PetscScalar value; 1115 #if defined(PETSC_USE_COMPLEX) 1116 PetscBool realonly=PETSC_TRUE; 1117 for (i=0; i<a->sliidx[a->totalslices]; i++) { 1118 if (PetscImaginaryPart(a->val[i]) != 0.0) { 1119 realonly = PETSC_FALSE; 1120 break; 1121 } 1122 } 1123 #endif 1124 1125 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE)); 1126 for (i=0; i<m; i++) { 1127 jcnt = 0; 1128 shift = a->sliidx[i>>3]+(i&0x07); 1129 for (j=0; j<A->cmap->n; j++) { 1130 if (jcnt < a->rlen[i] && j == a->colidx[shift+8*j]) { 1131 value = a->val[cnt++]; 1132 jcnt++; 1133 } else { 1134 value = 0.0; 1135 } 1136 #if defined(PETSC_USE_COMPLEX) 1137 if (realonly) { 1138 PetscCall(PetscViewerASCIIPrintf(viewer," %7.5e ",(double)PetscRealPart(value))); 1139 } else { 1140 PetscCall(PetscViewerASCIIPrintf(viewer," %7.5e+%7.5e i ",(double)PetscRealPart(value),(double)PetscImaginaryPart(value))); 1141 } 1142 #else 1143 PetscCall(PetscViewerASCIIPrintf(viewer," %7.5e ",(double)value)); 1144 #endif 1145 } 1146 PetscCall(PetscViewerASCIIPrintf(viewer,"\n")); 1147 } 1148 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE)); 1149 } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) { 1150 PetscInt fshift=1; 1151 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE)); 1152 #if defined(PETSC_USE_COMPLEX) 1153 PetscCall(PetscViewerASCIIPrintf(viewer,"%%%%MatrixMarket matrix coordinate complex general\n")); 1154 #else 1155 PetscCall(PetscViewerASCIIPrintf(viewer,"%%%%MatrixMarket matrix coordinate real general\n")); 1156 #endif 1157 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz)); 1158 for (i=0; i<m; i++) { 1159 shift = a->sliidx[i>>3]+(i&0x07); 1160 for (j=0; j<a->rlen[i]; j++) { 1161 #if defined(PETSC_USE_COMPLEX) 1162 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n",i+fshift,a->colidx[shift+8*j]+fshift,(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]))); 1163 #else 1164 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %g\n",i+fshift,a->colidx[shift+8*j]+fshift,(double)a->val[shift+8*j])); 1165 #endif 1166 } 1167 } 1168 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE)); 1169 } else if (format == PETSC_VIEWER_NATIVE) { 1170 for (i=0; i<a->totalslices; i++) { /* loop over slices */ 1171 PetscInt row; 1172 PetscCall(PetscViewerASCIIPrintf(viewer,"slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n",i,a->sliidx[i],a->sliidx[i+1])); 1173 for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) { 1174 #if defined(PETSC_USE_COMPLEX) 1175 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1176 PetscCall(PetscViewerASCIIPrintf(viewer," %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]))); 1177 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1178 PetscCall(PetscViewerASCIIPrintf(viewer," %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]),-(double)PetscImaginaryPart(a->val[j]))); 1179 } else { 1180 PetscCall(PetscViewerASCIIPrintf(viewer," %" PetscInt_FMT " %" PetscInt_FMT " %g\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]))); 1181 } 1182 #else 1183 PetscCall(PetscViewerASCIIPrintf(viewer," %" PetscInt_FMT " %" PetscInt_FMT " %g\n",8*i+row,a->colidx[j],(double)a->val[j])); 1184 #endif 1185 } 1186 } 1187 } else { 1188 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE)); 1189 if (A->factortype) { 1190 for (i=0; i<m; i++) { 1191 shift = a->sliidx[i>>3]+(i&0x07); 1192 PetscCall(PetscViewerASCIIPrintf(viewer,"row %" PetscInt_FMT ":",i)); 1193 /* L part */ 1194 for (j=shift; j<a->diag[i]; j+=8) { 1195 #if defined(PETSC_USE_COMPLEX) 1196 if (PetscImaginaryPart(a->val[shift+8*j]) > 0.0) { 1197 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]))); 1198 } else if (PetscImaginaryPart(a->val[shift+8*j]) < 0.0) { 1199 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)(-PetscImaginaryPart(a->val[j])))); 1200 } else { 1201 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)PetscRealPart(a->val[j]))); 1202 } 1203 #else 1204 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)a->val[j])); 1205 #endif 1206 } 1207 /* diagonal */ 1208 j = a->diag[i]; 1209 #if defined(PETSC_USE_COMPLEX) 1210 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1211 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]),(double)PetscImaginaryPart(1.0/a->val[j]))); 1212 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1213 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]),(double)(-PetscImaginaryPart(1.0/a->val[j])))); 1214 } else { 1215 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]))); 1216 } 1217 #else 1218 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)(1.0/a->val[j]))); 1219 #endif 1220 1221 /* U part */ 1222 for (j=a->diag[i]+1; j<shift+8*a->rlen[i]; j+=8) { 1223 #if defined(PETSC_USE_COMPLEX) 1224 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1225 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]))); 1226 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1227 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)(-PetscImaginaryPart(a->val[j])))); 1228 } else { 1229 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)PetscRealPart(a->val[j]))); 1230 } 1231 #else 1232 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)a->val[j])); 1233 #endif 1234 } 1235 PetscCall(PetscViewerASCIIPrintf(viewer,"\n")); 1236 } 1237 } else { 1238 for (i=0; i<m; i++) { 1239 shift = a->sliidx[i>>3]+(i&0x07); 1240 PetscCall(PetscViewerASCIIPrintf(viewer,"row %" PetscInt_FMT ":",i)); 1241 for (j=0; j<a->rlen[i]; j++) { 1242 #if defined(PETSC_USE_COMPLEX) 1243 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1244 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]))); 1245 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1246 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)-PetscImaginaryPart(a->val[shift+8*j]))); 1247 } else { 1248 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]))); 1249 } 1250 #else 1251 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)a->val[shift+8*j])); 1252 #endif 1253 } 1254 PetscCall(PetscViewerASCIIPrintf(viewer,"\n")); 1255 } 1256 } 1257 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE)); 1258 } 1259 PetscCall(PetscViewerFlush(viewer)); 1260 PetscFunctionReturn(0); 1261 } 1262 1263 #include <petscdraw.h> 1264 PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw,void *Aa) 1265 { 1266 Mat A=(Mat)Aa; 1267 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1268 PetscInt i,j,m=A->rmap->n,shift; 1269 int color; 1270 PetscReal xl,yl,xr,yr,x_l,x_r,y_l,y_r; 1271 PetscViewer viewer; 1272 PetscViewerFormat format; 1273 1274 PetscFunctionBegin; 1275 PetscCall(PetscObjectQuery((PetscObject)A,"Zoomviewer",(PetscObject*)&viewer)); 1276 PetscCall(PetscViewerGetFormat(viewer,&format)); 1277 PetscCall(PetscDrawGetCoordinates(draw,&xl,&yl,&xr,&yr)); 1278 1279 /* loop over matrix elements drawing boxes */ 1280 1281 if (format != PETSC_VIEWER_DRAW_CONTOUR) { 1282 PetscDrawCollectiveBegin(draw); 1283 /* Blue for negative, Cyan for zero and Red for positive */ 1284 color = PETSC_DRAW_BLUE; 1285 for (i=0; i<m; i++) { 1286 shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */ 1287 y_l = m - i - 1.0; y_r = y_l + 1.0; 1288 for (j=0; j<a->rlen[i]; j++) { 1289 x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0; 1290 if (PetscRealPart(a->val[shift+8*j]) >= 0.) continue; 1291 PetscCall(PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color)); 1292 } 1293 } 1294 color = PETSC_DRAW_CYAN; 1295 for (i=0; i<m; i++) { 1296 shift = a->sliidx[i>>3]+(i&0x07); 1297 y_l = m - i - 1.0; y_r = y_l + 1.0; 1298 for (j=0; j<a->rlen[i]; j++) { 1299 x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0; 1300 if (a->val[shift+8*j] != 0.) continue; 1301 PetscCall(PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color)); 1302 } 1303 } 1304 color = PETSC_DRAW_RED; 1305 for (i=0; i<m; i++) { 1306 shift = a->sliidx[i>>3]+(i&0x07); 1307 y_l = m - i - 1.0; y_r = y_l + 1.0; 1308 for (j=0; j<a->rlen[i]; j++) { 1309 x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0; 1310 if (PetscRealPart(a->val[shift+8*j]) <= 0.) continue; 1311 PetscCall(PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color)); 1312 } 1313 } 1314 PetscDrawCollectiveEnd(draw); 1315 } else { 1316 /* use contour shading to indicate magnitude of values */ 1317 /* first determine max of all nonzero values */ 1318 PetscReal minv=0.0,maxv=0.0; 1319 PetscInt count=0; 1320 PetscDraw popup; 1321 for (i=0; i<a->sliidx[a->totalslices]; i++) { 1322 if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]); 1323 } 1324 if (minv >= maxv) maxv = minv + PETSC_SMALL; 1325 PetscCall(PetscDrawGetPopup(draw,&popup)); 1326 PetscCall(PetscDrawScalePopup(popup,minv,maxv)); 1327 1328 PetscDrawCollectiveBegin(draw); 1329 for (i=0; i<m; i++) { 1330 shift = a->sliidx[i>>3]+(i&0x07); 1331 y_l = m - i - 1.0; 1332 y_r = y_l + 1.0; 1333 for (j=0; j<a->rlen[i]; j++) { 1334 x_l = a->colidx[shift+j*8]; 1335 x_r = x_l + 1.0; 1336 color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]),minv,maxv); 1337 PetscCall(PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color)); 1338 count++; 1339 } 1340 } 1341 PetscDrawCollectiveEnd(draw); 1342 } 1343 PetscFunctionReturn(0); 1344 } 1345 1346 #include <petscdraw.h> 1347 PetscErrorCode MatView_SeqSELL_Draw(Mat A,PetscViewer viewer) 1348 { 1349 PetscDraw draw; 1350 PetscReal xr,yr,xl,yl,h,w; 1351 PetscBool isnull; 1352 1353 PetscFunctionBegin; 1354 PetscCall(PetscViewerDrawGetDraw(viewer,0,&draw)); 1355 PetscCall(PetscDrawIsNull(draw,&isnull)); 1356 if (isnull) PetscFunctionReturn(0); 1357 1358 xr = A->cmap->n; yr = A->rmap->n; h = yr/10.0; w = xr/10.0; 1359 xr += w; yr += h; xl = -w; yl = -h; 1360 PetscCall(PetscDrawSetCoordinates(draw,xl,yl,xr,yr)); 1361 PetscCall(PetscObjectCompose((PetscObject)A,"Zoomviewer",(PetscObject)viewer)); 1362 PetscCall(PetscDrawZoom(draw,MatView_SeqSELL_Draw_Zoom,A)); 1363 PetscCall(PetscObjectCompose((PetscObject)A,"Zoomviewer",NULL)); 1364 PetscCall(PetscDrawSave(draw)); 1365 PetscFunctionReturn(0); 1366 } 1367 1368 PetscErrorCode MatView_SeqSELL(Mat A,PetscViewer viewer) 1369 { 1370 PetscBool iascii,isbinary,isdraw; 1371 1372 PetscFunctionBegin; 1373 PetscCall(PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii)); 1374 PetscCall(PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERBINARY,&isbinary)); 1375 PetscCall(PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERDRAW,&isdraw)); 1376 if (iascii) { 1377 PetscCall(MatView_SeqSELL_ASCII(A,viewer)); 1378 } else if (isbinary) { 1379 /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */ 1380 } else if (isdraw) PetscCall(MatView_SeqSELL_Draw(A,viewer)); 1381 PetscFunctionReturn(0); 1382 } 1383 1384 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A,MatAssemblyType mode) 1385 { 1386 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1387 PetscInt i,shift,row_in_slice,row,nrow,*cp,lastcol,j,k; 1388 MatScalar *vp; 1389 1390 PetscFunctionBegin; 1391 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 1392 /* To do: compress out the unused elements */ 1393 PetscCall(MatMarkDiagonal_SeqSELL(A)); 1394 PetscCall(PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n",A->rmap->n,A->cmap->n,a->maxallocmat,a->sliidx[a->totalslices],a->nz,a->sliidx[a->totalslices]-a->nz)); 1395 PetscCall(PetscInfo(A,"Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n",a->reallocs)); 1396 PetscCall(PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rlenmax)); 1397 /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */ 1398 for (i=0; i<a->totalslices; ++i) { 1399 shift = a->sliidx[i]; /* starting index of the slice */ 1400 cp = a->colidx+shift; /* pointer to the column indices of the slice */ 1401 vp = a->val+shift; /* pointer to the nonzero values of the slice */ 1402 for (row_in_slice=0; row_in_slice<8; ++row_in_slice) { /* loop over rows in the slice */ 1403 row = 8*i + row_in_slice; 1404 nrow = a->rlen[row]; /* number of nonzeros in row */ 1405 /* 1406 Search for the nearest nonzero. Normally setting the index to zero may cause extra communication. 1407 But if the entire slice are empty, it is fine to use 0 since the index will not be loaded. 1408 */ 1409 lastcol = 0; 1410 if (nrow>0) { /* nonempty row */ 1411 lastcol = cp[8*(nrow-1)+row_in_slice]; /* use the index from the last nonzero at current row */ 1412 } else if (!row_in_slice) { /* first row of the currect slice is empty */ 1413 for (j=1;j<8;j++) { 1414 if (a->rlen[8*i+j]) { 1415 lastcol = cp[j]; 1416 break; 1417 } 1418 } 1419 } else { 1420 if (a->sliidx[i+1] != shift) lastcol = cp[row_in_slice-1]; /* use the index from the previous row */ 1421 } 1422 1423 for (k=nrow; k<(a->sliidx[i+1]-shift)/8; ++k) { 1424 cp[8*k+row_in_slice] = lastcol; 1425 vp[8*k+row_in_slice] = (MatScalar)0; 1426 } 1427 } 1428 } 1429 1430 A->info.mallocs += a->reallocs; 1431 a->reallocs = 0; 1432 1433 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 1434 PetscFunctionReturn(0); 1435 } 1436 1437 PetscErrorCode MatGetInfo_SeqSELL(Mat A,MatInfoType flag,MatInfo *info) 1438 { 1439 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1440 1441 PetscFunctionBegin; 1442 info->block_size = 1.0; 1443 info->nz_allocated = a->maxallocmat; 1444 info->nz_used = a->sliidx[a->totalslices]; /* include padding zeros */ 1445 info->nz_unneeded = (a->maxallocmat-a->sliidx[a->totalslices]); 1446 info->assemblies = A->num_ass; 1447 info->mallocs = A->info.mallocs; 1448 info->memory = ((PetscObject)A)->mem; 1449 if (A->factortype) { 1450 info->fill_ratio_given = A->info.fill_ratio_given; 1451 info->fill_ratio_needed = A->info.fill_ratio_needed; 1452 info->factor_mallocs = A->info.factor_mallocs; 1453 } else { 1454 info->fill_ratio_given = 0; 1455 info->fill_ratio_needed = 0; 1456 info->factor_mallocs = 0; 1457 } 1458 PetscFunctionReturn(0); 1459 } 1460 1461 PetscErrorCode MatSetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is) 1462 { 1463 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1464 PetscInt shift,i,k,l,low,high,t,ii,row,col,nrow; 1465 PetscInt *cp,nonew=a->nonew,lastcol=-1; 1466 MatScalar *vp,value; 1467 1468 PetscFunctionBegin; 1469 for (k=0; k<m; k++) { /* loop over added rows */ 1470 row = im[k]; 1471 if (row < 0) continue; 1472 PetscCheck(row < A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT,row,A->rmap->n-1); 1473 shift = a->sliidx[row>>3]+(row&0x07); /* starting index of the row */ 1474 cp = a->colidx+shift; /* pointer to the row */ 1475 vp = a->val+shift; /* pointer to the row */ 1476 nrow = a->rlen[row]; 1477 low = 0; 1478 high = nrow; 1479 1480 for (l=0; l<n; l++) { /* loop over added columns */ 1481 col = in[l]; 1482 if (col<0) continue; 1483 PetscCheck(col < A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT,col,A->cmap->n-1); 1484 if (a->roworiented) { 1485 value = v[l+k*n]; 1486 } else { 1487 value = v[k+l*m]; 1488 } 1489 if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue; 1490 1491 /* search in this row for the specified column, i indicates the column to be set */ 1492 if (col <= lastcol) low = 0; 1493 else high = nrow; 1494 lastcol = col; 1495 while (high-low > 5) { 1496 t = (low+high)/2; 1497 if (*(cp+t*8) > col) high = t; 1498 else low = t; 1499 } 1500 for (i=low; i<high; i++) { 1501 if (*(cp+i*8) > col) break; 1502 if (*(cp+i*8) == col) { 1503 if (is == ADD_VALUES) *(vp+i*8) += value; 1504 else *(vp+i*8) = value; 1505 low = i + 1; 1506 goto noinsert; 1507 } 1508 } 1509 if (value == 0.0 && a->ignorezeroentries) goto noinsert; 1510 if (nonew == 1) goto noinsert; 1511 PetscCheck(nonew != -1,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col); 1512 /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */ 1513 MatSeqXSELLReallocateSELL(A,A->rmap->n,1,nrow,a->sliidx,row/8,row,col,a->colidx,a->val,cp,vp,nonew,MatScalar); 1514 /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */ 1515 for (ii=nrow-1; ii>=i; ii--) { 1516 *(cp+(ii+1)*8) = *(cp+ii*8); 1517 *(vp+(ii+1)*8) = *(vp+ii*8); 1518 } 1519 a->rlen[row]++; 1520 *(cp+i*8) = col; 1521 *(vp+i*8) = value; 1522 a->nz++; 1523 A->nonzerostate++; 1524 low = i+1; high++; nrow++; 1525 noinsert:; 1526 } 1527 a->rlen[row] = nrow; 1528 } 1529 PetscFunctionReturn(0); 1530 } 1531 1532 PetscErrorCode MatCopy_SeqSELL(Mat A,Mat B,MatStructure str) 1533 { 1534 PetscFunctionBegin; 1535 /* If the two matrices have the same copy implementation, use fast copy. */ 1536 if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) { 1537 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1538 Mat_SeqSELL *b=(Mat_SeqSELL*)B->data; 1539 1540 PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices],PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Number of nonzeros in two matrices are different"); 1541 PetscCall(PetscArraycpy(b->val,a->val,a->sliidx[a->totalslices])); 1542 } else { 1543 PetscCall(MatCopy_Basic(A,B,str)); 1544 } 1545 PetscFunctionReturn(0); 1546 } 1547 1548 PetscErrorCode MatSetUp_SeqSELL(Mat A) 1549 { 1550 PetscFunctionBegin; 1551 PetscCall(MatSeqSELLSetPreallocation(A,PETSC_DEFAULT,NULL)); 1552 PetscFunctionReturn(0); 1553 } 1554 1555 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A,PetscScalar *array[]) 1556 { 1557 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1558 1559 PetscFunctionBegin; 1560 *array = a->val; 1561 PetscFunctionReturn(0); 1562 } 1563 1564 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A,PetscScalar *array[]) 1565 { 1566 PetscFunctionBegin; 1567 PetscFunctionReturn(0); 1568 } 1569 1570 PetscErrorCode MatRealPart_SeqSELL(Mat A) 1571 { 1572 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1573 PetscInt i; 1574 MatScalar *aval=a->val; 1575 1576 PetscFunctionBegin; 1577 for (i=0; i<a->sliidx[a->totalslices]; i++) aval[i]=PetscRealPart(aval[i]); 1578 PetscFunctionReturn(0); 1579 } 1580 1581 PetscErrorCode MatImaginaryPart_SeqSELL(Mat A) 1582 { 1583 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1584 PetscInt i; 1585 MatScalar *aval=a->val; 1586 1587 PetscFunctionBegin; 1588 for (i=0; i<a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]); 1589 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 1590 PetscFunctionReturn(0); 1591 } 1592 1593 PetscErrorCode MatScale_SeqSELL(Mat inA,PetscScalar alpha) 1594 { 1595 Mat_SeqSELL *a=(Mat_SeqSELL*)inA->data; 1596 MatScalar *aval=a->val; 1597 PetscScalar oalpha=alpha; 1598 PetscBLASInt one=1,size; 1599 1600 PetscFunctionBegin; 1601 PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices],&size)); 1602 PetscStackCallBLAS("BLASscal",BLASscal_(&size,&oalpha,aval,&one)); 1603 PetscCall(PetscLogFlops(a->nz)); 1604 PetscCall(MatSeqSELLInvalidateDiagonal(inA)); 1605 PetscFunctionReturn(0); 1606 } 1607 1608 PetscErrorCode MatShift_SeqSELL(Mat Y,PetscScalar a) 1609 { 1610 Mat_SeqSELL *y=(Mat_SeqSELL*)Y->data; 1611 1612 PetscFunctionBegin; 1613 if (!Y->preallocated || !y->nz) { 1614 PetscCall(MatSeqSELLSetPreallocation(Y,1,NULL)); 1615 } 1616 PetscCall(MatShift_Basic(Y,a)); 1617 PetscFunctionReturn(0); 1618 } 1619 1620 PetscErrorCode MatSOR_SeqSELL(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 1621 { 1622 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1623 PetscScalar *x,sum,*t; 1624 const MatScalar *idiag=NULL,*mdiag; 1625 const PetscScalar *b,*xb; 1626 PetscInt n,m=A->rmap->n,i,j,shift; 1627 const PetscInt *diag; 1628 1629 PetscFunctionBegin; 1630 its = its*lits; 1631 1632 if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */ 1633 if (!a->idiagvalid) PetscCall(MatInvertDiagonal_SeqSELL(A,omega,fshift)); 1634 a->fshift = fshift; 1635 a->omega = omega; 1636 1637 diag = a->diag; 1638 t = a->ssor_work; 1639 idiag = a->idiag; 1640 mdiag = a->mdiag; 1641 1642 PetscCall(VecGetArray(xx,&x)); 1643 PetscCall(VecGetArrayRead(bb,&b)); 1644 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 1645 PetscCheck(flag != SOR_APPLY_UPPER,PETSC_COMM_SELF,PETSC_ERR_SUP,"SOR_APPLY_UPPER is not implemented"); 1646 PetscCheck(flag != SOR_APPLY_LOWER,PETSC_COMM_SELF,PETSC_ERR_SUP,"SOR_APPLY_LOWER is not implemented"); 1647 PetscCheck(!(flag & SOR_EISENSTAT),PETSC_COMM_SELF,PETSC_ERR_SUP,"No support yet for Eisenstat"); 1648 1649 if (flag & SOR_ZERO_INITIAL_GUESS) { 1650 if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) { 1651 for (i=0; i<m; i++) { 1652 shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */ 1653 sum = b[i]; 1654 n = (diag[i]-shift)/8; 1655 for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]]; 1656 t[i] = sum; 1657 x[i] = sum*idiag[i]; 1658 } 1659 xb = t; 1660 PetscCall(PetscLogFlops(a->nz)); 1661 } else xb = b; 1662 if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1663 for (i=m-1; i>=0; i--) { 1664 shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */ 1665 sum = xb[i]; 1666 n = a->rlen[i]-(diag[i]-shift)/8-1; 1667 for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]]; 1668 if (xb == b) { 1669 x[i] = sum*idiag[i]; 1670 } else { 1671 x[i] = (1.-omega)*x[i]+sum*idiag[i]; /* omega in idiag */ 1672 } 1673 } 1674 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */ 1675 } 1676 its--; 1677 } 1678 while (its--) { 1679 if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) { 1680 for (i=0; i<m; i++) { 1681 /* lower */ 1682 shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */ 1683 sum = b[i]; 1684 n = (diag[i]-shift)/8; 1685 for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]]; 1686 t[i] = sum; /* save application of the lower-triangular part */ 1687 /* upper */ 1688 n = a->rlen[i]-(diag[i]-shift)/8-1; 1689 for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]]; 1690 x[i] = (1.-omega)*x[i]+sum*idiag[i]; /* omega in idiag */ 1691 } 1692 xb = t; 1693 PetscCall(PetscLogFlops(2.0*a->nz)); 1694 } else xb = b; 1695 if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1696 for (i=m-1; i>=0; i--) { 1697 shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */ 1698 sum = xb[i]; 1699 if (xb == b) { 1700 /* whole matrix (no checkpointing available) */ 1701 n = a->rlen[i]; 1702 for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]]; 1703 x[i] = (1.-omega)*x[i]+(sum+mdiag[i]*x[i])*idiag[i]; 1704 } else { /* lower-triangular part has been saved, so only apply upper-triangular */ 1705 n = a->rlen[i]-(diag[i]-shift)/8-1; 1706 for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]]; 1707 x[i] = (1.-omega)*x[i]+sum*idiag[i]; /* omega in idiag */ 1708 } 1709 } 1710 if (xb == b) { 1711 PetscCall(PetscLogFlops(2.0*a->nz)); 1712 } else { 1713 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */ 1714 } 1715 } 1716 } 1717 PetscCall(VecRestoreArray(xx,&x)); 1718 PetscCall(VecRestoreArrayRead(bb,&b)); 1719 PetscFunctionReturn(0); 1720 } 1721 1722 /* -------------------------------------------------------------------*/ 1723 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL, 1724 MatGetRow_SeqSELL, 1725 MatRestoreRow_SeqSELL, 1726 MatMult_SeqSELL, 1727 /* 4*/ MatMultAdd_SeqSELL, 1728 MatMultTranspose_SeqSELL, 1729 MatMultTransposeAdd_SeqSELL, 1730 NULL, 1731 NULL, 1732 NULL, 1733 /* 10*/ NULL, 1734 NULL, 1735 NULL, 1736 MatSOR_SeqSELL, 1737 NULL, 1738 /* 15*/ MatGetInfo_SeqSELL, 1739 MatEqual_SeqSELL, 1740 MatGetDiagonal_SeqSELL, 1741 MatDiagonalScale_SeqSELL, 1742 NULL, 1743 /* 20*/ NULL, 1744 MatAssemblyEnd_SeqSELL, 1745 MatSetOption_SeqSELL, 1746 MatZeroEntries_SeqSELL, 1747 /* 24*/ NULL, 1748 NULL, 1749 NULL, 1750 NULL, 1751 NULL, 1752 /* 29*/ MatSetUp_SeqSELL, 1753 NULL, 1754 NULL, 1755 NULL, 1756 NULL, 1757 /* 34*/ MatDuplicate_SeqSELL, 1758 NULL, 1759 NULL, 1760 NULL, 1761 NULL, 1762 /* 39*/ NULL, 1763 NULL, 1764 NULL, 1765 MatGetValues_SeqSELL, 1766 MatCopy_SeqSELL, 1767 /* 44*/ NULL, 1768 MatScale_SeqSELL, 1769 MatShift_SeqSELL, 1770 NULL, 1771 NULL, 1772 /* 49*/ NULL, 1773 NULL, 1774 NULL, 1775 NULL, 1776 NULL, 1777 /* 54*/ MatFDColoringCreate_SeqXAIJ, 1778 NULL, 1779 NULL, 1780 NULL, 1781 NULL, 1782 /* 59*/ NULL, 1783 MatDestroy_SeqSELL, 1784 MatView_SeqSELL, 1785 NULL, 1786 NULL, 1787 /* 64*/ NULL, 1788 NULL, 1789 NULL, 1790 NULL, 1791 NULL, 1792 /* 69*/ NULL, 1793 NULL, 1794 NULL, 1795 NULL, 1796 NULL, 1797 /* 74*/ NULL, 1798 MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */ 1799 NULL, 1800 NULL, 1801 NULL, 1802 /* 79*/ NULL, 1803 NULL, 1804 NULL, 1805 NULL, 1806 NULL, 1807 /* 84*/ NULL, 1808 NULL, 1809 NULL, 1810 NULL, 1811 NULL, 1812 /* 89*/ NULL, 1813 NULL, 1814 NULL, 1815 NULL, 1816 NULL, 1817 /* 94*/ NULL, 1818 NULL, 1819 NULL, 1820 NULL, 1821 NULL, 1822 /* 99*/ NULL, 1823 NULL, 1824 NULL, 1825 MatConjugate_SeqSELL, 1826 NULL, 1827 /*104*/ NULL, 1828 NULL, 1829 NULL, 1830 NULL, 1831 NULL, 1832 /*109*/ NULL, 1833 NULL, 1834 NULL, 1835 NULL, 1836 MatMissingDiagonal_SeqSELL, 1837 /*114*/ NULL, 1838 NULL, 1839 NULL, 1840 NULL, 1841 NULL, 1842 /*119*/ NULL, 1843 NULL, 1844 NULL, 1845 NULL, 1846 NULL, 1847 /*124*/ NULL, 1848 NULL, 1849 NULL, 1850 NULL, 1851 NULL, 1852 /*129*/ NULL, 1853 NULL, 1854 NULL, 1855 NULL, 1856 NULL, 1857 /*134*/ NULL, 1858 NULL, 1859 NULL, 1860 NULL, 1861 NULL, 1862 /*139*/ NULL, 1863 NULL, 1864 NULL, 1865 MatFDColoringSetUp_SeqXAIJ, 1866 NULL, 1867 /*144*/ NULL, 1868 NULL, 1869 NULL, 1870 NULL, 1871 NULL, 1872 NULL 1873 }; 1874 1875 PetscErrorCode MatStoreValues_SeqSELL(Mat mat) 1876 { 1877 Mat_SeqSELL *a=(Mat_SeqSELL*)mat->data; 1878 1879 PetscFunctionBegin; 1880 PetscCheck(a->nonew,PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first"); 1881 1882 /* allocate space for values if not already there */ 1883 if (!a->saved_values) { 1884 PetscCall(PetscMalloc1(a->sliidx[a->totalslices]+1,&a->saved_values)); 1885 PetscCall(PetscLogObjectMemory((PetscObject)mat,(a->sliidx[a->totalslices]+1)*sizeof(PetscScalar))); 1886 } 1887 1888 /* copy values over */ 1889 PetscCall(PetscArraycpy(a->saved_values,a->val,a->sliidx[a->totalslices])); 1890 PetscFunctionReturn(0); 1891 } 1892 1893 PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat) 1894 { 1895 Mat_SeqSELL *a=(Mat_SeqSELL*)mat->data; 1896 1897 PetscFunctionBegin; 1898 PetscCheck(a->nonew,PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first"); 1899 PetscCheck(a->saved_values,PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatStoreValues(A);first"); 1900 PetscCall(PetscArraycpy(a->val,a->saved_values,a->sliidx[a->totalslices])); 1901 PetscFunctionReturn(0); 1902 } 1903 1904 /*@C 1905 MatSeqSELLRestoreArray - returns access to the array where the data for a MATSEQSELL matrix is stored obtained by MatSeqSELLGetArray() 1906 1907 Not Collective 1908 1909 Input Parameters: 1910 . mat - a MATSEQSELL matrix 1911 . array - pointer to the data 1912 1913 Level: intermediate 1914 1915 .seealso: `MatSeqSELLGetArray()`, `MatSeqSELLRestoreArrayF90()` 1916 @*/ 1917 PetscErrorCode MatSeqSELLRestoreArray(Mat A,PetscScalar **array) 1918 { 1919 PetscFunctionBegin; 1920 PetscUseMethod(A,"MatSeqSELLRestoreArray_C",(Mat,PetscScalar**),(A,array)); 1921 PetscFunctionReturn(0); 1922 } 1923 1924 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B) 1925 { 1926 Mat_SeqSELL *b; 1927 PetscMPIInt size; 1928 1929 PetscFunctionBegin; 1930 PetscCall(PetscCitationsRegister(citation,&cited)); 1931 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B),&size)); 1932 PetscCheck(size <= 1,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Comm must be of size 1"); 1933 1934 PetscCall(PetscNewLog(B,&b)); 1935 1936 B->data = (void*)b; 1937 1938 PetscCall(PetscMemcpy(B->ops,&MatOps_Values,sizeof(struct _MatOps))); 1939 1940 b->row = NULL; 1941 b->col = NULL; 1942 b->icol = NULL; 1943 b->reallocs = 0; 1944 b->ignorezeroentries = PETSC_FALSE; 1945 b->roworiented = PETSC_TRUE; 1946 b->nonew = 0; 1947 b->diag = NULL; 1948 b->solve_work = NULL; 1949 B->spptr = NULL; 1950 b->saved_values = NULL; 1951 b->idiag = NULL; 1952 b->mdiag = NULL; 1953 b->ssor_work = NULL; 1954 b->omega = 1.0; 1955 b->fshift = 0.0; 1956 b->idiagvalid = PETSC_FALSE; 1957 b->keepnonzeropattern = PETSC_FALSE; 1958 1959 PetscCall(PetscObjectChangeTypeName((PetscObject)B,MATSEQSELL)); 1960 PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLGetArray_C",MatSeqSELLGetArray_SeqSELL)); 1961 PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLRestoreArray_C",MatSeqSELLRestoreArray_SeqSELL)); 1962 PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatStoreValues_C",MatStoreValues_SeqSELL)); 1963 PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatRetrieveValues_C",MatRetrieveValues_SeqSELL)); 1964 PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLSetPreallocation_C",MatSeqSELLSetPreallocation_SeqSELL)); 1965 PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqsell_seqaij_C",MatConvert_SeqSELL_SeqAIJ)); 1966 PetscFunctionReturn(0); 1967 } 1968 1969 /* 1970 Given a matrix generated with MatGetFactor() duplicates all the information in A into B 1971 */ 1972 PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C,Mat A,MatDuplicateOption cpvalues,PetscBool mallocmatspace) 1973 { 1974 Mat_SeqSELL *c = (Mat_SeqSELL*)C->data,*a = (Mat_SeqSELL*)A->data; 1975 PetscInt i,m=A->rmap->n; 1976 PetscInt totalslices=a->totalslices; 1977 1978 PetscFunctionBegin; 1979 C->factortype = A->factortype; 1980 c->row = NULL; 1981 c->col = NULL; 1982 c->icol = NULL; 1983 c->reallocs = 0; 1984 C->assembled = PETSC_TRUE; 1985 1986 PetscCall(PetscLayoutReference(A->rmap,&C->rmap)); 1987 PetscCall(PetscLayoutReference(A->cmap,&C->cmap)); 1988 1989 PetscCall(PetscMalloc1(8*totalslices,&c->rlen)); 1990 PetscCall(PetscLogObjectMemory((PetscObject)C,m*sizeof(PetscInt))); 1991 PetscCall(PetscMalloc1(totalslices+1,&c->sliidx)); 1992 PetscCall(PetscLogObjectMemory((PetscObject)C, (totalslices+1)*sizeof(PetscInt))); 1993 1994 for (i=0; i<m; i++) c->rlen[i] = a->rlen[i]; 1995 for (i=0; i<totalslices+1; i++) c->sliidx[i] = a->sliidx[i]; 1996 1997 /* allocate the matrix space */ 1998 if (mallocmatspace) { 1999 PetscCall(PetscMalloc2(a->maxallocmat,&c->val,a->maxallocmat,&c->colidx)); 2000 PetscCall(PetscLogObjectMemory((PetscObject)C,a->maxallocmat*(sizeof(PetscScalar)+sizeof(PetscInt)))); 2001 2002 c->singlemalloc = PETSC_TRUE; 2003 2004 if (m > 0) { 2005 PetscCall(PetscArraycpy(c->colidx,a->colidx,a->maxallocmat)); 2006 if (cpvalues == MAT_COPY_VALUES) { 2007 PetscCall(PetscArraycpy(c->val,a->val,a->maxallocmat)); 2008 } else { 2009 PetscCall(PetscArrayzero(c->val,a->maxallocmat)); 2010 } 2011 } 2012 } 2013 2014 c->ignorezeroentries = a->ignorezeroentries; 2015 c->roworiented = a->roworiented; 2016 c->nonew = a->nonew; 2017 if (a->diag) { 2018 PetscCall(PetscMalloc1(m,&c->diag)); 2019 PetscCall(PetscLogObjectMemory((PetscObject)C,m*sizeof(PetscInt))); 2020 for (i=0; i<m; i++) { 2021 c->diag[i] = a->diag[i]; 2022 } 2023 } else c->diag = NULL; 2024 2025 c->solve_work = NULL; 2026 c->saved_values = NULL; 2027 c->idiag = NULL; 2028 c->ssor_work = NULL; 2029 c->keepnonzeropattern = a->keepnonzeropattern; 2030 c->free_val = PETSC_TRUE; 2031 c->free_colidx = PETSC_TRUE; 2032 2033 c->maxallocmat = a->maxallocmat; 2034 c->maxallocrow = a->maxallocrow; 2035 c->rlenmax = a->rlenmax; 2036 c->nz = a->nz; 2037 C->preallocated = PETSC_TRUE; 2038 2039 c->nonzerorowcnt = a->nonzerorowcnt; 2040 C->nonzerostate = A->nonzerostate; 2041 2042 PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist,&((PetscObject)C)->qlist)); 2043 PetscFunctionReturn(0); 2044 } 2045 2046 PetscErrorCode MatDuplicate_SeqSELL(Mat A,MatDuplicateOption cpvalues,Mat *B) 2047 { 2048 PetscFunctionBegin; 2049 PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B)); 2050 PetscCall(MatSetSizes(*B,A->rmap->n,A->cmap->n,A->rmap->n,A->cmap->n)); 2051 if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) { 2052 PetscCall(MatSetBlockSizesFromMats(*B,A,A)); 2053 } 2054 PetscCall(MatSetType(*B,((PetscObject)A)->type_name)); 2055 PetscCall(MatDuplicateNoCreate_SeqSELL(*B,A,cpvalues,PETSC_TRUE)); 2056 PetscFunctionReturn(0); 2057 } 2058 2059 /*MC 2060 MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices, 2061 based on the sliced Ellpack format 2062 2063 Options Database Keys: 2064 . -mat_type seqsell - sets the matrix type to "seqsell" during a call to MatSetFromOptions() 2065 2066 Level: beginner 2067 2068 .seealso: `MatCreateSeqSell()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ` 2069 M*/ 2070 2071 /*MC 2072 MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices. 2073 2074 This matrix type is identical to MATSEQSELL when constructed with a single process communicator, 2075 and MATMPISELL otherwise. As a result, for single process communicators, 2076 MatSeqSELLSetPreallocation() is supported, and similarly MatMPISELLSetPreallocation() is supported 2077 for communicators controlling multiple processes. It is recommended that you call both of 2078 the above preallocation routines for simplicity. 2079 2080 Options Database Keys: 2081 . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions() 2082 2083 Level: beginner 2084 2085 Notes: 2086 This format is only supported for real scalars, double precision, and 32 bit indices (the defaults). 2087 2088 It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of 2089 non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement. 2090 2091 Developer Notes: 2092 On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance. 2093 2094 The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8 2095 .vb 2096 (2 0 3 4) 2097 Consider the matrix A = (5 0 6 0) 2098 (0 0 7 8) 2099 (0 0 9 9) 2100 2101 symbolically the Ellpack format can be written as 2102 2103 (2 3 4 |) (0 2 3 |) 2104 v = (5 6 0 |) colidx = (0 2 2 |) 2105 -------- --------- 2106 (7 8 |) (2 3 |) 2107 (9 9 |) (2 3 |) 2108 2109 The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case). 2110 Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and 2111 zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice. 2112 2113 The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9) and for colidx is (0 0 2 2 3 2 2 2 3 3) 2114 2115 .ve 2116 2117 See MatMult_SeqSELL() for how this format is used with the SIMD operations to achieve high performance. 2118 2119 References: 2120 . * - Hong Zhang, Richard T. Mills, Karl Rupp, and Barry F. Smith, Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}, 2121 Proceedings of the 47th International Conference on Parallel Processing, 2018. 2122 2123 .seealso: `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSell()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ` 2124 M*/ 2125 2126 /*@C 2127 MatCreateSeqSELL - Creates a sparse matrix in SELL format. 2128 2129 Collective on comm 2130 2131 Input Parameters: 2132 + comm - MPI communicator, set to PETSC_COMM_SELF 2133 . m - number of rows 2134 . n - number of columns 2135 . rlenmax - maximum number of nonzeros in a row 2136 - rlen - array containing the number of nonzeros in the various rows 2137 (possibly different for each row) or NULL 2138 2139 Output Parameter: 2140 . A - the matrix 2141 2142 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 2143 MatXXXXSetPreallocation() paradigm instead of this routine directly. 2144 [MatXXXXSetPreallocation() is, for example, MatSeqSELLSetPreallocation] 2145 2146 Notes: 2147 If nnz is given then nz is ignored 2148 2149 Specify the preallocated storage with either rlenmax or rlen (not both). 2150 Set rlenmax=PETSC_DEFAULT and rlen=NULL for PETSc to control dynamic memory 2151 allocation. For large problems you MUST preallocate memory or you 2152 will get TERRIBLE performance, see the users' manual chapter on matrices. 2153 2154 Level: intermediate 2155 2156 .seealso: `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATSEQSELL`, `MATMPISELL` 2157 2158 @*/ 2159 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt maxallocrow,const PetscInt rlen[],Mat *A) 2160 { 2161 PetscFunctionBegin; 2162 PetscCall(MatCreate(comm,A)); 2163 PetscCall(MatSetSizes(*A,m,n,m,n)); 2164 PetscCall(MatSetType(*A,MATSEQSELL)); 2165 PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A,maxallocrow,rlen)); 2166 PetscFunctionReturn(0); 2167 } 2168 2169 PetscErrorCode MatEqual_SeqSELL(Mat A,Mat B,PetscBool * flg) 2170 { 2171 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data,*b=(Mat_SeqSELL*)B->data; 2172 PetscInt totalslices=a->totalslices; 2173 2174 PetscFunctionBegin; 2175 /* If the matrix dimensions are not equal,or no of nonzeros */ 2176 if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) ||(a->nz != b->nz) || (a->rlenmax != b->rlenmax)) { 2177 *flg = PETSC_FALSE; 2178 PetscFunctionReturn(0); 2179 } 2180 /* if the a->colidx are the same */ 2181 PetscCall(PetscArraycmp(a->colidx,b->colidx,a->sliidx[totalslices],flg)); 2182 if (!*flg) PetscFunctionReturn(0); 2183 /* if a->val are the same */ 2184 PetscCall(PetscArraycmp(a->val,b->val,a->sliidx[totalslices],flg)); 2185 PetscFunctionReturn(0); 2186 } 2187 2188 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A) 2189 { 2190 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 2191 2192 PetscFunctionBegin; 2193 a->idiagvalid = PETSC_FALSE; 2194 PetscFunctionReturn(0); 2195 } 2196 2197 PetscErrorCode MatConjugate_SeqSELL(Mat A) 2198 { 2199 #if defined(PETSC_USE_COMPLEX) 2200 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 2201 PetscInt i; 2202 PetscScalar *val = a->val; 2203 2204 PetscFunctionBegin; 2205 for (i=0; i<a->sliidx[a->totalslices]; i++) { 2206 val[i] = PetscConj(val[i]); 2207 } 2208 #else 2209 PetscFunctionBegin; 2210 #endif 2211 PetscFunctionReturn(0); 2212 } 2213