1 2 /* 3 Defines the basic matrix operations for the SELL matrix storage format. 4 */ 5 #include <../src/mat/impls/sell/seq/sell.h> /*I "petscmat.h" I*/ 6 #include <petscblaslapack.h> 7 #include <petsc/private/kernels/blocktranspose.h> 8 9 static PetscBool cited = PETSC_FALSE; 10 static const char citation[] = 11 "@inproceedings{ZhangELLPACK2018,\n" 12 " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n" 13 " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n" 14 " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n" 15 " year = 2018\n" 16 "}\n"; 17 18 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 19 20 #include <immintrin.h> 21 22 #if !defined(_MM_SCALE_8) 23 #define _MM_SCALE_8 8 24 #endif 25 26 #if defined(__AVX512F__) 27 /* these do not work 28 vec_idx = _mm512_loadunpackhi_epi32(vec_idx,acolidx); 29 vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval); 30 */ 31 #define AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y) \ 32 /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \ 33 vec_idx = _mm256_loadu_si256((__m256i const*)acolidx); \ 34 vec_vals = _mm512_loadu_pd(aval); \ 35 vec_x = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8); \ 36 vec_y = _mm512_fmadd_pd(vec_x,vec_vals,vec_y) 37 #elif defined(__AVX2__) && defined(__FMA__) 38 #define AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y) \ 39 vec_vals = _mm256_loadu_pd(aval); \ 40 vec_idx = _mm_loadu_si128((__m128i const*)acolidx); /* SSE2 */ \ 41 vec_x = _mm256_i32gather_pd(x,vec_idx,_MM_SCALE_8); \ 42 vec_y = _mm256_fmadd_pd(vec_x,vec_vals,vec_y) 43 #endif 44 #endif /* PETSC_HAVE_IMMINTRIN_H */ 45 46 /*@C 47 MatSeqSELLSetPreallocation - For good matrix assembly performance 48 the user should preallocate the 
matrix storage by setting the parameter nz 49 (or the array nnz). By setting these parameters accurately, performance 50 during matrix assembly can be increased significantly. 51 52 Collective 53 54 Input Parameters: 55 + B - The matrix 56 . nz - number of nonzeros per row (same for all rows) 57 - nnz - array containing the number of nonzeros in the various rows 58 (possibly different for each row) or NULL 59 60 Notes: 61 If nnz is given then nz is ignored. 62 63 Specify the preallocated storage with either nz or nnz (not both). 64 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 65 allocation. For large problems you MUST preallocate memory or you 66 will get TERRIBLE performance, see the users' manual chapter on matrices. 67 68 You can call MatGetInfo() to get information on how effective the preallocation was; 69 for example the fields mallocs,nz_allocated,nz_used,nz_unneeded; 70 You can also run with the option -info and look for messages with the string 71 malloc in them to see if additional memory allocation was needed. 72 73 Developers: Use nz of MAT_SKIP_ALLOCATION to not allocate any space for the matrix 74 entries or columns indices. 75 76 The maximum number of nonzeos in any row should be as accurate as possible. 77 If it is underestimated, you will get bad performance due to reallocation 78 (MatSeqXSELLReallocateSELL). 
79 80 Level: intermediate 81 82 .seealso: MatCreate(), MatCreateSELL(), MatSetValues(), MatGetInfo() 83 84 @*/ 85 PetscErrorCode MatSeqSELLSetPreallocation(Mat B,PetscInt rlenmax,const PetscInt rlen[]) 86 { 87 PetscFunctionBegin; 88 PetscValidHeaderSpecific(B,MAT_CLASSID,1); 89 PetscValidType(B,1); 90 PetscTryMethod(B,"MatSeqSELLSetPreallocation_C",(Mat,PetscInt,const PetscInt[]),(B,rlenmax,rlen)); 91 PetscFunctionReturn(0); 92 } 93 94 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B,PetscInt maxallocrow,const PetscInt rlen[]) 95 { 96 Mat_SeqSELL *b; 97 PetscInt i,j,totalslices; 98 PetscBool skipallocation=PETSC_FALSE,realalloc=PETSC_FALSE; 99 100 PetscFunctionBegin; 101 if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE; 102 if (maxallocrow == MAT_SKIP_ALLOCATION) { 103 skipallocation = PETSC_TRUE; 104 maxallocrow = 0; 105 } 106 107 PetscCall(PetscLayoutSetUp(B->rmap)); 108 PetscCall(PetscLayoutSetUp(B->cmap)); 109 110 /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */ 111 if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5; 112 PetscCheckFalse(maxallocrow < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"maxallocrow cannot be less than 0: value %" PetscInt_FMT,maxallocrow); 113 if (rlen) { 114 for (i=0; i<B->rmap->n; i++) { 115 PetscCheckFalse(rlen[i] < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,rlen[i]); 116 PetscCheckFalse(rlen[i] > B->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT,i,rlen[i],B->cmap->n); 117 } 118 } 119 120 B->preallocated = PETSC_TRUE; 121 122 b = (Mat_SeqSELL*)B->data; 123 124 totalslices = PetscCeilInt(B->rmap->n,8); 125 b->totalslices = totalslices; 126 if (!skipallocation) { 127 if (B->rmap->n & 0x07) 
PetscCall(PetscInfo(B,"Padding rows to the SEQSELL matrix because the number of rows is not the multiple of 8 (value %" PetscInt_FMT ")\n",B->rmap->n)); 128 129 if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */ 130 PetscCall(PetscMalloc1(totalslices+1,&b->sliidx)); 131 PetscCall(PetscLogObjectMemory((PetscObject)B,(totalslices+1)*sizeof(PetscInt))); 132 } 133 if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */ 134 if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10; 135 else if (maxallocrow < 0) maxallocrow = 1; 136 for (i=0; i<=totalslices; i++) b->sliidx[i] = i*8*maxallocrow; 137 } else { 138 maxallocrow = 0; 139 b->sliidx[0] = 0; 140 for (i=1; i<totalslices; i++) { 141 b->sliidx[i] = 0; 142 for (j=0;j<8;j++) { 143 b->sliidx[i] = PetscMax(b->sliidx[i],rlen[8*(i-1)+j]); 144 } 145 maxallocrow = PetscMax(b->sliidx[i],maxallocrow); 146 PetscCall(PetscIntSumError(b->sliidx[i-1],8*b->sliidx[i],&b->sliidx[i])); 147 } 148 /* last slice */ 149 b->sliidx[totalslices] = 0; 150 for (j=(totalslices-1)*8;j<B->rmap->n;j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices],rlen[j]); 151 maxallocrow = PetscMax(b->sliidx[totalslices],maxallocrow); 152 b->sliidx[totalslices] = b->sliidx[totalslices-1] + 8*b->sliidx[totalslices]; 153 } 154 155 /* allocate space for val, colidx, rlen */ 156 /* FIXME: should B's old memory be unlogged? */ 157 PetscCall(MatSeqXSELLFreeSELL(B,&b->val,&b->colidx)); 158 /* FIXME: assuming an element of the bit array takes 8 bits */ 159 PetscCall(PetscMalloc2(b->sliidx[totalslices],&b->val,b->sliidx[totalslices],&b->colidx)); 160 PetscCall(PetscLogObjectMemory((PetscObject)B,b->sliidx[totalslices]*(sizeof(PetscScalar)+sizeof(PetscInt)))); 161 /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. 
*/ 162 PetscCall(PetscCalloc1(8*totalslices,&b->rlen)); 163 PetscCall(PetscLogObjectMemory((PetscObject)B,8*totalslices*sizeof(PetscInt))); 164 165 b->singlemalloc = PETSC_TRUE; 166 b->free_val = PETSC_TRUE; 167 b->free_colidx = PETSC_TRUE; 168 } else { 169 b->free_val = PETSC_FALSE; 170 b->free_colidx = PETSC_FALSE; 171 } 172 173 b->nz = 0; 174 b->maxallocrow = maxallocrow; 175 b->rlenmax = maxallocrow; 176 b->maxallocmat = b->sliidx[totalslices]; 177 B->info.nz_unneeded = (double)b->maxallocmat; 178 if (realalloc) { 179 PetscCall(MatSetOption(B,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE)); 180 } 181 PetscFunctionReturn(0); 182 } 183 184 PetscErrorCode MatGetRow_SeqSELL(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v) 185 { 186 Mat_SeqSELL *a = (Mat_SeqSELL*)A->data; 187 PetscInt shift; 188 189 PetscFunctionBegin; 190 PetscCheckFalse(row < 0 || row >= A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row %" PetscInt_FMT " out of range",row); 191 if (nz) *nz = a->rlen[row]; 192 shift = a->sliidx[row>>3]+(row&0x07); 193 if (!a->getrowcols) { 194 195 PetscCall(PetscMalloc2(a->rlenmax,&a->getrowcols,a->rlenmax,&a->getrowvals)); 196 } 197 if (idx) { 198 PetscInt j; 199 for (j=0; j<a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift+8*j]; 200 *idx = a->getrowcols; 201 } 202 if (v) { 203 PetscInt j; 204 for (j=0; j<a->rlen[row]; j++) a->getrowvals[j] = a->val[shift+8*j]; 205 *v = a->getrowvals; 206 } 207 PetscFunctionReturn(0); 208 } 209 210 PetscErrorCode MatRestoreRow_SeqSELL(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v) 211 { 212 PetscFunctionBegin; 213 PetscFunctionReturn(0); 214 } 215 216 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype,MatReuse reuse,Mat *newmat) 217 { 218 Mat B; 219 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 220 PetscInt i; 221 222 PetscFunctionBegin; 223 if (reuse == MAT_REUSE_MATRIX) { 224 B = *newmat; 225 PetscCall(MatZeroEntries(B)); 226 } else { 227 
PetscCall(MatCreate(PetscObjectComm((PetscObject)A),&B)); 228 PetscCall(MatSetSizes(B,A->rmap->n,A->cmap->n,A->rmap->N,A->cmap->N)); 229 PetscCall(MatSetType(B,MATSEQAIJ)); 230 PetscCall(MatSeqAIJSetPreallocation(B,0,a->rlen)); 231 } 232 233 for (i=0; i<A->rmap->n; i++) { 234 PetscInt nz = 0,*cols = NULL; 235 PetscScalar *vals = NULL; 236 237 PetscCall(MatGetRow_SeqSELL(A,i,&nz,&cols,&vals)); 238 PetscCall(MatSetValues(B,1,&i,nz,cols,vals,INSERT_VALUES)); 239 PetscCall(MatRestoreRow_SeqSELL(A,i,&nz,&cols,&vals)); 240 } 241 242 PetscCall(MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY)); 243 PetscCall(MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY)); 244 B->rmap->bs = A->rmap->bs; 245 246 if (reuse == MAT_INPLACE_MATRIX) { 247 PetscCall(MatHeaderReplace(A,&B)); 248 } else { 249 *newmat = B; 250 } 251 PetscFunctionReturn(0); 252 } 253 254 #include <../src/mat/impls/aij/seq/aij.h> 255 256 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A,MatType newtype,MatReuse reuse,Mat *newmat) 257 { 258 Mat B; 259 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 260 PetscInt *ai=a->i,m=A->rmap->N,n=A->cmap->N,i,*rowlengths,row,ncols; 261 const PetscInt *cols; 262 const PetscScalar *vals; 263 264 PetscFunctionBegin; 265 266 if (reuse == MAT_REUSE_MATRIX) { 267 B = *newmat; 268 } else { 269 if (PetscDefined(USE_DEBUG) || !a->ilen) { 270 PetscCall(PetscMalloc1(m,&rowlengths)); 271 for (i=0; i<m; i++) { 272 rowlengths[i] = ai[i+1] - ai[i]; 273 } 274 } 275 if (PetscDefined(USE_DEBUG) && a->ilen) { 276 PetscBool eq; 277 PetscCall(PetscMemcmp(rowlengths,a->ilen,m*sizeof(PetscInt),&eq)); 278 PetscCheck(eq,PETSC_COMM_SELF,PETSC_ERR_PLIB,"SeqAIJ ilen array incorrect"); 279 PetscCall(PetscFree(rowlengths)); 280 rowlengths = a->ilen; 281 } else if (a->ilen) rowlengths = a->ilen; 282 PetscCall(MatCreate(PetscObjectComm((PetscObject)A),&B)); 283 PetscCall(MatSetSizes(B,m,n,m,n)); 284 PetscCall(MatSetType(B,MATSEQSELL)); 285 PetscCall(MatSeqSELLSetPreallocation(B,0,rowlengths)); 286 if (rowlengths != a->ilen) 
PetscCall(PetscFree(rowlengths)); 287 } 288 289 for (row=0; row<m; row++) { 290 PetscCall(MatGetRow_SeqAIJ(A,row,&ncols,(PetscInt**)&cols,(PetscScalar**)&vals)); 291 PetscCall(MatSetValues_SeqSELL(B,1,&row,ncols,cols,vals,INSERT_VALUES)); 292 PetscCall(MatRestoreRow_SeqAIJ(A,row,&ncols,(PetscInt**)&cols,(PetscScalar**)&vals)); 293 } 294 PetscCall(MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY)); 295 PetscCall(MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY)); 296 B->rmap->bs = A->rmap->bs; 297 298 if (reuse == MAT_INPLACE_MATRIX) { 299 PetscCall(MatHeaderReplace(A,&B)); 300 } else { 301 *newmat = B; 302 } 303 PetscFunctionReturn(0); 304 } 305 306 PetscErrorCode MatMult_SeqSELL(Mat A,Vec xx,Vec yy) 307 { 308 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 309 PetscScalar *y; 310 const PetscScalar *x; 311 const MatScalar *aval=a->val; 312 PetscInt totalslices=a->totalslices; 313 const PetscInt *acolidx=a->colidx; 314 PetscInt i,j; 315 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 316 __m512d vec_x,vec_y,vec_vals; 317 __m256i vec_idx; 318 __mmask8 mask; 319 __m512d vec_x2,vec_y2,vec_vals2,vec_x3,vec_y3,vec_vals3,vec_x4,vec_y4,vec_vals4; 320 __m256i vec_idx2,vec_idx3,vec_idx4; 321 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 322 __m128i vec_idx; 323 __m256d vec_x,vec_y,vec_y2,vec_vals; 324 MatScalar yval; 325 PetscInt r,rows_left,row,nnz_in_row; 326 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 327 __m128d vec_x_tmp; 328 __m256d vec_x,vec_y,vec_y2,vec_vals; 329 MatScalar yval; 330 PetscInt r,rows_left,row,nnz_in_row; 331 #else 332 PetscScalar sum[8]; 333 #endif 334 335 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 336 #pragma 
disjoint(*x,*y,*aval) 337 #endif 338 339 PetscFunctionBegin; 340 PetscCall(VecGetArrayRead(xx,&x)); 341 PetscCall(VecGetArray(yy,&y)); 342 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 343 for (i=0; i<totalslices; i++) { /* loop over slices */ 344 PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 345 PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 346 347 vec_y = _mm512_setzero_pd(); 348 vec_y2 = _mm512_setzero_pd(); 349 vec_y3 = _mm512_setzero_pd(); 350 vec_y4 = _mm512_setzero_pd(); 351 352 j = a->sliidx[i]>>3; /* 8 bytes are read at each time, corresponding to a slice columnn */ 353 switch ((a->sliidx[i+1]-a->sliidx[i])/8 & 3) { 354 case 3: 355 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 356 acolidx += 8; aval += 8; 357 AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2); 358 acolidx += 8; aval += 8; 359 AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3); 360 acolidx += 8; aval += 8; 361 j += 3; 362 break; 363 case 2: 364 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 365 acolidx += 8; aval += 8; 366 AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2); 367 acolidx += 8; aval += 8; 368 j += 2; 369 break; 370 case 1: 371 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 372 acolidx += 8; aval += 8; 373 j += 1; 374 break; 375 } 376 #pragma novector 377 for (; j<(a->sliidx[i+1]>>3); j+=4) { 378 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 379 acolidx += 8; aval += 8; 380 AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2); 381 acolidx += 8; aval += 8; 382 AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3); 383 acolidx += 8; aval += 8; 384 AVX512_Mult_Private(vec_idx4,vec_x4,vec_vals4,vec_y4); 385 acolidx += 8; aval += 8; 386 } 387 388 vec_y = _mm512_add_pd(vec_y,vec_y2); 389 vec_y = _mm512_add_pd(vec_y,vec_y3); 390 vec_y = _mm512_add_pd(vec_y,vec_y4); 391 if (i 
== totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 392 mask = (__mmask8)(0xff >> (8-(A->rmap->n & 0x07))); 393 _mm512_mask_storeu_pd(&y[8*i],mask,vec_y); 394 } else { 395 _mm512_storeu_pd(&y[8*i],vec_y); 396 } 397 } 398 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 399 for (i=0; i<totalslices; i++) { /* loop over full slices */ 400 PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 401 PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 402 403 /* last slice may have padding rows. Don't use vectorization. */ 404 if (i == totalslices-1 && (A->rmap->n & 0x07)) { 405 rows_left = A->rmap->n - 8*i; 406 for (r=0; r<rows_left; ++r) { 407 yval = (MatScalar)0; 408 row = 8*i + r; 409 nnz_in_row = a->rlen[row]; 410 for (j=0; j<nnz_in_row; ++j) yval += aval[8*j+r] * x[acolidx[8*j+r]]; 411 y[row] = yval; 412 } 413 break; 414 } 415 416 vec_y = _mm256_setzero_pd(); 417 vec_y2 = _mm256_setzero_pd(); 418 419 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 420 #pragma novector 421 #pragma unroll(2) 422 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) { 423 AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 424 aval += 4; acolidx += 4; 425 AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y2); 426 aval += 4; acolidx += 4; 427 } 428 429 _mm256_storeu_pd(y+i*8,vec_y); 430 _mm256_storeu_pd(y+i*8+4,vec_y2); 431 } 432 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 433 for (i=0; i<totalslices; i++) { /* loop over full slices */ 434 PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 435 PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 436 437 vec_y = _mm256_setzero_pd(); 
438 vec_y2 = _mm256_setzero_pd(); 439 440 /* last slice may have padding rows. Don't use vectorization. */ 441 if (i == totalslices-1 && (A->rmap->n & 0x07)) { 442 rows_left = A->rmap->n - 8*i; 443 for (r=0; r<rows_left; ++r) { 444 yval = (MatScalar)0; 445 row = 8*i + r; 446 nnz_in_row = a->rlen[row]; 447 for (j=0; j<nnz_in_row; ++j) yval += aval[8*j + r] * x[acolidx[8*j + r]]; 448 y[row] = yval; 449 } 450 break; 451 } 452 453 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 454 #pragma novector 455 #pragma unroll(2) 456 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) { 457 vec_vals = _mm256_loadu_pd(aval); 458 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 459 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 460 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,0); 461 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 462 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 463 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,1); 464 vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y); 465 aval += 4; 466 467 vec_vals = _mm256_loadu_pd(aval); 468 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 469 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 470 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,0); 471 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 472 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 473 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,1); 474 vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y2); 475 aval += 4; 476 } 477 478 _mm256_storeu_pd(y + i*8, vec_y); 479 _mm256_storeu_pd(y + i*8 + 4, vec_y2); 480 } 481 #else 482 for (i=0; i<totalslices; i++) { /* loop over slices */ 483 for (j=0; j<8; j++) sum[j] = 0.0; 484 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) { 485 sum[0] += aval[j] * x[acolidx[j]]; 486 sum[1] += aval[j+1] * x[acolidx[j+1]]; 487 sum[2] += aval[j+2] * x[acolidx[j+2]]; 488 sum[3] += aval[j+3] * x[acolidx[j+3]]; 489 sum[4] += aval[j+4] * x[acolidx[j+4]]; 490 
sum[5] += aval[j+5] * x[acolidx[j+5]]; 491 sum[6] += aval[j+6] * x[acolidx[j+6]]; 492 sum[7] += aval[j+7] * x[acolidx[j+7]]; 493 } 494 if (i == totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */ 495 for (j=0; j<(A->rmap->n & 0x07); j++) y[8*i+j] = sum[j]; 496 } else { 497 for (j=0; j<8; j++) y[8*i+j] = sum[j]; 498 } 499 } 500 #endif 501 502 PetscCall(PetscLogFlops(2.0*a->nz-a->nonzerorowcnt)); /* theoretical minimal FLOPs */ 503 PetscCall(VecRestoreArrayRead(xx,&x)); 504 PetscCall(VecRestoreArray(yy,&y)); 505 PetscFunctionReturn(0); 506 } 507 508 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h> 509 PetscErrorCode MatMultAdd_SeqSELL(Mat A,Vec xx,Vec yy,Vec zz) 510 { 511 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 512 PetscScalar *y,*z; 513 const PetscScalar *x; 514 const MatScalar *aval=a->val; 515 PetscInt totalslices=a->totalslices; 516 const PetscInt *acolidx=a->colidx; 517 PetscInt i,j; 518 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 519 __m512d vec_x,vec_y,vec_vals; 520 __m256i vec_idx; 521 __mmask8 mask; 522 __m512d vec_x2,vec_y2,vec_vals2,vec_x3,vec_y3,vec_vals3,vec_x4,vec_y4,vec_vals4; 523 __m256i vec_idx2,vec_idx3,vec_idx4; 524 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 525 __m128d vec_x_tmp; 526 __m256d vec_x,vec_y,vec_y2,vec_vals; 527 MatScalar yval; 528 PetscInt r,row,nnz_in_row; 529 #else 530 PetscScalar sum[8]; 531 #endif 532 533 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 534 #pragma disjoint(*x,*y,*aval) 535 #endif 536 537 PetscFunctionBegin; 538 PetscCall(VecGetArrayRead(xx,&x)); 539 PetscCall(VecGetArrayPair(yy,zz,&y,&z)); 540 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 541 
for (i=0; i<totalslices; i++) { /* loop over slices */ 542 PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 543 PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 544 545 if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 546 mask = (__mmask8)(0xff >> (8-(A->rmap->n & 0x07))); 547 vec_y = _mm512_mask_loadu_pd(vec_y,mask,&y[8*i]); 548 } else { 549 vec_y = _mm512_loadu_pd(&y[8*i]); 550 } 551 vec_y2 = _mm512_setzero_pd(); 552 vec_y3 = _mm512_setzero_pd(); 553 vec_y4 = _mm512_setzero_pd(); 554 555 j = a->sliidx[i]>>3; /* 8 bytes are read at each time, corresponding to a slice columnn */ 556 switch ((a->sliidx[i+1]-a->sliidx[i])/8 & 3) { 557 case 3: 558 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 559 acolidx += 8; aval += 8; 560 AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2); 561 acolidx += 8; aval += 8; 562 AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3); 563 acolidx += 8; aval += 8; 564 j += 3; 565 break; 566 case 2: 567 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 568 acolidx += 8; aval += 8; 569 AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2); 570 acolidx += 8; aval += 8; 571 j += 2; 572 break; 573 case 1: 574 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 575 acolidx += 8; aval += 8; 576 j += 1; 577 break; 578 } 579 #pragma novector 580 for (; j<(a->sliidx[i+1]>>3); j+=4) { 581 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 582 acolidx += 8; aval += 8; 583 AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2); 584 acolidx += 8; aval += 8; 585 AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3); 586 acolidx += 8; aval += 8; 587 AVX512_Mult_Private(vec_idx4,vec_x4,vec_vals4,vec_y4); 588 acolidx += 8; aval += 8; 589 } 590 591 vec_y = _mm512_add_pd(vec_y,vec_y2); 592 vec_y = _mm512_add_pd(vec_y,vec_y3); 593 vec_y = _mm512_add_pd(vec_y,vec_y4); 594 if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 595 
_mm512_mask_storeu_pd(&z[8*i],mask,vec_y); 596 } else { 597 _mm512_storeu_pd(&z[8*i],vec_y); 598 } 599 } 600 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 601 for (i=0; i<totalslices; i++) { /* loop over full slices */ 602 PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 603 PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 604 605 /* last slice may have padding rows. Don't use vectorization. */ 606 if (i == totalslices-1 && (A->rmap->n & 0x07)) { 607 for (r=0; r<(A->rmap->n & 0x07); ++r) { 608 row = 8*i + r; 609 yval = (MatScalar)0.0; 610 nnz_in_row = a->rlen[row]; 611 for (j=0; j<nnz_in_row; ++j) yval += aval[8*j+r] * x[acolidx[8*j+r]]; 612 z[row] = y[row] + yval; 613 } 614 break; 615 } 616 617 vec_y = _mm256_loadu_pd(y+8*i); 618 vec_y2 = _mm256_loadu_pd(y+8*i+4); 619 620 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 621 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) { 622 vec_vals = _mm256_loadu_pd(aval); 623 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 624 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 625 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,0); 626 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 627 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 628 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,1); 629 vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y); 630 aval += 4; 631 632 vec_vals = _mm256_loadu_pd(aval); 633 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 634 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 635 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,0); 636 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 637 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 638 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,1); 639 vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y2); 
640 aval += 4; 641 } 642 643 _mm256_storeu_pd(z+i*8,vec_y); 644 _mm256_storeu_pd(z+i*8+4,vec_y2); 645 } 646 #else 647 for (i=0; i<totalslices; i++) { /* loop over slices */ 648 for (j=0; j<8; j++) sum[j] = 0.0; 649 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) { 650 sum[0] += aval[j] * x[acolidx[j]]; 651 sum[1] += aval[j+1] * x[acolidx[j+1]]; 652 sum[2] += aval[j+2] * x[acolidx[j+2]]; 653 sum[3] += aval[j+3] * x[acolidx[j+3]]; 654 sum[4] += aval[j+4] * x[acolidx[j+4]]; 655 sum[5] += aval[j+5] * x[acolidx[j+5]]; 656 sum[6] += aval[j+6] * x[acolidx[j+6]]; 657 sum[7] += aval[j+7] * x[acolidx[j+7]]; 658 } 659 if (i == totalslices-1 && (A->rmap->n & 0x07)) { 660 for (j=0; j<(A->rmap->n & 0x07); j++) z[8*i+j] = y[8*i+j] + sum[j]; 661 } else { 662 for (j=0; j<8; j++) z[8*i+j] = y[8*i+j] + sum[j]; 663 } 664 } 665 #endif 666 667 PetscCall(PetscLogFlops(2.0*a->nz)); 668 PetscCall(VecRestoreArrayRead(xx,&x)); 669 PetscCall(VecRestoreArrayPair(yy,zz,&y,&z)); 670 PetscFunctionReturn(0); 671 } 672 673 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A,Vec xx,Vec zz,Vec yy) 674 { 675 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 676 PetscScalar *y; 677 const PetscScalar *x; 678 const MatScalar *aval=a->val; 679 const PetscInt *acolidx=a->colidx; 680 PetscInt i,j,r,row,nnz_in_row,totalslices=a->totalslices; 681 682 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 683 #pragma disjoint(*x,*y,*aval) 684 #endif 685 686 PetscFunctionBegin; 687 if (A->symmetric) { 688 PetscCall(MatMultAdd_SeqSELL(A,xx,zz,yy)); 689 PetscFunctionReturn(0); 690 } 691 if (zz != yy) PetscCall(VecCopy(zz,yy)); 692 PetscCall(VecGetArrayRead(xx,&x)); 693 PetscCall(VecGetArray(yy,&y)); 694 for (i=0; i<a->totalslices; i++) { /* loop over slices */ 695 if (i == totalslices-1 && (A->rmap->n & 0x07)) { 696 for (r=0; r<(A->rmap->n & 0x07); ++r) { 697 row = 8*i + r; 698 nnz_in_row = a->rlen[row]; 699 for (j=0; j<nnz_in_row; ++j) y[acolidx[8*j+r]] += aval[8*j+r] * x[row]; 700 } 701 break; 702 } 703 for (j=a->sliidx[i]; j<a->sliidx[i+1]; 
j+=8) { 704 y[acolidx[j]] += aval[j] * x[8*i]; 705 y[acolidx[j+1]] += aval[j+1] * x[8*i+1]; 706 y[acolidx[j+2]] += aval[j+2] * x[8*i+2]; 707 y[acolidx[j+3]] += aval[j+3] * x[8*i+3]; 708 y[acolidx[j+4]] += aval[j+4] * x[8*i+4]; 709 y[acolidx[j+5]] += aval[j+5] * x[8*i+5]; 710 y[acolidx[j+6]] += aval[j+6] * x[8*i+6]; 711 y[acolidx[j+7]] += aval[j+7] * x[8*i+7]; 712 } 713 } 714 PetscCall(PetscLogFlops(2.0*a->sliidx[a->totalslices])); 715 PetscCall(VecRestoreArrayRead(xx,&x)); 716 PetscCall(VecRestoreArray(yy,&y)); 717 PetscFunctionReturn(0); 718 } 719 720 PetscErrorCode MatMultTranspose_SeqSELL(Mat A,Vec xx,Vec yy) 721 { 722 PetscFunctionBegin; 723 if (A->symmetric) { 724 PetscCall(MatMult_SeqSELL(A,xx,yy)); 725 } else { 726 PetscCall(VecSet(yy,0.0)); 727 PetscCall(MatMultTransposeAdd_SeqSELL(A,xx,yy,yy)); 728 } 729 PetscFunctionReturn(0); 730 } 731 732 /* 733 Checks for missing diagonals 734 */ 735 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A,PetscBool *missing,PetscInt *d) 736 { 737 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 738 PetscInt *diag,i; 739 740 PetscFunctionBegin; 741 *missing = PETSC_FALSE; 742 if (A->rmap->n > 0 && !(a->colidx)) { 743 *missing = PETSC_TRUE; 744 if (d) *d = 0; 745 PetscCall(PetscInfo(A,"Matrix has no entries therefore is missing diagonal\n")); 746 } else { 747 diag = a->diag; 748 for (i=0; i<A->rmap->n; i++) { 749 if (diag[i] == -1) { 750 *missing = PETSC_TRUE; 751 if (d) *d = i; 752 PetscCall(PetscInfo(A,"Matrix is missing diagonal number %" PetscInt_FMT "\n",i)); 753 break; 754 } 755 } 756 } 757 PetscFunctionReturn(0); 758 } 759 760 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A) 761 { 762 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 763 PetscInt i,j,m=A->rmap->n,shift; 764 765 PetscFunctionBegin; 766 if (!a->diag) { 767 PetscCall(PetscMalloc1(m,&a->diag)); 768 PetscCall(PetscLogObjectMemory((PetscObject)A,m*sizeof(PetscInt))); 769 a->free_diag = PETSC_TRUE; 770 } 771 for (i=0; i<m; i++) { /* loop over rows */ 772 shift = a->sliidx[i>>3]+(i&0x07); 
/* starting index of the row i */ 773 a->diag[i] = -1; 774 for (j=0; j<a->rlen[i]; j++) { 775 if (a->colidx[shift+j*8] == i) { 776 a->diag[i] = shift+j*8; 777 break; 778 } 779 } 780 } 781 PetscFunctionReturn(0); 782 } 783 784 /* 785 Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways 786 */ 787 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A,PetscScalar omega,PetscScalar fshift) 788 { 789 Mat_SeqSELL *a=(Mat_SeqSELL*) A->data; 790 PetscInt i,*diag,m = A->rmap->n; 791 MatScalar *val = a->val; 792 PetscScalar *idiag,*mdiag; 793 794 PetscFunctionBegin; 795 if (a->idiagvalid) PetscFunctionReturn(0); 796 PetscCall(MatMarkDiagonal_SeqSELL(A)); 797 diag = a->diag; 798 if (!a->idiag) { 799 PetscCall(PetscMalloc3(m,&a->idiag,m,&a->mdiag,m,&a->ssor_work)); 800 PetscCall(PetscLogObjectMemory((PetscObject)A, 3*m*sizeof(PetscScalar))); 801 val = a->val; 802 } 803 mdiag = a->mdiag; 804 idiag = a->idiag; 805 806 if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) { 807 for (i=0; i<m; i++) { 808 mdiag[i] = val[diag[i]]; 809 if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */ 810 if (PetscRealPart(fshift)) { 811 PetscCall(PetscInfo(A,"Zero diagonal on row %" PetscInt_FMT "\n",i)); 812 A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 813 A->factorerror_zeropivot_value = 0.0; 814 A->factorerror_zeropivot_row = i; 815 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Zero diagonal on row %" PetscInt_FMT,i); 816 } 817 idiag[i] = 1.0/val[diag[i]]; 818 } 819 PetscCall(PetscLogFlops(m)); 820 } else { 821 for (i=0; i<m; i++) { 822 mdiag[i] = val[diag[i]]; 823 idiag[i] = omega/(fshift + val[diag[i]]); 824 } 825 PetscCall(PetscLogFlops(2.0*m)); 826 } 827 a->idiagvalid = PETSC_TRUE; 828 PetscFunctionReturn(0); 829 } 830 831 PetscErrorCode MatZeroEntries_SeqSELL(Mat A) 832 { 833 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 834 835 PetscFunctionBegin; 836 PetscCall(PetscArrayzero(a->val,a->sliidx[a->totalslices])); 837 
PetscCall(MatSeqSELLInvalidateDiagonal(A)); 838 PetscFunctionReturn(0); 839 } 840 841 PetscErrorCode MatDestroy_SeqSELL(Mat A) 842 { 843 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 844 845 PetscFunctionBegin; 846 #if defined(PETSC_USE_LOG) 847 PetscLogObjectState((PetscObject)A,"Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT,A->rmap->n,A->cmap->n,a->nz); 848 #endif 849 PetscCall(MatSeqXSELLFreeSELL(A,&a->val,&a->colidx)); 850 PetscCall(ISDestroy(&a->row)); 851 PetscCall(ISDestroy(&a->col)); 852 PetscCall(PetscFree(a->diag)); 853 PetscCall(PetscFree(a->rlen)); 854 PetscCall(PetscFree(a->sliidx)); 855 PetscCall(PetscFree3(a->idiag,a->mdiag,a->ssor_work)); 856 PetscCall(PetscFree(a->solve_work)); 857 PetscCall(ISDestroy(&a->icol)); 858 PetscCall(PetscFree(a->saved_values)); 859 PetscCall(PetscFree2(a->getrowcols,a->getrowvals)); 860 861 PetscCall(PetscFree(A->data)); 862 863 PetscCall(PetscObjectChangeTypeName((PetscObject)A,NULL)); 864 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatStoreValues_C",NULL)); 865 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatRetrieveValues_C",NULL)); 866 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqSELLSetPreallocation_C",NULL)); 867 PetscFunctionReturn(0); 868 } 869 870 PetscErrorCode MatSetOption_SeqSELL(Mat A,MatOption op,PetscBool flg) 871 { 872 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 873 874 PetscFunctionBegin; 875 switch (op) { 876 case MAT_ROW_ORIENTED: 877 a->roworiented = flg; 878 break; 879 case MAT_KEEP_NONZERO_PATTERN: 880 a->keepnonzeropattern = flg; 881 break; 882 case MAT_NEW_NONZERO_LOCATIONS: 883 a->nonew = (flg ? 0 : 1); 884 break; 885 case MAT_NEW_NONZERO_LOCATION_ERR: 886 a->nonew = (flg ? -1 : 0); 887 break; 888 case MAT_NEW_NONZERO_ALLOCATION_ERR: 889 a->nonew = (flg ? -2 : 0); 890 break; 891 case MAT_UNUSED_NONZERO_LOCATION_ERR: 892 a->nounused = (flg ? 
-1 : 0); 893 break; 894 case MAT_FORCE_DIAGONAL_ENTRIES: 895 case MAT_IGNORE_OFF_PROC_ENTRIES: 896 case MAT_USE_HASH_TABLE: 897 case MAT_SORTED_FULL: 898 PetscCall(PetscInfo(A,"Option %s ignored\n",MatOptions[op])); 899 break; 900 case MAT_SPD: 901 case MAT_SYMMETRIC: 902 case MAT_STRUCTURALLY_SYMMETRIC: 903 case MAT_HERMITIAN: 904 case MAT_SYMMETRY_ETERNAL: 905 /* These options are handled directly by MatSetOption() */ 906 break; 907 default: 908 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unknown option %d",op); 909 } 910 PetscFunctionReturn(0); 911 } 912 913 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A,Vec v) 914 { 915 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 916 PetscInt i,j,n,shift; 917 PetscScalar *x,zero=0.0; 918 919 PetscFunctionBegin; 920 PetscCall(VecGetLocalSize(v,&n)); 921 PetscCheckFalse(n != A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 922 923 if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) { 924 PetscInt *diag=a->diag; 925 PetscCall(VecGetArray(v,&x)); 926 for (i=0; i<n; i++) x[i] = 1.0/a->val[diag[i]]; 927 PetscCall(VecRestoreArray(v,&x)); 928 PetscFunctionReturn(0); 929 } 930 931 PetscCall(VecSet(v,zero)); 932 PetscCall(VecGetArray(v,&x)); 933 for (i=0; i<n; i++) { /* loop over rows */ 934 shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */ 935 x[i] = 0; 936 for (j=0; j<a->rlen[i]; j++) { 937 if (a->colidx[shift+j*8] == i) { 938 x[i] = a->val[shift+j*8]; 939 break; 940 } 941 } 942 } 943 PetscCall(VecRestoreArray(v,&x)); 944 PetscFunctionReturn(0); 945 } 946 947 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A,Vec ll,Vec rr) 948 { 949 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 950 const PetscScalar *l,*r; 951 PetscInt i,j,m,n,row; 952 953 PetscFunctionBegin; 954 if (ll) { 955 /* The local size is used so that VecMPI can be passed to this routine 956 by MatDiagonalScale_MPISELL */ 957 PetscCall(VecGetLocalSize(ll,&m)); 958 PetscCheckFalse(m != A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left 
scaling vector wrong length"); 959 PetscCall(VecGetArrayRead(ll,&l)); 960 for (i=0; i<a->totalslices; i++) { /* loop over slices */ 961 if (i == a->totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */ 962 for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) { 963 if (row < (A->rmap->n & 0x07)) a->val[j] *= l[8*i+row]; 964 } 965 } else { 966 for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) { 967 a->val[j] *= l[8*i+row]; 968 } 969 } 970 } 971 PetscCall(VecRestoreArrayRead(ll,&l)); 972 PetscCall(PetscLogFlops(a->nz)); 973 } 974 if (rr) { 975 PetscCall(VecGetLocalSize(rr,&n)); 976 PetscCheckFalse(n != A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length"); 977 PetscCall(VecGetArrayRead(rr,&r)); 978 for (i=0; i<a->totalslices; i++) { /* loop over slices */ 979 if (i == a->totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */ 980 for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) { 981 if (row < (A->rmap->n & 0x07)) a->val[j] *= r[a->colidx[j]]; 982 } 983 } else { 984 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j++) { 985 a->val[j] *= r[a->colidx[j]]; 986 } 987 } 988 } 989 PetscCall(VecRestoreArrayRead(rr,&r)); 990 PetscCall(PetscLogFlops(a->nz)); 991 } 992 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 993 PetscFunctionReturn(0); 994 } 995 996 PetscErrorCode MatGetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],PetscScalar v[]) 997 { 998 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 999 PetscInt *cp,i,k,low,high,t,row,col,l; 1000 PetscInt shift; 1001 MatScalar *vp; 1002 1003 PetscFunctionBegin; 1004 for (k=0; k<m; k++) { /* loop over requested rows */ 1005 row = im[k]; 1006 if (row<0) continue; 1007 PetscCheck(row < A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT,row,A->rmap->n-1); 1008 shift = a->sliidx[row>>3]+(row&0x07); /* starting index of the row */ 1009 cp = 
a->colidx+shift; /* pointer to the row */ 1010 vp = a->val+shift; /* pointer to the row */ 1011 for (l=0; l<n; l++) { /* loop over requested columns */ 1012 col = in[l]; 1013 if (col<0) continue; 1014 PetscCheck(col < A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT,col,A->cmap->n-1); 1015 high = a->rlen[row]; low = 0; /* assume unsorted */ 1016 while (high-low > 5) { 1017 t = (low+high)/2; 1018 if (*(cp+t*8) > col) high = t; 1019 else low = t; 1020 } 1021 for (i=low; i<high; i++) { 1022 if (*(cp+8*i) > col) break; 1023 if (*(cp+8*i) == col) { 1024 *v++ = *(vp+8*i); 1025 goto finished; 1026 } 1027 } 1028 *v++ = 0.0; 1029 finished:; 1030 } 1031 } 1032 PetscFunctionReturn(0); 1033 } 1034 1035 PetscErrorCode MatView_SeqSELL_ASCII(Mat A,PetscViewer viewer) 1036 { 1037 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1038 PetscInt i,j,m=A->rmap->n,shift; 1039 const char *name; 1040 PetscViewerFormat format; 1041 1042 PetscFunctionBegin; 1043 PetscCall(PetscViewerGetFormat(viewer,&format)); 1044 if (format == PETSC_VIEWER_ASCII_MATLAB) { 1045 PetscInt nofinalvalue = 0; 1046 /* 1047 if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) { 1048 nofinalvalue = 1; 1049 } 1050 */ 1051 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE)); 1052 PetscCall(PetscViewerASCIIPrintf(viewer,"%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n",m,A->cmap->n)); 1053 PetscCall(PetscViewerASCIIPrintf(viewer,"%% Nonzeros = %" PetscInt_FMT " \n",a->nz)); 1054 #if defined(PETSC_USE_COMPLEX) 1055 PetscCall(PetscViewerASCIIPrintf(viewer,"zzz = zeros(%" PetscInt_FMT ",4);\n",a->nz+nofinalvalue)); 1056 #else 1057 PetscCall(PetscViewerASCIIPrintf(viewer,"zzz = zeros(%" PetscInt_FMT ",3);\n",a->nz+nofinalvalue)); 1058 #endif 1059 PetscCall(PetscViewerASCIIPrintf(viewer,"zzz = [\n")); 1060 1061 for (i=0; i<m; i++) { 1062 shift = a->sliidx[i>>3]+(i&0x07); 1063 for (j=0; j<a->rlen[i]; j++) { 1064 #if defined(PETSC_USE_COMPLEX) 1065 
PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n",i+1,a->colidx[shift+8*j]+1,(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]))); 1066 #else 1067 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n",i+1,a->colidx[shift+8*j]+1,(double)a->val[shift+8*j])); 1068 #endif 1069 } 1070 } 1071 /* 1072 if (nofinalvalue) { 1073 #if defined(PETSC_USE_COMPLEX) 1074 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n",m,A->cmap->n,0.,0.)); 1075 #else 1076 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n",m,A->cmap->n,0.0)); 1077 #endif 1078 } 1079 */ 1080 PetscCall(PetscObjectGetName((PetscObject)A,&name)); 1081 PetscCall(PetscViewerASCIIPrintf(viewer,"];\n %s = spconvert(zzz);\n",name)); 1082 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE)); 1083 } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) { 1084 PetscFunctionReturn(0); 1085 } else if (format == PETSC_VIEWER_ASCII_COMMON) { 1086 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE)); 1087 for (i=0; i<m; i++) { 1088 PetscCall(PetscViewerASCIIPrintf(viewer,"row %" PetscInt_FMT ":",i)); 1089 shift = a->sliidx[i>>3]+(i&0x07); 1090 for (j=0; j<a->rlen[i]; j++) { 1091 #if defined(PETSC_USE_COMPLEX) 1092 if (PetscImaginaryPart(a->val[shift+8*j]) > 0.0 && PetscRealPart(a->val[shift+8*j]) != 0.0) { 1093 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]))); 1094 } else if (PetscImaginaryPart(a->val[shift+8*j]) < 0.0 && PetscRealPart(a->val[shift+8*j]) != 0.0) { 1095 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)-PetscImaginaryPart(a->val[shift+8*j]))); 1096 } else if 
(PetscRealPart(a->val[shift+8*j]) != 0.0) { 1097 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]))); 1098 } 1099 #else 1100 if (a->val[shift+8*j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)a->val[shift+8*j])); 1101 #endif 1102 } 1103 PetscCall(PetscViewerASCIIPrintf(viewer,"\n")); 1104 } 1105 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE)); 1106 } else if (format == PETSC_VIEWER_ASCII_DENSE) { 1107 PetscInt cnt=0,jcnt; 1108 PetscScalar value; 1109 #if defined(PETSC_USE_COMPLEX) 1110 PetscBool realonly=PETSC_TRUE; 1111 for (i=0; i<a->sliidx[a->totalslices]; i++) { 1112 if (PetscImaginaryPart(a->val[i]) != 0.0) { 1113 realonly = PETSC_FALSE; 1114 break; 1115 } 1116 } 1117 #endif 1118 1119 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE)); 1120 for (i=0; i<m; i++) { 1121 jcnt = 0; 1122 shift = a->sliidx[i>>3]+(i&0x07); 1123 for (j=0; j<A->cmap->n; j++) { 1124 if (jcnt < a->rlen[i] && j == a->colidx[shift+8*j]) { 1125 value = a->val[cnt++]; 1126 jcnt++; 1127 } else { 1128 value = 0.0; 1129 } 1130 #if defined(PETSC_USE_COMPLEX) 1131 if (realonly) { 1132 PetscCall(PetscViewerASCIIPrintf(viewer," %7.5e ",(double)PetscRealPart(value))); 1133 } else { 1134 PetscCall(PetscViewerASCIIPrintf(viewer," %7.5e+%7.5e i ",(double)PetscRealPart(value),(double)PetscImaginaryPart(value))); 1135 } 1136 #else 1137 PetscCall(PetscViewerASCIIPrintf(viewer," %7.5e ",(double)value)); 1138 #endif 1139 } 1140 PetscCall(PetscViewerASCIIPrintf(viewer,"\n")); 1141 } 1142 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE)); 1143 } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) { 1144 PetscInt fshift=1; 1145 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE)); 1146 #if defined(PETSC_USE_COMPLEX) 1147 PetscCall(PetscViewerASCIIPrintf(viewer,"%%%%MatrixMarket matrix coordinate complex general\n")); 1148 #else 1149 
PetscCall(PetscViewerASCIIPrintf(viewer,"%%%%MatrixMarket matrix coordinate real general\n")); 1150 #endif 1151 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz)); 1152 for (i=0; i<m; i++) { 1153 shift = a->sliidx[i>>3]+(i&0x07); 1154 for (j=0; j<a->rlen[i]; j++) { 1155 #if defined(PETSC_USE_COMPLEX) 1156 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n",i+fshift,a->colidx[shift+8*j]+fshift,(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]))); 1157 #else 1158 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %g\n",i+fshift,a->colidx[shift+8*j]+fshift,(double)a->val[shift+8*j])); 1159 #endif 1160 } 1161 } 1162 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE)); 1163 } else if (format == PETSC_VIEWER_NATIVE) { 1164 for (i=0; i<a->totalslices; i++) { /* loop over slices */ 1165 PetscInt row; 1166 PetscCall(PetscViewerASCIIPrintf(viewer,"slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n",i,a->sliidx[i],a->sliidx[i+1])); 1167 for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) { 1168 #if defined(PETSC_USE_COMPLEX) 1169 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1170 PetscCall(PetscViewerASCIIPrintf(viewer," %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]))); 1171 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1172 PetscCall(PetscViewerASCIIPrintf(viewer," %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]),-(double)PetscImaginaryPart(a->val[j]))); 1173 } else { 1174 PetscCall(PetscViewerASCIIPrintf(viewer," %" PetscInt_FMT " %" PetscInt_FMT " %g\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]))); 1175 } 1176 #else 1177 PetscCall(PetscViewerASCIIPrintf(viewer," %" PetscInt_FMT " %" PetscInt_FMT " 
%g\n",8*i+row,a->colidx[j],(double)a->val[j])); 1178 #endif 1179 } 1180 } 1181 } else { 1182 PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE)); 1183 if (A->factortype) { 1184 for (i=0; i<m; i++) { 1185 shift = a->sliidx[i>>3]+(i&0x07); 1186 PetscCall(PetscViewerASCIIPrintf(viewer,"row %" PetscInt_FMT ":",i)); 1187 /* L part */ 1188 for (j=shift; j<a->diag[i]; j+=8) { 1189 #if defined(PETSC_USE_COMPLEX) 1190 if (PetscImaginaryPart(a->val[shift+8*j]) > 0.0) { 1191 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]))); 1192 } else if (PetscImaginaryPart(a->val[shift+8*j]) < 0.0) { 1193 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)(-PetscImaginaryPart(a->val[j])))); 1194 } else { 1195 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)PetscRealPart(a->val[j]))); 1196 } 1197 #else 1198 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)a->val[j])); 1199 #endif 1200 } 1201 /* diagonal */ 1202 j = a->diag[i]; 1203 #if defined(PETSC_USE_COMPLEX) 1204 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1205 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]),(double)PetscImaginaryPart(1.0/a->val[j]))); 1206 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1207 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]),(double)(-PetscImaginaryPart(1.0/a->val[j])))); 1208 } else { 1209 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]))); 1210 } 1211 #else 1212 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)(1.0/a->val[j]))); 1213 #endif 1214 1215 /* U part */ 1216 for 
(j=a->diag[i]+1; j<shift+8*a->rlen[i]; j+=8) { 1217 #if defined(PETSC_USE_COMPLEX) 1218 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1219 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]))); 1220 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1221 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)(-PetscImaginaryPart(a->val[j])))); 1222 } else { 1223 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)PetscRealPart(a->val[j]))); 1224 } 1225 #else 1226 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)a->val[j])); 1227 #endif 1228 } 1229 PetscCall(PetscViewerASCIIPrintf(viewer,"\n")); 1230 } 1231 } else { 1232 for (i=0; i<m; i++) { 1233 shift = a->sliidx[i>>3]+(i&0x07); 1234 PetscCall(PetscViewerASCIIPrintf(viewer,"row %" PetscInt_FMT ":",i)); 1235 for (j=0; j<a->rlen[i]; j++) { 1236 #if defined(PETSC_USE_COMPLEX) 1237 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1238 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]))); 1239 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1240 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)-PetscImaginaryPart(a->val[shift+8*j]))); 1241 } else { 1242 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]))); 1243 } 1244 #else 1245 PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)a->val[shift+8*j])); 1246 #endif 1247 } 1248 PetscCall(PetscViewerASCIIPrintf(viewer,"\n")); 1249 } 1250 } 1251 
PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE)); 1252 } 1253 PetscCall(PetscViewerFlush(viewer)); 1254 PetscFunctionReturn(0); 1255 } 1256 1257 #include <petscdraw.h> 1258 PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw,void *Aa) 1259 { 1260 Mat A=(Mat)Aa; 1261 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1262 PetscInt i,j,m=A->rmap->n,shift; 1263 int color; 1264 PetscReal xl,yl,xr,yr,x_l,x_r,y_l,y_r; 1265 PetscViewer viewer; 1266 PetscViewerFormat format; 1267 PetscErrorCode ierr; 1268 1269 PetscFunctionBegin; 1270 PetscCall(PetscObjectQuery((PetscObject)A,"Zoomviewer",(PetscObject*)&viewer)); 1271 PetscCall(PetscViewerGetFormat(viewer,&format)); 1272 PetscCall(PetscDrawGetCoordinates(draw,&xl,&yl,&xr,&yr)); 1273 1274 /* loop over matrix elements drawing boxes */ 1275 1276 if (format != PETSC_VIEWER_DRAW_CONTOUR) { 1277 ierr = PetscDrawCollectiveBegin(draw);PetscCall(ierr); 1278 /* Blue for negative, Cyan for zero and Red for positive */ 1279 color = PETSC_DRAW_BLUE; 1280 for (i=0; i<m; i++) { 1281 shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */ 1282 y_l = m - i - 1.0; y_r = y_l + 1.0; 1283 for (j=0; j<a->rlen[i]; j++) { 1284 x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0; 1285 if (PetscRealPart(a->val[shift+8*j]) >= 0.) continue; 1286 PetscCall(PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color)); 1287 } 1288 } 1289 color = PETSC_DRAW_CYAN; 1290 for (i=0; i<m; i++) { 1291 shift = a->sliidx[i>>3]+(i&0x07); 1292 y_l = m - i - 1.0; y_r = y_l + 1.0; 1293 for (j=0; j<a->rlen[i]; j++) { 1294 x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0; 1295 if (a->val[shift+8*j] != 0.) 
continue; 1296 PetscCall(PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color)); 1297 } 1298 } 1299 color = PETSC_DRAW_RED; 1300 for (i=0; i<m; i++) { 1301 shift = a->sliidx[i>>3]+(i&0x07); 1302 y_l = m - i - 1.0; y_r = y_l + 1.0; 1303 for (j=0; j<a->rlen[i]; j++) { 1304 x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0; 1305 if (PetscRealPart(a->val[shift+8*j]) <= 0.) continue; 1306 PetscCall(PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color)); 1307 } 1308 } 1309 ierr = PetscDrawCollectiveEnd(draw);PetscCall(ierr); 1310 } else { 1311 /* use contour shading to indicate magnitude of values */ 1312 /* first determine max of all nonzero values */ 1313 PetscReal minv=0.0,maxv=0.0; 1314 PetscInt count=0; 1315 PetscDraw popup; 1316 for (i=0; i<a->sliidx[a->totalslices]; i++) { 1317 if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]); 1318 } 1319 if (minv >= maxv) maxv = minv + PETSC_SMALL; 1320 PetscCall(PetscDrawGetPopup(draw,&popup)); 1321 PetscCall(PetscDrawScalePopup(popup,minv,maxv)); 1322 1323 ierr = PetscDrawCollectiveBegin(draw);PetscCall(ierr); 1324 for (i=0; i<m; i++) { 1325 shift = a->sliidx[i>>3]+(i&0x07); 1326 y_l = m - i - 1.0; 1327 y_r = y_l + 1.0; 1328 for (j=0; j<a->rlen[i]; j++) { 1329 x_l = a->colidx[shift+j*8]; 1330 x_r = x_l + 1.0; 1331 color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]),minv,maxv); 1332 PetscCall(PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color)); 1333 count++; 1334 } 1335 } 1336 ierr = PetscDrawCollectiveEnd(draw);PetscCall(ierr); 1337 } 1338 PetscFunctionReturn(0); 1339 } 1340 1341 #include <petscdraw.h> 1342 PetscErrorCode MatView_SeqSELL_Draw(Mat A,PetscViewer viewer) 1343 { 1344 PetscDraw draw; 1345 PetscReal xr,yr,xl,yl,h,w; 1346 PetscBool isnull; 1347 1348 PetscFunctionBegin; 1349 PetscCall(PetscViewerDrawGetDraw(viewer,0,&draw)); 1350 PetscCall(PetscDrawIsNull(draw,&isnull)); 1351 if (isnull) PetscFunctionReturn(0); 1352 1353 xr = A->cmap->n; yr = A->rmap->n; h = 
yr/10.0; w = xr/10.0; 1354 xr += w; yr += h; xl = -w; yl = -h; 1355 PetscCall(PetscDrawSetCoordinates(draw,xl,yl,xr,yr)); 1356 PetscCall(PetscObjectCompose((PetscObject)A,"Zoomviewer",(PetscObject)viewer)); 1357 PetscCall(PetscDrawZoom(draw,MatView_SeqSELL_Draw_Zoom,A)); 1358 PetscCall(PetscObjectCompose((PetscObject)A,"Zoomviewer",NULL)); 1359 PetscCall(PetscDrawSave(draw)); 1360 PetscFunctionReturn(0); 1361 } 1362 1363 PetscErrorCode MatView_SeqSELL(Mat A,PetscViewer viewer) 1364 { 1365 PetscBool iascii,isbinary,isdraw; 1366 1367 PetscFunctionBegin; 1368 PetscCall(PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii)); 1369 PetscCall(PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERBINARY,&isbinary)); 1370 PetscCall(PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERDRAW,&isdraw)); 1371 if (iascii) { 1372 PetscCall(MatView_SeqSELL_ASCII(A,viewer)); 1373 } else if (isbinary) { 1374 /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */ 1375 } else if (isdraw) { 1376 PetscCall(MatView_SeqSELL_Draw(A,viewer)); 1377 } 1378 PetscFunctionReturn(0); 1379 } 1380 1381 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A,MatAssemblyType mode) 1382 { 1383 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1384 PetscInt i,shift,row_in_slice,row,nrow,*cp,lastcol,j,k; 1385 MatScalar *vp; 1386 1387 PetscFunctionBegin; 1388 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 1389 /* To do: compress out the unused elements */ 1390 PetscCall(MatMarkDiagonal_SeqSELL(A)); 1391 PetscCall(PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n",A->rmap->n,A->cmap->n,a->maxallocmat,a->sliidx[a->totalslices],a->nz,a->sliidx[a->totalslices]-a->nz)); 1392 PetscCall(PetscInfo(A,"Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n",a->reallocs)); 1393 PetscCall(PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rlenmax)); 1394 
/* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */ 1395 for (i=0; i<a->totalslices; ++i) { 1396 shift = a->sliidx[i]; /* starting index of the slice */ 1397 cp = a->colidx+shift; /* pointer to the column indices of the slice */ 1398 vp = a->val+shift; /* pointer to the nonzero values of the slice */ 1399 for (row_in_slice=0; row_in_slice<8; ++row_in_slice) { /* loop over rows in the slice */ 1400 row = 8*i + row_in_slice; 1401 nrow = a->rlen[row]; /* number of nonzeros in row */ 1402 /* 1403 Search for the nearest nonzero. Normally setting the index to zero may cause extra communication. 1404 But if the entire slice are empty, it is fine to use 0 since the index will not be loaded. 1405 */ 1406 lastcol = 0; 1407 if (nrow>0) { /* nonempty row */ 1408 lastcol = cp[8*(nrow-1)+row_in_slice]; /* use the index from the last nonzero at current row */ 1409 } else if (!row_in_slice) { /* first row of the currect slice is empty */ 1410 for (j=1;j<8;j++) { 1411 if (a->rlen[8*i+j]) { 1412 lastcol = cp[j]; 1413 break; 1414 } 1415 } 1416 } else { 1417 if (a->sliidx[i+1] != shift) lastcol = cp[row_in_slice-1]; /* use the index from the previous row */ 1418 } 1419 1420 for (k=nrow; k<(a->sliidx[i+1]-shift)/8; ++k) { 1421 cp[8*k+row_in_slice] = lastcol; 1422 vp[8*k+row_in_slice] = (MatScalar)0; 1423 } 1424 } 1425 } 1426 1427 A->info.mallocs += a->reallocs; 1428 a->reallocs = 0; 1429 1430 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 1431 PetscFunctionReturn(0); 1432 } 1433 1434 PetscErrorCode MatGetInfo_SeqSELL(Mat A,MatInfoType flag,MatInfo *info) 1435 { 1436 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1437 1438 PetscFunctionBegin; 1439 info->block_size = 1.0; 1440 info->nz_allocated = a->maxallocmat; 1441 info->nz_used = a->sliidx[a->totalslices]; /* include padding zeros */ 1442 info->nz_unneeded = (a->maxallocmat-a->sliidx[a->totalslices]); 1443 info->assemblies = 
A->num_ass; 1444 info->mallocs = A->info.mallocs; 1445 info->memory = ((PetscObject)A)->mem; 1446 if (A->factortype) { 1447 info->fill_ratio_given = A->info.fill_ratio_given; 1448 info->fill_ratio_needed = A->info.fill_ratio_needed; 1449 info->factor_mallocs = A->info.factor_mallocs; 1450 } else { 1451 info->fill_ratio_given = 0; 1452 info->fill_ratio_needed = 0; 1453 info->factor_mallocs = 0; 1454 } 1455 PetscFunctionReturn(0); 1456 } 1457 1458 PetscErrorCode MatSetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is) 1459 { 1460 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1461 PetscInt shift,i,k,l,low,high,t,ii,row,col,nrow; 1462 PetscInt *cp,nonew=a->nonew,lastcol=-1; 1463 MatScalar *vp,value; 1464 1465 PetscFunctionBegin; 1466 for (k=0; k<m; k++) { /* loop over added rows */ 1467 row = im[k]; 1468 if (row < 0) continue; 1469 PetscCheck(row < A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT,row,A->rmap->n-1); 1470 shift = a->sliidx[row>>3]+(row&0x07); /* starting index of the row */ 1471 cp = a->colidx+shift; /* pointer to the row */ 1472 vp = a->val+shift; /* pointer to the row */ 1473 nrow = a->rlen[row]; 1474 low = 0; 1475 high = nrow; 1476 1477 for (l=0; l<n; l++) { /* loop over added columns */ 1478 col = in[l]; 1479 if (col<0) continue; 1480 PetscCheck(col < A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT,col,A->cmap->n-1); 1481 if (a->roworiented) { 1482 value = v[l+k*n]; 1483 } else { 1484 value = v[k+l*m]; 1485 } 1486 if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue; 1487 1488 /* search in this row for the specified column, i indicates the column to be set */ 1489 if (col <= lastcol) low = 0; 1490 else high = nrow; 1491 lastcol = col; 1492 while (high-low > 5) { 1493 t = (low+high)/2; 1494 if (*(cp+t*8) > col) high = t; 1495 else low = t; 1496 } 
1497 for (i=low; i<high; i++) { 1498 if (*(cp+i*8) > col) break; 1499 if (*(cp+i*8) == col) { 1500 if (is == ADD_VALUES) *(vp+i*8) += value; 1501 else *(vp+i*8) = value; 1502 low = i + 1; 1503 goto noinsert; 1504 } 1505 } 1506 if (value == 0.0 && a->ignorezeroentries) goto noinsert; 1507 if (nonew == 1) goto noinsert; 1508 PetscCheckFalse(nonew == -1,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col); 1509 /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */ 1510 MatSeqXSELLReallocateSELL(A,A->rmap->n,1,nrow,a->sliidx,row/8,row,col,a->colidx,a->val,cp,vp,nonew,MatScalar); 1511 /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */ 1512 for (ii=nrow-1; ii>=i; ii--) { 1513 *(cp+(ii+1)*8) = *(cp+ii*8); 1514 *(vp+(ii+1)*8) = *(vp+ii*8); 1515 } 1516 a->rlen[row]++; 1517 *(cp+i*8) = col; 1518 *(vp+i*8) = value; 1519 a->nz++; 1520 A->nonzerostate++; 1521 low = i+1; high++; nrow++; 1522 noinsert:; 1523 } 1524 a->rlen[row] = nrow; 1525 } 1526 PetscFunctionReturn(0); 1527 } 1528 1529 PetscErrorCode MatCopy_SeqSELL(Mat A,Mat B,MatStructure str) 1530 { 1531 PetscFunctionBegin; 1532 /* If the two matrices have the same copy implementation, use fast copy. 
*/ 1533 if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) { 1534 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1535 Mat_SeqSELL *b=(Mat_SeqSELL*)B->data; 1536 1537 PetscCheckFalse(a->sliidx[a->totalslices] != b->sliidx[b->totalslices],PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Number of nonzeros in two matrices are different"); 1538 PetscCall(PetscArraycpy(b->val,a->val,a->sliidx[a->totalslices])); 1539 } else { 1540 PetscCall(MatCopy_Basic(A,B,str)); 1541 } 1542 PetscFunctionReturn(0); 1543 } 1544 1545 PetscErrorCode MatSetUp_SeqSELL(Mat A) 1546 { 1547 PetscFunctionBegin; 1548 PetscCall(MatSeqSELLSetPreallocation(A,PETSC_DEFAULT,NULL)); 1549 PetscFunctionReturn(0); 1550 } 1551 1552 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A,PetscScalar *array[]) 1553 { 1554 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1555 1556 PetscFunctionBegin; 1557 *array = a->val; 1558 PetscFunctionReturn(0); 1559 } 1560 1561 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A,PetscScalar *array[]) 1562 { 1563 PetscFunctionBegin; 1564 PetscFunctionReturn(0); 1565 } 1566 1567 PetscErrorCode MatRealPart_SeqSELL(Mat A) 1568 { 1569 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1570 PetscInt i; 1571 MatScalar *aval=a->val; 1572 1573 PetscFunctionBegin; 1574 for (i=0; i<a->sliidx[a->totalslices]; i++) aval[i]=PetscRealPart(aval[i]); 1575 PetscFunctionReturn(0); 1576 } 1577 1578 PetscErrorCode MatImaginaryPart_SeqSELL(Mat A) 1579 { 1580 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1581 PetscInt i; 1582 MatScalar *aval=a->val; 1583 1584 PetscFunctionBegin; 1585 for (i=0; i<a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]); 1586 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 1587 PetscFunctionReturn(0); 1588 } 1589 1590 PetscErrorCode MatScale_SeqSELL(Mat inA,PetscScalar alpha) 1591 { 1592 Mat_SeqSELL *a=(Mat_SeqSELL*)inA->data; 1593 MatScalar *aval=a->val; 1594 PetscScalar oalpha=alpha; 1595 PetscBLASInt one=1,size; 1596 1597 PetscFunctionBegin; 1598 
  PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices],&size));
  PetscStackCallBLAS("BLASscal",BLASscal_(&size,&oalpha,aval,&one));
  PetscCall(PetscLogFlops(a->nz));
  PetscCall(MatSeqSELLInvalidateDiagonal(inA));
  PetscFunctionReturn(0);
}

/*
   Shifts the matrix: Y = Y + a*I.

   If the matrix has no preallocated storage yet, reserve one nonzero per row
   (enough for the diagonal), then defer to the format-independent MatShift_Basic().
*/
PetscErrorCode MatShift_SeqSELL(Mat Y,PetscScalar a)
{
  Mat_SeqSELL *y=(Mat_SeqSELL*)Y->data;

  PetscFunctionBegin;
  if (!Y->preallocated || !y->nz) {
    /* no storage yet: make room for the diagonal entries that MatShift_Basic() will insert */
    PetscCall(MatSeqSELLSetPreallocation(Y,1,NULL));
  }
  PetscCall(MatShift_Basic(Y,a));
  PetscFunctionReturn(0);
}

/*
   (S)SOR relaxation sweeps for the SELL format.

   Storage layout used below (slice height is hard-wired to 8): row i belongs to
   slice i>>3 and occupies lane i&0x07 of that slice. Entries inside a slice are
   stored column-major, so consecutive stored entries of one row are 8 apart in
   val[]/colidx[]; the k-th stored entry of row i is val[shift+k*8] with
   shift = sliidx[i>>3]+(i&0x07). diag[i] is the position of row i's diagonal
   entry in val[]/colidx[], hence (diag[i]-shift)/8 entries lie before the
   diagonal (lower triangle) and rlen[i]-(diag[i]-shift)/8-1 after it (upper
   triangle). idiag[] is the inverted diagonal produced by
   MatInvertDiagonal_SeqSELL(A,omega,fshift) — presumably it folds in omega and
   fshift (see the "omega in idiag" notes below); mdiag[] holds the plain
   diagonal entries. t (ssor_work) caches b - (lower triangle)*x between the
   forward and backward halves of a sweep.
*/
PetscErrorCode MatSOR_SeqSELL(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
{
  Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
  PetscScalar       *x,sum,*t;
  const MatScalar   *idiag=NULL,*mdiag;
  const PetscScalar *b,*xb;
  PetscInt          n,m=A->rmap->n,i,j,shift;
  const PetscInt    *diag;

  PetscFunctionBegin;
  its = its*lits; /* total number of local sweeps */

  if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */
  if (!a->idiagvalid) PetscCall(MatInvertDiagonal_SeqSELL(A,omega,fshift));
  a->fshift = fshift;
  a->omega  = omega;

  diag  = a->diag;
  t     = a->ssor_work;
  idiag = a->idiag;
  mdiag = a->mdiag;

  PetscCall(VecGetArray(xx,&x));
  PetscCall(VecGetArrayRead(bb,&b));
  /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
  PetscCheckFalse(flag == SOR_APPLY_UPPER,PETSC_COMM_SELF,PETSC_ERR_SUP,"SOR_APPLY_UPPER is not implemented");
  PetscCheckFalse(flag == SOR_APPLY_LOWER,PETSC_COMM_SELF,PETSC_ERR_SUP,"SOR_APPLY_LOWER is not implemented");
  PetscCheckFalse(flag & SOR_EISENSTAT,PETSC_COMM_SELF,PETSC_ERR_SUP,"No support yet for Eisenstat");

  if (flag & SOR_ZERO_INITIAL_GUESS) {
    /* first sweep with x==0: the triangular solves simplify, handled specially */
    if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
      for (i=0; i<m; i++) {
        shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
        sum   = b[i];
        n     = (diag[i]-shift)/8;        /* number of entries before the diagonal */
        for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
        t[i] = sum;                       /* save b - L*x for the backward half-sweep */
        x[i] = sum*idiag[i];
      }
      xb = t;
      PetscCall(PetscLogFlops(a->nz));
    } else xb = b;
    if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
      for (i=m-1; i>=0; i--) {
        shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
        sum   = xb[i];
        n     = a->rlen[i]-(diag[i]-shift)/8-1; /* number of entries after the diagonal */
        for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
        if (xb == b) {
          x[i] = sum*idiag[i];
        } else {
          x[i] = (1.-omega)*x[i]+sum*idiag[i]; /* omega in idiag */
        }
      }
      PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
    }
    its--;
  }
  while (its--) {
    if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
      for (i=0; i<m; i++) {
        /* lower */
        shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
        sum   = b[i];
        n     = (diag[i]-shift)/8;
        for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
        t[i] = sum;             /* save application of the lower-triangular part */
        /* upper */
        n = a->rlen[i]-(diag[i]-shift)/8-1;
        for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
        x[i] = (1.-omega)*x[i]+sum*idiag[i]; /* omega in idiag */
      }
      xb = t;
      PetscCall(PetscLogFlops(2.0*a->nz));
    } else xb = b;
    if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
      for (i=m-1; i>=0; i--) {
        shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
        sum   = xb[i];
        if (xb == b) {
          /* whole matrix (no checkpointing available) */
          n = a->rlen[i];
          for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
          /* sum now includes -d_i*x_i; add it back via mdiag before scaling */
          x[i] = (1.-omega)*x[i]+(sum+mdiag[i]*x[i])*idiag[i];
        } else { /* lower-triangular part has been saved, so only apply upper-triangular */
          n = a->rlen[i]-(diag[i]-shift)/8-1;
          for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
          x[i] = (1.-omega)*x[i]+sum*idiag[i]; /* omega in idiag */
        }
      }
      if (xb == b) {
        PetscCall(PetscLogFlops(2.0*a->nz));
      } else {
        PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
      }
    }
  }
  PetscCall(VecRestoreArray(xx,&x));
  PetscCall(VecRestoreArrayRead(bb,&b));
  PetscFunctionReturn(0);
}

/* -------------------------------------------------------------------*/
/* Function table for MATSEQSELL; the numbered comments give the slot index in
   struct _MatOps, so the order of entries here must not be changed. */
static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL,
                                       MatGetRow_SeqSELL,
                                       MatRestoreRow_SeqSELL,
                                       MatMult_SeqSELL,
                               /* 4*/  MatMultAdd_SeqSELL,
                                       MatMultTranspose_SeqSELL,
                                       MatMultTransposeAdd_SeqSELL,
                                       NULL,
                                       NULL,
                                       NULL,
                               /* 10*/ NULL,
                                       NULL,
                                       NULL,
                                       MatSOR_SeqSELL,
                                       NULL,
                               /* 15*/ MatGetInfo_SeqSELL,
                                       MatEqual_SeqSELL,
                                       MatGetDiagonal_SeqSELL,
                                       MatDiagonalScale_SeqSELL,
                                       NULL,
                               /* 20*/ NULL,
                                       MatAssemblyEnd_SeqSELL,
                                       MatSetOption_SeqSELL,
                                       MatZeroEntries_SeqSELL,
                               /* 24*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                               /* 29*/ MatSetUp_SeqSELL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                               /* 34*/ MatDuplicate_SeqSELL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                               /* 39*/ NULL,
                                       NULL,
                                       NULL,
                                       MatGetValues_SeqSELL,
                                       MatCopy_SeqSELL,
                               /* 44*/ NULL,
                                       MatScale_SeqSELL,
                                       MatShift_SeqSELL,
                                       NULL,
                                       NULL,
                               /* 49*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                               /* 54*/ MatFDColoringCreate_SeqXAIJ,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                               /* 59*/ NULL,
                                       MatDestroy_SeqSELL,
                                       MatView_SeqSELL,
                                       NULL,
                                       NULL,
                               /* 64*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                               /* 69*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                               /* 74*/ NULL,
                               /* 75*/ MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */
                                       NULL,
                                       NULL,
                                       NULL,
                               /* 79*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                               /* 84*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                               /* 89*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                               /* 94*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                               /* 99*/ NULL,
                                       NULL,
                                       NULL,
                                       MatConjugate_SeqSELL,
                                       NULL,
                               /*104*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                               /*109*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       MatMissingDiagonal_SeqSELL,
                               /*114*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                               /*119*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                               /*124*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                               /*129*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                               /*134*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                               /*139*/ NULL,
                                       NULL,
                                       NULL,
                                       MatFDColoringSetUp_SeqXAIJ,
                                       NULL,
                               /*144*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL
};

/*
   Saves a copy of the current matrix values so that they can be reloaded later
   with MatRetrieveValues_SeqSELL(). Requires that the nonzero pattern is frozen
   (MAT_NEW_NONZERO_LOCATIONS set to PETSC_FALSE) so the size of val[] cannot change.
*/
PetscErrorCode MatStoreValues_SeqSELL(Mat mat)
{
  Mat_SeqSELL *a=(Mat_SeqSELL*)mat->data;

  PetscFunctionBegin;
  PetscCheck(a->nonew,PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");

  /* allocate space for values if not already there */
  if (!a->saved_values) {
    PetscCall(PetscMalloc1(a->sliidx[a->totalslices]+1,&a->saved_values));
    PetscCall(PetscLogObjectMemory((PetscObject)mat,(a->sliidx[a->totalslices]+1)*sizeof(PetscScalar)));
  }

  /* copy values over; sliidx[totalslices] is the total number of stored entries */
  PetscCall(PetscArraycpy(a->saved_values,a->val,a->sliidx[a->totalslices]));
  PetscFunctionReturn(0);
}

/*
   Restores the matrix values previously saved with MatStoreValues_SeqSELL().
*/
PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat)
{
  Mat_SeqSELL *a=(Mat_SeqSELL*)mat->data;

  PetscFunctionBegin;
  PetscCheck(a->nonew,PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
  PetscCheck(a->saved_values,PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatStoreValues(A);first");
  PetscCall(PetscArraycpy(a->val,a->saved_values,a->sliidx[a->totalslices]));
  PetscFunctionReturn(0);
}

/*@C
   MatSeqSELLRestoreArray - returns access to the array where the data for a MATSEQSELL matrix is stored obtained by MatSeqSELLGetArray()

   Not Collective

   Input Parameters:
+  A - a MATSEQSELL matrix
-  array - pointer to the data obtained from MatSeqSELLGetArray()

   Level: intermediate

.seealso: MatSeqSELLGetArray(), MatSeqSELLRestoreArrayF90()
@*/
PetscErrorCode MatSeqSELLRestoreArray(Mat A,PetscScalar **array)
{
  PetscFunctionBegin;
  PetscUseMethod(A,"MatSeqSELLRestoreArray_C",(Mat,PetscScalar**),(A,array));
  PetscFunctionReturn(0);
}

/*
   Type constructor for MATSEQSELL: installs the function table, zeroes the
   format-specific data structure, and registers the type-specific methods so
   the generic Mat interface can dispatch to them.
*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B)
{
  Mat_SeqSELL *b;
  PetscMPIInt size;

  PetscFunctionBegin;
  PetscCall(PetscCitationsRegister(citation,&cited));
  /* a sequential type: only legal on a single-process communicator */
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B),&size));
  PetscCheckFalse(size > 1,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Comm must be of size 1");

  PetscCall(PetscNewLog(B,&b));

  B->data = (void*)b;

  PetscCall(PetscMemcpy(B->ops,&MatOps_Values,sizeof(struct _MatOps)));

  b->row                = NULL;
  b->col                = NULL;
  b->icol               = NULL;
  b->reallocs           = 0;
  b->ignorezeroentries  = PETSC_FALSE;
  b->roworiented        = PETSC_TRUE;
  b->nonew              = 0;
  b->diag               = NULL;
  b->solve_work         = NULL;
  B->spptr              = NULL;
  b->saved_values       = NULL;
  b->idiag              = NULL;
  b->mdiag              = NULL;
  b->ssor_work          = NULL;
  b->omega              = 1.0;
  b->fshift             = 0.0;
  b->idiagvalid         = PETSC_FALSE;
  b->keepnonzeropattern = PETSC_FALSE;

  PetscCall(PetscObjectChangeTypeName((PetscObject)B,MATSEQSELL));
  /* register type-specific methods reachable via PetscUseMethod()/MatConvert() */
  PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLGetArray_C",MatSeqSELLGetArray_SeqSELL));
  PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLRestoreArray_C",MatSeqSELLRestoreArray_SeqSELL));
  PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatStoreValues_C",MatStoreValues_SeqSELL));
  PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatRetrieveValues_C",MatRetrieveValues_SeqSELL));
  PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLSetPreallocation_C",MatSeqSELLSetPreallocation_SeqSELL));
  PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqsell_seqaij_C",MatConvert_SeqSELL_SeqAIJ));
  PetscFunctionReturn(0);
}

/*
   Given a matrix generated with MatGetFactor() duplicates all the information in A into B
*/
PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C,Mat A,MatDuplicateOption cpvalues,PetscBool mallocmatspace)
{
  Mat_SeqSELL *c = (Mat_SeqSELL*)C->data,*a = (Mat_SeqSELL*)A->data;
  PetscInt    i,m=A->rmap->n;
  PetscInt    totalslices=a->totalslices;

  PetscFunctionBegin;
  C->factortype = A->factortype;
  c->row        = NULL;
  c->col        = NULL;
  c->icol       = NULL;
  c->reallocs   = 0;
  C->assembled  = PETSC_TRUE;

  PetscCall(PetscLayoutReference(A->rmap,&C->rmap));
  PetscCall(PetscLayoutReference(A->cmap,&C->cmap));

  /* rlen is allocated to the padded size 8*totalslices (8 rows per slice) */
  PetscCall(PetscMalloc1(8*totalslices,&c->rlen));
  PetscCall(PetscLogObjectMemory((PetscObject)C,m*sizeof(PetscInt)));
  PetscCall(PetscMalloc1(totalslices+1,&c->sliidx));
  PetscCall(PetscLogObjectMemory((PetscObject)C, (totalslices+1)*sizeof(PetscInt)));

  for (i=0; i<m; i++) c->rlen[i] = a->rlen[i];
  for (i=0; i<totalslices+1; i++) c->sliidx[i] = a->sliidx[i];

  /* allocate the matrix space */
  if (mallocmatspace) {
    PetscCall(PetscMalloc2(a->maxallocmat,&c->val,a->maxallocmat,&c->colidx));
    PetscCall(PetscLogObjectMemory((PetscObject)C,a->maxallocmat*(sizeof(PetscScalar)+sizeof(PetscInt))));

    c->singlemalloc = PETSC_TRUE;

    if (m > 0) {
      /* colidx is always copied; val is copied or zeroed depending on cpvalues */
      PetscCall(PetscArraycpy(c->colidx,a->colidx,a->maxallocmat));
      if (cpvalues == MAT_COPY_VALUES) {
        PetscCall(PetscArraycpy(c->val,a->val,a->maxallocmat));
      } else {
        PetscCall(PetscArrayzero(c->val,a->maxallocmat));
      }
    }
  }

  c->ignorezeroentries = a->ignorezeroentries;
  c->roworiented       = a->roworiented;
  c->nonew             = a->nonew;
  if (a->diag) {
    PetscCall(PetscMalloc1(m,&c->diag));
    PetscCall(PetscLogObjectMemory((PetscObject)C,m*sizeof(PetscInt)));
    for (i=0; i<m; i++) {
      c->diag[i] = a->diag[i];
    }
  } else c->diag = NULL;

  /* derived work arrays are rebuilt lazily on the duplicate, not copied */
  c->solve_work         = NULL;
  c->saved_values       = NULL;
  c->idiag              = NULL;
  c->ssor_work          = NULL;
  c->keepnonzeropattern = a->keepnonzeropattern;
  c->free_val           = PETSC_TRUE;
  c->free_colidx        = PETSC_TRUE;

  c->maxallocmat  = a->maxallocmat;
  c->maxallocrow  = a->maxallocrow;
  c->rlenmax      = a->rlenmax;
  c->nz           = a->nz;
  C->preallocated = PETSC_TRUE;

  c->nonzerorowcnt = a->nonzerorowcnt;
  C->nonzerostate  = A->nonzerostate;

  PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist,&((PetscObject)C)->qlist));
  PetscFunctionReturn(0);
}

/*
   Duplicates a MATSEQSELL matrix: creates the new Mat object of the same type
   and size, then copies all format-specific data via MatDuplicateNoCreate_SeqSELL().
*/
PetscErrorCode MatDuplicate_SeqSELL(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B));
  PetscCall(MatSetSizes(*B,A->rmap->n,A->cmap->n,A->rmap->n,A->cmap->n));
  /* only propagate block sizes when they evenly divide the dimensions */
  if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) {
    PetscCall(MatSetBlockSizesFromMats(*B,A,A));
  }
  PetscCall(MatSetType(*B,((PetscObject)A)->type_name));
  PetscCall(MatDuplicateNoCreate_SeqSELL(*B,A,cpvalues,PETSC_TRUE));
  PetscFunctionReturn(0);
}

/*MC
   MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices,
   based on the sliced Ellpack format

   Options Database Keys:
. -mat_type seqsell - sets the matrix type to "seqsell" during a call to MatSetFromOptions()

   Level: beginner

.seealso: MatCreateSeqSELL(), MATSELL, MATMPISELL, MATSEQAIJ, MATAIJ, MATMPIAIJ
M*/

/*MC
   MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices.

   This matrix type is identical to MATSEQSELL when constructed with a single process communicator,
   and MATMPISELL otherwise.  As a result, for single process communicators,
   MatSeqSELLSetPreallocation() is supported, and similarly MatMPISELLSetPreallocation() is supported
   for communicators controlling multiple processes.  It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
. -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions()

   Level: beginner

   Notes:
   This format is only supported for real scalars, double precision, and 32 bit indices (the defaults).

   It can provide better performance on Intel and AMD processors with AVX2 or AVX512 support for matrices that have a similar number of
   non-zeros in contiguous groups of rows.  However if the computation is memory bandwidth limited it may not provide much improvement.

   Developer Notes:
   On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance.

   The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8
.vb
                            (2 0  3 4)
   Consider the matrix A =  (5 0  6 0)
                            (0 0  7 8)
                            (0 0  9 9)

   symbolically the Ellpack format can be written as

        (2 3 4 |)           (0 2 3 |)
   v =  (5 6 0 |)  colidx = (0 2 2 |)
        --------            ---------
        (7 8 |)             (2 3 |)
        (9 9 |)             (2 3 |)

    The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case).
    Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and
    zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice.

    The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9)  and for colidx is (0 0 2 2 3 2 2 2 3 3)

.ve

    See MatMult_SeqSELL() for how this format is used with the SIMD operations to achieve high performance.

    References:
.   * - Hong Zhang, Richard T. Mills, Karl Rupp, and Barry F. Smith, Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512},
    Proceedings of the 47th International Conference on Parallel Processing, 2018.

.seealso: MatCreateSeqSELL(), MatCreateSeqAIJ(), MatCreateSELL(), MATSEQSELL, MATMPISELL, MATSEQAIJ, MATMPIAIJ, MATAIJ
M*/

/*@C
   MatCreateSeqSELL - Creates a sparse matrix in SELL format.

   Collective on comm

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  maxallocrow - maximum number of nonzeros in a row
-  rlen - array containing the number of nonzeros in the various rows
          (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqSELLSetPreallocation]

   Notes:
   If rlen is given then maxallocrow is ignored

   Specify the preallocated storage with either maxallocrow or rlen (not both).
   Set maxallocrow=PETSC_DEFAULT and rlen=NULL for PETSc to control dynamic memory
   allocation. For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   Level: intermediate

.seealso: MatCreate(), MatCreateSELL(), MatSetValues(), MatSeqSELLSetPreallocation(), MATSELL, MATSEQSELL, MATMPISELL

@*/
PetscErrorCode MatCreateSeqSELL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt maxallocrow,const PetscInt rlen[],Mat *A)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(comm,A));
  PetscCall(MatSetSizes(*A,m,n,m,n));
  PetscCall(MatSetType(*A,MATSEQSELL));
  PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A,maxallocrow,rlen));
  PetscFunctionReturn(0);
}

/*
   Compares two MATSEQSELL matrices entry by entry; *flg is PETSC_TRUE only when
   dimensions, nonzero structure, and values all match.
*/
PetscErrorCode MatEqual_SeqSELL(Mat A,Mat B,PetscBool * flg)
{
  Mat_SeqSELL *a=(Mat_SeqSELL*)A->data,*b=(Mat_SeqSELL*)B->data;
  PetscInt    totalslices=a->totalslices;

  PetscFunctionBegin;
  /* quick rejection: matrix dimensions, number of nonzeros, or maximum row length differ */
  if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) ||(a->nz != b->nz) || (a->rlenmax != b->rlenmax)) {
    *flg = PETSC_FALSE;
    PetscFunctionReturn(0);
  }
  /* check whether the column indices (nonzero patterns) are the same */
  PetscCall(PetscArraycmp(a->colidx,b->colidx,a->sliidx[totalslices],flg));
  if (!*flg) PetscFunctionReturn(0);
  /* check whether the stored values are the same */
  PetscCall(PetscArraycmp(a->val,b->val,a->sliidx[totalslices],flg));
  PetscFunctionReturn(0);
}

/*
   Marks the cached inverted diagonal (idiag) as stale so it is recomputed
   the next time a routine such as MatSOR_SeqSELL() needs it.
*/
PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A)
{
  Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;

  PetscFunctionBegin;
  a->idiagvalid = PETSC_FALSE;
  PetscFunctionReturn(0);
}

/*
   Conjugates every stored value in place. A no-op for real scalar builds.
*/
PetscErrorCode MatConjugate_SeqSELL(Mat A)
{
#if defined(PETSC_USE_COMPLEX)
  Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
  PetscInt    i;
  PetscScalar *val = a->val;

  PetscFunctionBegin;
  for (i=0; i<a->sliidx[a->totalslices]; i++) {
    val[i] = PetscConj(val[i]);
  }
#else
  PetscFunctionBegin;
#endif
  PetscFunctionReturn(0);
}