1 2 /* 3 Defines the basic matrix operations for the SELL matrix storage format. 4 */ 5 #include <../src/mat/impls/sell/seq/sell.h> /*I "petscmat.h" I*/ 6 #include <petscblaslapack.h> 7 #include <petsc/private/kernels/blocktranspose.h> 8 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 9 10 #include <immintrin.h> 11 12 #if !defined(_MM_SCALE_8) 13 #define _MM_SCALE_8 8 14 #endif 15 16 #if defined(__AVX512F__) 17 /* these do not work 18 vec_idx = _mm512_loadunpackhi_epi32(vec_idx,acolidx); 19 vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval); 20 */ 21 #define AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y) \ 22 /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \ 23 vec_idx = _mm256_loadu_si256((__m256i const*)acolidx); \ 24 vec_vals = _mm512_loadu_pd(aval); \ 25 vec_x = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8); \ 26 vec_y = _mm512_fmadd_pd(vec_x,vec_vals,vec_y) 27 #elif defined(__AVX2__) && defined(__FMA__) 28 #define AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y) \ 29 vec_vals = _mm256_loadu_pd(aval); \ 30 vec_idx = _mm_loadu_si128((__m128i const*)acolidx); /* SSE2 */ \ 31 vec_x = _mm256_i32gather_pd(x,vec_idx,_MM_SCALE_8); \ 32 vec_y = _mm256_fmadd_pd(vec_x,vec_vals,vec_y) 33 #endif 34 #endif /* PETSC_HAVE_IMMINTRIN_H */ 35 36 /*@C 37 MatSeqSELLSetPreallocation - For good matrix assembly performance 38 the user should preallocate the matrix storage by setting the parameter nz 39 (or the array nnz). By setting these parameters accurately, performance 40 during matrix assembly can be increased significantly. 41 42 Collective 43 44 Input Parameters: 45 + B - The matrix 46 . nz - number of nonzeros per row (same for all rows) 47 - nnz - array containing the number of nonzeros in the various rows 48 (possibly different for each row) or NULL 49 50 Notes: 51 If nnz is given then nz is ignored. 52 53 Specify the preallocated storage with either nz or nnz (not both). 54 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 55 allocation. For large problems you MUST preallocate memory or you 56 will get TERRIBLE performance, see the users' manual chapter on matrices. 57 58 You can call MatGetInfo() to get information on how effective the preallocation was; 59 for example the fields mallocs,nz_allocated,nz_used,nz_unneeded; 60 You can also run with the option -info and look for messages with the string 61 malloc in them to see if additional memory allocation was needed. 62 63 Developers: Use nz of MAT_SKIP_ALLOCATION to not allocate any space for the matrix 64 entries or columns indices. 65 66 The maximum number of nonzeos in any row should be as accurate as possible. 67 If it is underestimated, you will get bad performance due to reallocation 68 (MatSeqXSELLReallocateSELL). 69 70 Level: intermediate 71 72 .seealso: MatCreate(), MatCreateSELL(), MatSetValues(), MatGetInfo() 73 74 @*/ 75 PetscErrorCode MatSeqSELLSetPreallocation(Mat B,PetscInt rlenmax,const PetscInt rlen[]) 76 { 77 PetscErrorCode ierr; 78 79 PetscFunctionBegin; 80 PetscValidHeaderSpecific(B,MAT_CLASSID,1); 81 PetscValidType(B,1); 82 ierr = PetscTryMethod(B,"MatSeqSELLSetPreallocation_C",(Mat,PetscInt,const PetscInt[]),(B,rlenmax,rlen));CHKERRQ(ierr); 83 PetscFunctionReturn(0); 84 } 85 86 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B,PetscInt maxallocrow,const PetscInt rlen[]) 87 { 88 Mat_SeqSELL *b; 89 PetscInt i,j,totalslices; 90 PetscBool skipallocation=PETSC_FALSE,realalloc=PETSC_FALSE; 91 PetscErrorCode ierr; 92 93 PetscFunctionBegin; 94 if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE; 95 if (maxallocrow == MAT_SKIP_ALLOCATION) { 96 skipallocation = PETSC_TRUE; 97 maxallocrow = 0; 98 } 99 100 ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr); 101 ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr); 102 103 /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */ 104 if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5; 105 if (maxallocrow < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"maxallocrow cannot be less than 0: value %D",maxallocrow); 106 if (rlen) { 107 for (i=0; i<B->rmap->n; i++) { 108 if (rlen[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"rlen cannot be less than 0: local row %D value %D",i,rlen[i]); 109 if (rlen[i] > B->cmap->n) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"rlen cannot be greater than row length: local row %D value %D rowlength %D",i,rlen[i],B->cmap->n); 110 } 111 } 112 113 B->preallocated = PETSC_TRUE; 114 115 b = (Mat_SeqSELL*)B->data; 116 117 totalslices = B->rmap->n/8+((B->rmap->n & 0x07)?1:0); /* ceil(n/8) */ 118 b->totalslices = totalslices; 119 if (!skipallocation) { 120 if (B->rmap->n & 0x07) {ierr = PetscInfo1(B,"Padding rows to the SEQSELL matrix because the number of rows is not the multiple of 8 (value %D)\n",B->rmap->n);CHKERRQ(ierr);} 121 122 if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */ 123 ierr = PetscMalloc1(totalslices+1,&b->sliidx);CHKERRQ(ierr); 124 ierr = PetscLogObjectMemory((PetscObject)B,(totalslices+1)*sizeof(PetscInt));CHKERRQ(ierr); 125 } 126 if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */ 127 if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10; 128 else if (maxallocrow < 0) maxallocrow = 1; 129 for (i=0; i<=totalslices; i++) b->sliidx[i] = i*8*maxallocrow; 130 } else { 131 maxallocrow = 0; 132 b->sliidx[0] = 0; 133 for (i=1; i<totalslices; i++) { 134 b->sliidx[i] = 0; 135 for (j=0;j<8;j++) { 136 b->sliidx[i] = PetscMax(b->sliidx[i],rlen[8*(i-1)+j]); 137 } 138 maxallocrow = PetscMax(b->sliidx[i],maxallocrow); 139 ierr = PetscIntSumError(b->sliidx[i-1],8*b->sliidx[i],&b->sliidx[i]);CHKERRQ(ierr); 140 } 141 /* last slice */ 142 b->sliidx[totalslices] = 0; 143 for (j=(totalslices-1)*8;j<B->rmap->n;j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices],rlen[j]); 144 maxallocrow = PetscMax(b->sliidx[totalslices],maxallocrow); 145 b->sliidx[totalslices] = b->sliidx[totalslices-1] + 8*b->sliidx[totalslices]; 146 } 147 148 /* allocate space for val, colidx, rlen */ 149 /* FIXME: should B's old memory be unlogged? */ 150 ierr = MatSeqXSELLFreeSELL(B,&b->val,&b->colidx);CHKERRQ(ierr); 151 /* FIXME: assuming an element of the bit array takes 8 bits */ 152 ierr = PetscMalloc2(b->sliidx[totalslices],&b->val,b->sliidx[totalslices],&b->colidx);CHKERRQ(ierr); 153 ierr = PetscLogObjectMemory((PetscObject)B,b->sliidx[totalslices]*(sizeof(PetscScalar)+sizeof(PetscInt)));CHKERRQ(ierr); 154 /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */ 155 ierr = PetscCalloc1(8*totalslices,&b->rlen);CHKERRQ(ierr); 156 ierr = PetscLogObjectMemory((PetscObject)B,8*totalslices*sizeof(PetscInt));CHKERRQ(ierr); 157 158 b->singlemalloc = PETSC_TRUE; 159 b->free_val = PETSC_TRUE; 160 b->free_colidx = PETSC_TRUE; 161 } else { 162 b->free_val = PETSC_FALSE; 163 b->free_colidx = PETSC_FALSE; 164 } 165 166 b->nz = 0; 167 b->maxallocrow = maxallocrow; 168 b->rlenmax = maxallocrow; 169 b->maxallocmat = b->sliidx[totalslices]; 170 B->info.nz_unneeded = (double)b->maxallocmat; 171 if (realalloc) { 172 ierr = MatSetOption(B,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr); 173 } 174 PetscFunctionReturn(0); 175 } 176 177 PetscErrorCode MatGetRow_SeqSELL(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v) 178 { 179 Mat_SeqSELL *a = (Mat_SeqSELL*)A->data; 180 PetscInt shift; 181 182 PetscFunctionBegin; 183 if (row < 0 || row >= A->rmap->n) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row %D out of range",row); 184 if (nz) *nz = a->rlen[row]; 185 shift = a->sliidx[row>>3]+(row&0x07); 186 if (!a->getrowcols) { 187 PetscErrorCode ierr; 188 189 ierr = PetscMalloc2(a->rlenmax,&a->getrowcols,a->rlenmax,&a->getrowvals);CHKERRQ(ierr); 190 } 191 if (idx) { 192 PetscInt j; 193 for (j=0; j<a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift+8*j]; 194 *idx = a->getrowcols; 195 } 196 if (v) { 197 PetscInt j; 198 for (j=0; j<a->rlen[row]; j++) a->getrowvals[j] = a->val[shift+8*j]; 199 *v = a->getrowvals; 200 } 201 PetscFunctionReturn(0); 202 } 203 204 PetscErrorCode MatRestoreRow_SeqSELL(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v) 205 { 206 PetscFunctionBegin; 207 PetscFunctionReturn(0); 208 } 209 210 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype,MatReuse reuse,Mat *newmat) 211 { 212 Mat B; 213 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 214 PetscInt i; 215 PetscErrorCode ierr; 216 217 PetscFunctionBegin; 218 if (reuse == MAT_REUSE_MATRIX) { 219 B = *newmat; 220 ierr = MatZeroEntries(B);CHKERRQ(ierr); 221 } else { 222 ierr = MatCreate(PetscObjectComm((PetscObject)A),&B);CHKERRQ(ierr); 223 ierr = MatSetSizes(B,A->rmap->n,A->cmap->n,A->rmap->N,A->cmap->N);CHKERRQ(ierr); 224 ierr = MatSetType(B,MATSEQAIJ);CHKERRQ(ierr); 225 ierr = MatSeqAIJSetPreallocation(B,0,a->rlen);CHKERRQ(ierr); 226 } 227 228 for (i=0; i<A->rmap->n; i++) { 229 PetscInt nz = 0,*cols = NULL; 230 PetscScalar *vals = NULL; 231 232 ierr = MatGetRow_SeqSELL(A,i,&nz,&cols,&vals);CHKERRQ(ierr); 233 ierr = MatSetValues(B,1,&i,nz,cols,vals,INSERT_VALUES);CHKERRQ(ierr); 234 ierr = MatRestoreRow_SeqSELL(A,i,&nz,&cols,&vals);CHKERRQ(ierr); 235 } 236 237 ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 238 ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 239 B->rmap->bs = A->rmap->bs; 240 241 if (reuse == MAT_INPLACE_MATRIX) { 242 ierr = MatHeaderReplace(A,&B);CHKERRQ(ierr); 243 } else { 244 *newmat = B; 245 } 246 PetscFunctionReturn(0); 247 } 248 249 #include <../src/mat/impls/aij/seq/aij.h> 250 251 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A,MatType newtype,MatReuse reuse,Mat *newmat) 252 { 253 Mat B; 254 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 255 PetscInt *ai=a->i,m=A->rmap->N,n=A->cmap->N,i,*rowlengths,row,ncols; 256 const PetscInt *cols; 257 const PetscScalar *vals; 258 PetscErrorCode ierr; 259 260 PetscFunctionBegin; 261 262 if (reuse == MAT_REUSE_MATRIX) { 263 B = *newmat; 264 } else { 265 /* Can we just use ilen? */ 266 ierr = PetscMalloc1(m,&rowlengths);CHKERRQ(ierr); 267 for (i=0; i<m; i++) { 268 rowlengths[i] = ai[i+1] - ai[i]; 269 } 270 271 ierr = MatCreate(PetscObjectComm((PetscObject)A),&B);CHKERRQ(ierr); 272 ierr = MatSetSizes(B,m,n,m,n);CHKERRQ(ierr); 273 ierr = MatSetType(B,MATSEQSELL);CHKERRQ(ierr); 274 ierr = MatSeqSELLSetPreallocation(B,0,rowlengths);CHKERRQ(ierr); 275 ierr = PetscFree(rowlengths);CHKERRQ(ierr); 276 } 277 278 for (row=0; row<m; row++) { 279 ierr = MatGetRow(A,row,&ncols,&cols,&vals);CHKERRQ(ierr); 280 ierr = MatSetValues(B,1,&row,ncols,cols,vals,INSERT_VALUES);CHKERRQ(ierr); 281 ierr = MatRestoreRow(A,row,&ncols,&cols,&vals);CHKERRQ(ierr); 282 } 283 ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 284 ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 285 B->rmap->bs = A->rmap->bs; 286 287 if (reuse == MAT_INPLACE_MATRIX) { 288 ierr = MatHeaderReplace(A,&B);CHKERRQ(ierr); 289 } else { 290 *newmat = B; 291 } 292 PetscFunctionReturn(0); 293 } 294 295 PetscErrorCode MatMult_SeqSELL(Mat A,Vec xx,Vec yy) 296 { 297 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 298 PetscScalar *y; 299 const PetscScalar *x; 300 const MatScalar *aval=a->val; 301 PetscInt totalslices=a->totalslices; 302 const PetscInt *acolidx=a->colidx; 303 PetscInt i,j; 304 PetscErrorCode ierr; 305 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 306 __m512d vec_x,vec_y,vec_vals; 307 __m256i vec_idx; 308 __mmask8 mask; 309 __m512d vec_x2,vec_y2,vec_vals2,vec_x3,vec_y3,vec_vals3,vec_x4,vec_y4,vec_vals4; 310 __m256i vec_idx2,vec_idx3,vec_idx4; 311 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 312 __m128i vec_idx; 313 __m256d vec_x,vec_y,vec_y2,vec_vals; 314 MatScalar yval; 315 PetscInt r,rows_left,row,nnz_in_row; 316 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 317 __m128d vec_x_tmp; 318 __m256d vec_x,vec_y,vec_y2,vec_vals; 319 MatScalar yval; 320 PetscInt r,rows_left,row,nnz_in_row; 321 #else 322 PetscScalar sum[8]; 323 #endif 324 325 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 326 #pragma disjoint(*x,*y,*aval) 327 #endif 328 329 PetscFunctionBegin; 330 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 331 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 332 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 333 for (i=0; i<totalslices; i++) { /* loop over slices */ 334 PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 335 PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 336 337 vec_y = _mm512_setzero_pd(); 338 vec_y2 = _mm512_setzero_pd(); 339 vec_y3 = _mm512_setzero_pd(); 340 vec_y4 = _mm512_setzero_pd(); 341 342 j = a->sliidx[i]>>3; /* 8 bytes are read at each time, corresponding to a slice columnn */ 343 switch ((a->sliidx[i+1]-a->sliidx[i])/8 & 3) { 344 case 3: 345 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 346 acolidx += 8; aval += 8; 347 AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2); 348 acolidx += 8; aval += 8; 349 AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3); 350 acolidx += 8; aval += 8; 351 j += 3; 352 break; 353 case 2: 354 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 355 acolidx += 8; aval += 8; 356 AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2); 357 acolidx += 8; aval += 8; 358 j += 2; 359 break; 360 case 1: 361 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 362 acolidx += 8; aval += 8; 363 j += 1; 364 break; 365 } 366 #pragma novector 367 for (; j<(a->sliidx[i+1]>>3); j+=4) { 368 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 369 acolidx += 8; aval += 8; 370 AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2); 371 acolidx += 8; aval += 8; 372 AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3); 373 acolidx += 8; aval += 8; 374 AVX512_Mult_Private(vec_idx4,vec_x4,vec_vals4,vec_y4); 375 acolidx += 8; aval += 8; 376 } 377 378 vec_y = _mm512_add_pd(vec_y,vec_y2); 379 vec_y = _mm512_add_pd(vec_y,vec_y3); 380 vec_y = _mm512_add_pd(vec_y,vec_y4); 381 if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 382 mask = (__mmask8)(0xff >> (8-(A->rmap->n & 0x07))); 383 _mm512_mask_storeu_pd(&y[8*i],mask,vec_y); 384 } else { 385 _mm512_storeu_pd(&y[8*i],vec_y); 386 } 387 } 388 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 389 for (i=0; i<totalslices; i++) { /* loop over full slices */ 390 PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 391 PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 392 393 /* last slice may have padding rows. Don't use vectorization. */ 394 if (i == totalslices-1 && (A->rmap->n & 0x07)) { 395 rows_left = A->rmap->n - 8*i; 396 for (r=0; r<rows_left; ++r) { 397 yval = (MatScalar)0; 398 row = 8*i + r; 399 nnz_in_row = a->rlen[row]; 400 for (j=0; j<nnz_in_row; ++j) yval += aval[8*j+r] * x[acolidx[8*j+r]]; 401 y[row] = yval; 402 } 403 break; 404 } 405 406 vec_y = _mm256_setzero_pd(); 407 vec_y2 = _mm256_setzero_pd(); 408 409 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 410 #pragma novector 411 #pragma unroll(2) 412 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) { 413 AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 414 aval += 4; acolidx += 4; 415 AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y2); 416 aval += 4; acolidx += 4; 417 } 418 419 _mm256_storeu_pd(y+i*8,vec_y); 420 _mm256_storeu_pd(y+i*8+4,vec_y2); 421 } 422 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 423 for (i=0; i<totalslices; i++) { /* loop over full slices */ 424 PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 425 PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 426 427 vec_y = _mm256_setzero_pd(); 428 vec_y2 = _mm256_setzero_pd(); 429 430 /* last slice may have padding rows. Don't use vectorization. */ 431 if (i == totalslices-1 && (A->rmap->n & 0x07)) { 432 rows_left = A->rmap->n - 8*i; 433 for (r=0; r<rows_left; ++r) { 434 yval = (MatScalar)0; 435 row = 8*i + r; 436 nnz_in_row = a->rlen[row]; 437 for (j=0; j<nnz_in_row; ++j) yval += aval[8*j + r] * x[acolidx[8*j + r]]; 438 y[row] = yval; 439 } 440 break; 441 } 442 443 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 444 #pragma novector 445 #pragma unroll(2) 446 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) { 447 vec_vals = _mm256_loadu_pd(aval); 448 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 449 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 450 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,0); 451 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 452 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 453 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,1); 454 vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y); 455 aval += 4; 456 457 vec_vals = _mm256_loadu_pd(aval); 458 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 459 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 460 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,0); 461 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 462 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 463 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,1); 464 vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y2); 465 aval += 4; 466 } 467 468 _mm256_storeu_pd(y + i*8, vec_y); 469 _mm256_storeu_pd(y + i*8 + 4, vec_y2); 470 } 471 #else 472 for (i=0; i<totalslices; i++) { /* loop over slices */ 473 for (j=0; j<8; j++) sum[j] = 0.0; 474 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) { 475 sum[0] += aval[j] * x[acolidx[j]]; 476 sum[1] += aval[j+1] * x[acolidx[j+1]]; 477 sum[2] += aval[j+2] * x[acolidx[j+2]]; 478 sum[3] += aval[j+3] * x[acolidx[j+3]]; 479 sum[4] += aval[j+4] * x[acolidx[j+4]]; 480 sum[5] += aval[j+5] * x[acolidx[j+5]]; 481 sum[6] += aval[j+6] * x[acolidx[j+6]]; 482 sum[7] += aval[j+7] * x[acolidx[j+7]]; 483 } 484 if (i == totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */ 485 for(j=0; j<(A->rmap->n & 0x07); j++) y[8*i+j] = sum[j]; 486 } else { 487 for(j=0; j<8; j++) y[8*i+j] = sum[j]; 488 } 489 } 490 #endif 491 492 ierr = PetscLogFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr); /* theoretical minimal FLOPs */ 493 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 494 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 495 PetscFunctionReturn(0); 496 } 497 498 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h> 499 PetscErrorCode MatMultAdd_SeqSELL(Mat A,Vec xx,Vec yy,Vec zz) 500 { 501 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 502 PetscScalar *y,*z; 503 const PetscScalar *x; 504 const MatScalar *aval=a->val; 505 PetscInt totalslices=a->totalslices; 506 const PetscInt *acolidx=a->colidx; 507 PetscInt i,j; 508 PetscErrorCode ierr; 509 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 510 __m512d vec_x,vec_y,vec_vals; 511 __m256i vec_idx; 512 __mmask8 mask; 513 __m512d vec_x2,vec_y2,vec_vals2,vec_x3,vec_y3,vec_vals3,vec_x4,vec_y4,vec_vals4; 514 __m256i vec_idx2,vec_idx3,vec_idx4; 515 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 516 __m128d vec_x_tmp; 517 __m256d vec_x,vec_y,vec_y2,vec_vals; 518 MatScalar yval; 519 PetscInt r,row,nnz_in_row; 520 #else 521 PetscScalar sum[8]; 522 #endif 523 524 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 525 #pragma disjoint(*x,*y,*aval) 526 #endif 527 528 PetscFunctionBegin; 529 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 530 ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr); 531 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 532 for (i=0; i<totalslices; i++) { /* loop over slices */ 533 PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 534 PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 535 536 if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 537 mask = (__mmask8)(0xff >> (8-(A->rmap->n & 0x07))); 538 vec_y = _mm512_mask_loadu_pd(vec_y,mask,&y[8*i]); 539 } else { 540 vec_y = _mm512_loadu_pd(&y[8*i]); 541 } 542 vec_y2 = _mm512_setzero_pd(); 543 vec_y3 = _mm512_setzero_pd(); 544 vec_y4 = _mm512_setzero_pd(); 545 546 j = a->sliidx[i]>>3; /* 8 bytes are read at each time, corresponding to a slice columnn */ 547 switch ((a->sliidx[i+1]-a->sliidx[i])/8 & 3) { 548 case 3: 549 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 550 acolidx += 8; aval += 8; 551 AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2); 552 acolidx += 8; aval += 8; 553 AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3); 554 acolidx += 8; aval += 8; 555 j += 3; 556 break; 557 case 2: 558 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 559 acolidx += 8; aval += 8; 560 AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2); 561 acolidx += 8; aval += 8; 562 j += 2; 563 break; 564 case 1: 565 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 566 acolidx += 8; aval += 8; 567 j += 1; 568 break; 569 } 570 #pragma novector 571 for (; j<(a->sliidx[i+1]>>3); j+=4) { 572 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 573 acolidx += 8; aval += 8; 574 AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2); 575 acolidx += 8; aval += 8; 576 AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3); 577 acolidx += 8; aval += 8; 578 AVX512_Mult_Private(vec_idx4,vec_x4,vec_vals4,vec_y4); 579 acolidx += 8; aval += 8; 580 } 581 582 vec_y = _mm512_add_pd(vec_y,vec_y2); 583 vec_y = _mm512_add_pd(vec_y,vec_y3); 584 vec_y = _mm512_add_pd(vec_y,vec_y4); 585 if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 586 _mm512_mask_storeu_pd(&z[8*i],mask,vec_y); 587 } else { 588 _mm512_storeu_pd(&z[8*i],vec_y); 589 } 590 } 591 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 592 for (i=0; i<totalslices; i++) { /* loop over full slices */ 593 PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 594 PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 595 596 /* last slice may have padding rows. Don't use vectorization. */ 597 if (i == totalslices-1 && (A->rmap->n & 0x07)) { 598 for (r=0; r<(A->rmap->n & 0x07); ++r) { 599 row = 8*i + r; 600 yval = (MatScalar)0.0; 601 nnz_in_row = a->rlen[row]; 602 for (j=0; j<nnz_in_row; ++j) yval += aval[8*j+r] * x[acolidx[8*j+r]]; 603 z[row] = y[row] + yval; 604 } 605 break; 606 } 607 608 vec_y = _mm256_loadu_pd(y+8*i); 609 vec_y2 = _mm256_loadu_pd(y+8*i+4); 610 611 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 612 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) { 613 vec_vals = _mm256_loadu_pd(aval); 614 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 615 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 616 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,0); 617 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 618 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 619 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,1); 620 vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y); 621 aval += 4; 622 623 vec_vals = _mm256_loadu_pd(aval); 624 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 625 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 626 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,0); 627 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 628 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 629 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,1); 630 vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y2); 631 aval += 4; 632 } 633 634 _mm256_storeu_pd(z+i*8,vec_y); 635 _mm256_storeu_pd(z+i*8+4,vec_y2); 636 } 637 #else 638 for (i=0; i<totalslices; i++) { /* loop over slices */ 639 for (j=0; j<8; j++) sum[j] = 0.0; 640 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) { 641 sum[0] += aval[j] * x[acolidx[j]]; 642 sum[1] += aval[j+1] * x[acolidx[j+1]]; 643 sum[2] += aval[j+2] * x[acolidx[j+2]]; 644 sum[3] += aval[j+3] * x[acolidx[j+3]]; 645 sum[4] += aval[j+4] * x[acolidx[j+4]]; 646 sum[5] += aval[j+5] * x[acolidx[j+5]]; 647 sum[6] += aval[j+6] * x[acolidx[j+6]]; 648 sum[7] += aval[j+7] * x[acolidx[j+7]]; 649 } 650 if (i == totalslices-1 && (A->rmap->n & 0x07)) { 651 for (j=0; j<(A->rmap->n & 0x07); j++) z[8*i+j] = y[8*i+j] + sum[j]; 652 } else { 653 for (j=0; j<8; j++) z[8*i+j] = y[8*i+j] + sum[j]; 654 } 655 } 656 #endif 657 658 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 659 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 660 ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr); 661 PetscFunctionReturn(0); 662 } 663 664 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A,Vec xx,Vec zz,Vec yy) 665 { 666 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 667 PetscScalar *y; 668 const PetscScalar *x; 669 const MatScalar *aval=a->val; 670 const PetscInt *acolidx=a->colidx; 671 PetscInt i,j,r,row,nnz_in_row,totalslices=a->totalslices; 672 PetscErrorCode ierr; 673 674 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 675 #pragma disjoint(*x,*y,*aval) 676 #endif 677 678 PetscFunctionBegin; 679 if (A->symmetric) { 680 ierr = MatMultAdd_SeqSELL(A,xx,zz,yy);CHKERRQ(ierr); 681 PetscFunctionReturn(0); 682 } 683 if (zz != yy) { ierr = VecCopy(zz,yy);CHKERRQ(ierr); } 684 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 685 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 686 for (i=0; i<a->totalslices; i++) { /* loop over slices */ 687 if (i == totalslices-1 && (A->rmap->n & 0x07)) { 688 for (r=0; r<(A->rmap->n & 0x07); ++r) { 689 row = 8*i + r; 690 nnz_in_row = a->rlen[row]; 691 for (j=0; j<nnz_in_row; ++j) y[acolidx[8*j+r]] += aval[8*j+r] * x[row]; 692 } 693 break; 694 } 695 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) { 696 y[acolidx[j]] += aval[j] * x[8*i]; 697 y[acolidx[j+1]] += aval[j+1] * x[8*i+1]; 698 y[acolidx[j+2]] += aval[j+2] * x[8*i+2]; 699 y[acolidx[j+3]] += aval[j+3] * x[8*i+3]; 700 y[acolidx[j+4]] += aval[j+4] * x[8*i+4]; 701 y[acolidx[j+5]] += aval[j+5] * x[8*i+5]; 702 y[acolidx[j+6]] += aval[j+6] * x[8*i+6]; 703 y[acolidx[j+7]] += aval[j+7] * x[8*i+7]; 704 } 705 } 706 ierr = PetscLogFlops(2.0*a->sliidx[a->totalslices]);CHKERRQ(ierr); 707 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 708 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 709 PetscFunctionReturn(0); 710 } 711 712 PetscErrorCode MatMultTranspose_SeqSELL(Mat A,Vec xx,Vec yy) 713 { 714 PetscErrorCode ierr; 715 716 PetscFunctionBegin; 717 if (A->symmetric) { 718 ierr = MatMult_SeqSELL(A,xx,yy);CHKERRQ(ierr); 719 } else { 720 ierr = VecSet(yy,0.0);CHKERRQ(ierr); 721 ierr = MatMultTransposeAdd_SeqSELL(A,xx,yy,yy);CHKERRQ(ierr); 722 } 723 PetscFunctionReturn(0); 724 } 725 726 /* 727 Checks for missing diagonals 728 */ 729 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A,PetscBool *missing,PetscInt *d) 730 { 731 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 732 PetscInt *diag,i; 733 PetscErrorCode ierr; 734 735 PetscFunctionBegin; 736 *missing = PETSC_FALSE; 737 if (A->rmap->n > 0 && !(a->colidx)) { 738 *missing = PETSC_TRUE; 739 if (d) *d = 0; 740 ierr = PetscInfo(A,"Matrix has no entries therefore is missing diagonal\n");CHKERRQ(ierr); 741 } else { 742 diag = a->diag; 743 for (i=0; i<A->rmap->n; i++) { 744 if (diag[i] == -1) { 745 *missing = PETSC_TRUE; 746 if (d) *d = i; 747 ierr = PetscInfo1(A,"Matrix is missing diagonal number %D\n",i);CHKERRQ(ierr); 748 break; 749 } 750 } 751 } 752 PetscFunctionReturn(0); 753 } 754 755 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A) 756 { 757 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 758 PetscInt i,j,m=A->rmap->n,shift; 759 PetscErrorCode ierr; 760 761 PetscFunctionBegin; 762 if (!a->diag) { 763 ierr = PetscMalloc1(m,&a->diag);CHKERRQ(ierr); 764 ierr = PetscLogObjectMemory((PetscObject)A,m*sizeof(PetscInt));CHKERRQ(ierr); 765 a->free_diag = PETSC_TRUE; 766 } 767 for (i=0; i<m; i++) { /* loop over rows */ 768 shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */ 769 a->diag[i] = -1; 770 for (j=0; j<a->rlen[i]; j++) { 771 if (a->colidx[shift+j*8] == i) { 772 a->diag[i] = shift+j*8; 773 break; 774 } 775 } 776 } 777 PetscFunctionReturn(0); 778 } 779 780 /* 781 Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways 782 */ 783 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A,PetscScalar omega,PetscScalar fshift) 784 { 785 Mat_SeqSELL *a=(Mat_SeqSELL*) A->data; 786 PetscInt i,*diag,m = A->rmap->n; 787 MatScalar *val = a->val; 788 PetscScalar *idiag,*mdiag; 789 PetscErrorCode ierr; 790 791 PetscFunctionBegin; 792 if (a->idiagvalid) PetscFunctionReturn(0); 793 ierr = MatMarkDiagonal_SeqSELL(A);CHKERRQ(ierr); 794 diag = a->diag; 795 if (!a->idiag) { 796 ierr = PetscMalloc3(m,&a->idiag,m,&a->mdiag,m,&a->ssor_work);CHKERRQ(ierr); 797 ierr = PetscLogObjectMemory((PetscObject)A, 3*m*sizeof(PetscScalar));CHKERRQ(ierr); 798 val = a->val; 799 } 800 mdiag = a->mdiag; 801 idiag = a->idiag; 802 803 if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) { 804 for (i=0; i<m; i++) { 805 mdiag[i] = val[diag[i]]; 806 if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */ 807 if (PetscRealPart(fshift)) { 808 ierr = PetscInfo1(A,"Zero diagonal on row %D\n",i);CHKERRQ(ierr); 809 A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 810 A->factorerror_zeropivot_value = 0.0; 811 A->factorerror_zeropivot_row = i; 812 } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Zero diagonal on row %D",i); 813 } 814 idiag[i] = 1.0/val[diag[i]]; 815 } 816 ierr = PetscLogFlops(m);CHKERRQ(ierr); 817 } else { 818 for (i=0; i<m; i++) { 819 mdiag[i] = val[diag[i]]; 820 idiag[i] = omega/(fshift + val[diag[i]]); 821 } 822 ierr = PetscLogFlops(2.0*m);CHKERRQ(ierr); 823 } 824 a->idiagvalid = PETSC_TRUE; 825 PetscFunctionReturn(0); 826 } 827 828 PetscErrorCode MatZeroEntries_SeqSELL(Mat A) 829 { 830 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 831 PetscErrorCode ierr; 832 833 PetscFunctionBegin; 834 ierr = PetscArrayzero(a->val,a->sliidx[a->totalslices]);CHKERRQ(ierr); 835 ierr = MatSeqSELLInvalidateDiagonal(A);CHKERRQ(ierr); 836 PetscFunctionReturn(0); 837 } 838 839 PetscErrorCode MatDestroy_SeqSELL(Mat A) 840 { 841 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 842 PetscErrorCode ierr; 843 844 PetscFunctionBegin; 845 #if defined(PETSC_USE_LOG) 846 PetscLogObjectState((PetscObject)A,"Rows=%D, Cols=%D, NZ=%D",A->rmap->n,A->cmap->n,a->nz); 847 #endif 848 ierr = MatSeqXSELLFreeSELL(A,&a->val,&a->colidx);CHKERRQ(ierr); 849 ierr = ISDestroy(&a->row);CHKERRQ(ierr); 850 ierr = ISDestroy(&a->col);CHKERRQ(ierr); 851 ierr = PetscFree(a->diag);CHKERRQ(ierr); 852 ierr = PetscFree(a->rlen);CHKERRQ(ierr); 853 ierr = PetscFree(a->sliidx);CHKERRQ(ierr); 854 ierr = PetscFree3(a->idiag,a->mdiag,a->ssor_work);CHKERRQ(ierr); 855 ierr = PetscFree(a->solve_work);CHKERRQ(ierr); 856 ierr = ISDestroy(&a->icol);CHKERRQ(ierr); 857 ierr = PetscFree(a->saved_values);CHKERRQ(ierr); 858 ierr = PetscFree2(a->getrowcols,a->getrowvals);CHKERRQ(ierr); 859 860 ierr = PetscFree(A->data);CHKERRQ(ierr); 861 862 ierr = PetscObjectChangeTypeName((PetscObject)A,0);CHKERRQ(ierr); 863 ierr = PetscObjectComposeFunction((PetscObject)A,"MatStoreValues_C",NULL);CHKERRQ(ierr); 864 ierr = PetscObjectComposeFunction((PetscObject)A,"MatRetrieveValues_C",NULL);CHKERRQ(ierr); 865 #if defined(PETSC_HAVE_ELEMENTAL) 866 #endif 867 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqSELLSetPreallocation_C",NULL);CHKERRQ(ierr); 868 PetscFunctionReturn(0); 869 } 870 871 PetscErrorCode MatSetOption_SeqSELL(Mat A,MatOption op,PetscBool flg) 872 { 873 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 874 PetscErrorCode ierr; 875 876 PetscFunctionBegin; 877 switch (op) { 878 case MAT_ROW_ORIENTED: 879 a->roworiented = flg; 880 break; 881 case MAT_KEEP_NONZERO_PATTERN: 882 a->keepnonzeropattern = flg; 883 break; 884 case MAT_NEW_NONZERO_LOCATIONS: 885 a->nonew = (flg ? 0 : 1); 886 break; 887 case MAT_NEW_NONZERO_LOCATION_ERR: 888 a->nonew = (flg ? -1 : 0); 889 break; 890 case MAT_NEW_NONZERO_ALLOCATION_ERR: 891 a->nonew = (flg ? -2 : 0); 892 break; 893 case MAT_UNUSED_NONZERO_LOCATION_ERR: 894 a->nounused = (flg ? -1 : 0); 895 break; 896 case MAT_NEW_DIAGONALS: 897 case MAT_IGNORE_OFF_PROC_ENTRIES: 898 case MAT_USE_HASH_TABLE: 899 case MAT_SORTED_FULL: 900 ierr = PetscInfo1(A,"Option %s ignored\n",MatOptions[op]);CHKERRQ(ierr); 901 break; 902 case MAT_SPD: 903 case MAT_SYMMETRIC: 904 case MAT_STRUCTURALLY_SYMMETRIC: 905 case MAT_HERMITIAN: 906 case MAT_SYMMETRY_ETERNAL: 907 /* These options are handled directly by MatSetOption() */ 908 break; 909 default: 910 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unknown option %d",op); 911 } 912 PetscFunctionReturn(0); 913 } 914 915 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A,Vec v) 916 { 917 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 918 PetscInt i,j,n,shift; 919 PetscScalar *x,zero=0.0; 920 PetscErrorCode ierr; 921 922 PetscFunctionBegin; 923 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 924 if (n != A->rmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 925 926 if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) { 927 PetscInt *diag=a->diag; 928 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 929 for (i=0; i<n; i++) x[i] = 1.0/a->val[diag[i]]; 930 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 931 PetscFunctionReturn(0); 932 } 933 934 ierr = VecSet(v,zero);CHKERRQ(ierr); 935 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 936 for (i=0; i<n; i++) { /* loop over rows */ 937 shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */ 938 x[i] = 0; 939 for (j=0; j<a->rlen[i]; j++) { 940 if (a->colidx[shift+j*8] == i) { 941 x[i] = a->val[shift+j*8]; 942 break; 943 } 944 } 945 } 946 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 947 PetscFunctionReturn(0); 948 } 949 950 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A,Vec ll,Vec rr) 951 { 952 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 953 const PetscScalar *l,*r; 954 PetscInt i,j,m,n,row; 955 PetscErrorCode ierr; 956 957 PetscFunctionBegin; 958 if (ll) { 959 /* The local size is used so that VecMPI can be passed to this routine 960 by MatDiagonalScale_MPISELL */ 961 ierr = VecGetLocalSize(ll,&m);CHKERRQ(ierr); 962 if (m != A->rmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 963 ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr); 964 for (i=0; i<a->totalslices; i++) { /* loop over slices */ 965 if (i == a->totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */ 966 for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) { 967 if (row < (A->rmap->n & 0x07)) a->val[j] *= l[8*i+row]; 968 } 969 } else { 970 for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) { 971 a->val[j] *= l[8*i+row]; 972 } 973 } 974 } 975 ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr); 976 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 977 } 978 if (rr) { 979 ierr = VecGetLocalSize(rr,&n);CHKERRQ(ierr); 980 if (n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length"); 981 ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr); 982 for (i=0; i<a->totalslices; i++) { /* loop over slices */ 983 if (i == a->totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */ 984 for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) { 985 if (row < (A->rmap->n & 0x07)) a->val[j] *= r[a->colidx[j]]; 986 } 987 } else { 988 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j++) { 989 a->val[j] *= r[a->colidx[j]]; 990 } 991 } 992 } 993 ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr); 994 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 995 } 996 ierr = MatSeqSELLInvalidateDiagonal(A);CHKERRQ(ierr); 997 PetscFunctionReturn(0); 998 } 999 1000 extern PetscErrorCode MatSetValues_SeqSELL(Mat,PetscInt,const PetscInt[],PetscInt,const PetscInt[],const PetscScalar[],InsertMode); 1001 1002 PetscErrorCode MatGetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],PetscScalar v[]) 1003 { 1004 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1005 PetscInt *cp,i,k,low,high,t,row,col,l; 1006 PetscInt shift; 1007 MatScalar *vp; 1008 1009 PetscFunctionBegin; 1010 for (k=0; k<m; k++) { /* loop over requested rows */ 1011 row = im[k]; 1012 if (row<0) continue; 1013 if (PetscUnlikelyDebug(row >= A->rmap->n)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row too large: row %D max %D",row,A->rmap->n-1); 1014 shift = a->sliidx[row>>3]+(row&0x07); /* starting index of the row */ 1015 cp = a->colidx+shift; /* pointer to the row */ 1016 vp = a->val+shift; /* pointer to the row */ 1017 for (l=0; l<n; l++) { /* loop over requested columns */ 1018 col = in[l]; 1019 if (col<0) continue; 1020 if (PetscUnlikelyDebug(col >= A->cmap->n)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Column too large: row %D max %D",col,A->cmap->n-1); 1021 high = a->rlen[row]; low = 0; /* assume unsorted */ 1022 while (high-low > 5) { 1023 t = (low+high)/2; 1024 if (*(cp+t*8) > col) high = t; 1025 else low = t; 1026 } 1027 for (i=low; i<high; i++) { 1028 if (*(cp+8*i) > col) break; 1029 if (*(cp+8*i) == col) { 1030 *v++ = *(vp+8*i); 1031 goto finished; 1032 } 1033 } 1034 *v++ = 0.0; 1035 finished:; 1036 } 1037 } 1038 PetscFunctionReturn(0); 1039 } 1040 1041 PetscErrorCode MatView_SeqSELL_ASCII(Mat A,PetscViewer viewer) 1042 { 1043 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1044 PetscInt i,j,m=A->rmap->n,shift; 1045 const char *name; 1046 PetscViewerFormat format; 1047 PetscErrorCode ierr; 1048 1049 PetscFunctionBegin; 1050 ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr); 1051 if (format == PETSC_VIEWER_ASCII_MATLAB) { 1052 PetscInt nofinalvalue = 0; 1053 /* 1054 if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) { 1055 nofinalvalue = 1; 1056 } 1057 */ 1058 ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr); 1059 ierr = PetscViewerASCIIPrintf(viewer,"%% Size = %D %D \n",m,A->cmap->n);CHKERRQ(ierr); 1060 ierr = PetscViewerASCIIPrintf(viewer,"%% Nonzeros = %D \n",a->nz);CHKERRQ(ierr); 1061 #if defined(PETSC_USE_COMPLEX) 1062 ierr = PetscViewerASCIIPrintf(viewer,"zzz = zeros(%D,4);\n",a->nz+nofinalvalue);CHKERRQ(ierr); 1063 #else 1064 ierr = PetscViewerASCIIPrintf(viewer,"zzz = zeros(%D,3);\n",a->nz+nofinalvalue);CHKERRQ(ierr); 1065 #endif 1066 ierr = PetscViewerASCIIPrintf(viewer,"zzz = [\n");CHKERRQ(ierr); 1067 1068 for (i=0; i<m; i++) { 1069 shift = a->sliidx[i>>3]+(i&0x07); 1070 for (j=0; j<a->rlen[i]; j++) { 1071 #if defined(PETSC_USE_COMPLEX) 1072 ierr = PetscViewerASCIIPrintf(viewer,"%D %D %18.16e %18.16e\n",i+1,a->colidx[shift+8*j]+1,(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr); 1073 #else 1074 ierr = PetscViewerASCIIPrintf(viewer,"%D %D %18.16e\n",i+1,a->colidx[shift+8*j]+1,(double)a->val[shift+8*j]);CHKERRQ(ierr); 1075 #endif 1076 } 1077 } 1078 /* 1079 if (nofinalvalue) { 1080 #if defined(PETSC_USE_COMPLEX) 1081 ierr = PetscViewerASCIIPrintf(viewer,"%D %D %18.16e %18.16e\n",m,A->cmap->n,0.,0.);CHKERRQ(ierr); 1082 #else 1083 ierr = PetscViewerASCIIPrintf(viewer,"%D %D %18.16e\n",m,A->cmap->n,0.0);CHKERRQ(ierr); 1084 #endif 1085 } 1086 */ 1087 ierr = PetscObjectGetName((PetscObject)A,&name);CHKERRQ(ierr); 1088 ierr = PetscViewerASCIIPrintf(viewer,"];\n %s = spconvert(zzz);\n",name);CHKERRQ(ierr); 1089 ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr); 1090 } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) { 1091 PetscFunctionReturn(0); 1092 } else if (format == PETSC_VIEWER_ASCII_COMMON) { 1093 ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr); 1094 for (i=0; i<m; i++) { 1095 ierr = PetscViewerASCIIPrintf(viewer,"row %D:",i);CHKERRQ(ierr); 1096 shift = a->sliidx[i>>3]+(i&0x07); 1097 for (j=0; j<a->rlen[i]; j++) { 1098 #if defined(PETSC_USE_COMPLEX) 1099 if (PetscImaginaryPart(a->val[shift+8*j]) > 0.0 && PetscRealPart(a->val[shift+8*j]) != 0.0) { 1100 ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr); 1101 } else if (PetscImaginaryPart(a->val[shift+8*j]) < 0.0 && PetscRealPart(a->val[shift+8*j]) != 0.0) { 1102 ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)-PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr); 1103 } else if (PetscRealPart(a->val[shift+8*j]) != 0.0) { 1104 ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]));CHKERRQ(ierr); 1105 } 1106 #else 1107 if (a->val[shift+8*j] != 0.0) {ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[shift+8*j],(double)a->val[shift+8*j]);CHKERRQ(ierr);} 1108 #endif 1109 } 1110 ierr = PetscViewerASCIIPrintf(viewer,"\n");CHKERRQ(ierr); 1111 } 1112 ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr); 1113 } else if (format == PETSC_VIEWER_ASCII_DENSE) { 1114 PetscInt cnt=0,jcnt; 1115 PetscScalar value; 1116 #if defined(PETSC_USE_COMPLEX) 1117 PetscBool realonly=PETSC_TRUE; 1118 for (i=0; i<a->sliidx[a->totalslices]; i++) { 1119 if (PetscImaginaryPart(a->val[i]) != 0.0) { 1120 realonly = PETSC_FALSE; 1121 break; 1122 } 1123 } 1124 #endif 1125 1126 ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr); 1127 for (i=0; i<m; i++) { 1128 jcnt = 0; 1129 shift = a->sliidx[i>>3]+(i&0x07); 1130 for (j=0; j<A->cmap->n; j++) { 1131 if (jcnt < a->rlen[i] && j == a->colidx[shift+8*j]) { 1132 value = a->val[cnt++]; 1133 jcnt++; 1134 } else { 1135 value = 0.0; 1136 } 1137 #if defined(PETSC_USE_COMPLEX) 1138 if (realonly) { 1139 ierr = PetscViewerASCIIPrintf(viewer," %7.5e ",(double)PetscRealPart(value));CHKERRQ(ierr); 1140 } else { 1141 ierr = PetscViewerASCIIPrintf(viewer," %7.5e+%7.5e i ",(double)PetscRealPart(value),(double)PetscImaginaryPart(value));CHKERRQ(ierr); 1142 } 1143 #else 1144 ierr = PetscViewerASCIIPrintf(viewer," %7.5e ",(double)value);CHKERRQ(ierr); 1145 #endif 1146 } 1147 ierr = PetscViewerASCIIPrintf(viewer,"\n");CHKERRQ(ierr); 1148 } 1149 ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr); 1150 } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) { 1151 PetscInt fshift=1; 1152 ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr); 1153 #if defined(PETSC_USE_COMPLEX) 1154 ierr = PetscViewerASCIIPrintf(viewer,"%%%%MatrixMarket matrix coordinate complex general\n");CHKERRQ(ierr); 1155 #else 1156 ierr = PetscViewerASCIIPrintf(viewer,"%%%%MatrixMarket matrix coordinate real general\n");CHKERRQ(ierr); 1157 #endif 1158 ierr = PetscViewerASCIIPrintf(viewer,"%D %D %D\n", m, A->cmap->n, a->nz);CHKERRQ(ierr); 1159 for (i=0; i<m; i++) { 1160 shift = a->sliidx[i>>3]+(i&0x07); 1161 for (j=0; j<a->rlen[i]; j++) { 1162 #if defined(PETSC_USE_COMPLEX) 1163 ierr = PetscViewerASCIIPrintf(viewer,"%D %D %g %g\n",i+fshift,a->colidx[shift+8*j]+fshift,(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr); 1164 #else 1165 ierr = PetscViewerASCIIPrintf(viewer,"%D %D %g\n",i+fshift,a->colidx[shift+8*j]+fshift,(double)a->val[shift+8*j]);CHKERRQ(ierr); 1166 #endif 1167 } 1168 } 1169 ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr); 1170 } else if (format == PETSC_VIEWER_NATIVE) { 1171 for (i=0; i<a->totalslices; i++) { /* loop over slices */ 1172 PetscInt row; 1173 ierr = PetscViewerASCIIPrintf(viewer,"slice %D: %D %D\n",i,a->sliidx[i],a->sliidx[i+1]);CHKERRQ(ierr); 1174 for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) { 1175 #if defined(PETSC_USE_COMPLEX) 1176 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1177 ierr = PetscViewerASCIIPrintf(viewer," %D %D %g + %g i\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]));CHKERRQ(ierr); 1178 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1179 ierr = PetscViewerASCIIPrintf(viewer," %D %D %g - %g i\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]),-(double)PetscImaginaryPart(a->val[j]));CHKERRQ(ierr); 1180 } else { 1181 ierr = PetscViewerASCIIPrintf(viewer," %D %D %g\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]));CHKERRQ(ierr); 1182 } 1183 #else 1184 ierr = PetscViewerASCIIPrintf(viewer," %D %D %g\n",8*i+row,a->colidx[j],(double)a->val[j]);CHKERRQ(ierr); 1185 #endif 1186 } 1187 } 1188 } else { 1189 ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr); 1190 if (A->factortype) { 1191 for (i=0; i<m; i++) { 1192 shift = a->sliidx[i>>3]+(i&0x07); 1193 ierr = PetscViewerASCIIPrintf(viewer,"row %D:",i);CHKERRQ(ierr); 1194 /* L part */ 1195 for (j=shift; j<a->diag[i]; j+=8) { 1196 #if defined(PETSC_USE_COMPLEX) 1197 if (PetscImaginaryPart(a->val[shift+8*j]) > 0.0) { 1198 ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]));CHKERRQ(ierr); 1199 } else if (PetscImaginaryPart(a->val[shift+8*j]) < 0.0) { 1200 ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)(-PetscImaginaryPart(a->val[j])));CHKERRQ(ierr); 1201 } else { 1202 ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)PetscRealPart(a->val[j]));CHKERRQ(ierr); 1203 } 1204 #else 1205 ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)a->val[j]);CHKERRQ(ierr); 1206 #endif 1207 } 1208 /* diagonal */ 1209 j = a->diag[i]; 1210 #if defined(PETSC_USE_COMPLEX) 1211 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1212 ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]),(double)PetscImaginaryPart(1.0/a->val[j]));CHKERRQ(ierr); 1213 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1214 ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]),(double)(-PetscImaginaryPart(1.0/a->val[j])));CHKERRQ(ierr); 1215 } else { 1216 ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]));CHKERRQ(ierr); 1217 } 1218 #else 1219 ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)(1.0/a->val[j]));CHKERRQ(ierr); 1220 #endif 1221 1222 /* U part */ 1223 for (j=a->diag[i]+1; j<shift+8*a->rlen[i]; j+=8) { 1224 #if defined(PETSC_USE_COMPLEX) 1225 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1226 ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]));CHKERRQ(ierr); 1227 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1228 ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)(-PetscImaginaryPart(a->val[j])));CHKERRQ(ierr); 1229 } else { 1230 ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)PetscRealPart(a->val[j]));CHKERRQ(ierr); 1231 } 1232 #else 1233 ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)a->val[j]);CHKERRQ(ierr); 1234 #endif 1235 } 1236 ierr = PetscViewerASCIIPrintf(viewer,"\n");CHKERRQ(ierr); 1237 } 1238 } else { 1239 for (i=0; i<m; i++) { 1240 shift = a->sliidx[i>>3]+(i&0x07); 1241 ierr = PetscViewerASCIIPrintf(viewer,"row %D:",i);CHKERRQ(ierr); 1242 for (j=0; j<a->rlen[i]; j++) { 1243 #if defined(PETSC_USE_COMPLEX) 1244 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1245 ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr); 1246 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1247 ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)-PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr); 1248 } else { 1249 ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]));CHKERRQ(ierr); 1250 } 1251 #else 1252 ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[shift+8*j],(double)a->val[shift+8*j]);CHKERRQ(ierr); 1253 #endif 1254 } 1255 ierr = PetscViewerASCIIPrintf(viewer,"\n");CHKERRQ(ierr); 1256 } 1257 } 1258 ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr); 1259 } 1260 ierr = PetscViewerFlush(viewer);CHKERRQ(ierr); 1261 PetscFunctionReturn(0); 1262 } 1263 1264 #include <petscdraw.h> 1265 PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw,void *Aa) 1266 { 1267 Mat A=(Mat)Aa; 1268 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1269 PetscInt i,j,m=A->rmap->n,shift; 1270 int color; 1271 PetscReal xl,yl,xr,yr,x_l,x_r,y_l,y_r; 1272 PetscViewer viewer; 1273 PetscViewerFormat format; 1274 PetscErrorCode ierr; 1275 1276 PetscFunctionBegin; 1277 ierr = PetscObjectQuery((PetscObject)A,"Zoomviewer",(PetscObject*)&viewer);CHKERRQ(ierr); 1278 ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr); 1279 ierr = PetscDrawGetCoordinates(draw,&xl,&yl,&xr,&yr);CHKERRQ(ierr); 1280 1281 /* loop over matrix elements drawing boxes */ 1282 1283 if (format != PETSC_VIEWER_DRAW_CONTOUR) { 1284 ierr = PetscDrawCollectiveBegin(draw);CHKERRQ(ierr); 1285 /* Blue for negative, Cyan for zero and Red for positive */ 1286 color = PETSC_DRAW_BLUE; 1287 for (i=0; i<m; i++) { 1288 shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */ 1289 y_l = m - i - 1.0; y_r = y_l + 1.0; 1290 for (j=0; j<a->rlen[i]; j++) { 1291 x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0; 1292 if (PetscRealPart(a->val[shift+8*j]) >= 0.) continue; 1293 ierr = PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);CHKERRQ(ierr); 1294 } 1295 } 1296 color = PETSC_DRAW_CYAN; 1297 for (i=0; i<m; i++) { 1298 shift = a->sliidx[i>>3]+(i&0x07); 1299 y_l = m - i - 1.0; y_r = y_l + 1.0; 1300 for (j=0; j<a->rlen[i]; j++) { 1301 x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0; 1302 if (a->val[shift+8*j] != 0.) continue; 1303 ierr = PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);CHKERRQ(ierr); 1304 } 1305 } 1306 color = PETSC_DRAW_RED; 1307 for (i=0; i<m; i++) { 1308 shift = a->sliidx[i>>3]+(i&0x07); 1309 y_l = m - i - 1.0; y_r = y_l + 1.0; 1310 for (j=0; j<a->rlen[i]; j++) { 1311 x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0; 1312 if (PetscRealPart(a->val[shift+8*j]) <= 0.) continue; 1313 ierr = PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);CHKERRQ(ierr); 1314 } 1315 } 1316 ierr = PetscDrawCollectiveEnd(draw);CHKERRQ(ierr); 1317 } else { 1318 /* use contour shading to indicate magnitude of values */ 1319 /* first determine max of all nonzero values */ 1320 PetscReal minv=0.0,maxv=0.0; 1321 PetscInt count=0; 1322 PetscDraw popup; 1323 for (i=0; i<a->sliidx[a->totalslices]; i++) { 1324 if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]); 1325 } 1326 if (minv >= maxv) maxv = minv + PETSC_SMALL; 1327 ierr = PetscDrawGetPopup(draw,&popup);CHKERRQ(ierr); 1328 ierr = PetscDrawScalePopup(popup,minv,maxv);CHKERRQ(ierr); 1329 1330 ierr = PetscDrawCollectiveBegin(draw);CHKERRQ(ierr); 1331 for (i=0; i<m; i++) { 1332 shift = a->sliidx[i>>3]+(i&0x07); 1333 y_l = m - i - 1.0; 1334 y_r = y_l + 1.0; 1335 for (j=0; j<a->rlen[i]; j++) { 1336 x_l = a->colidx[shift+j*8]; 1337 x_r = x_l + 1.0; 1338 color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]),minv,maxv); 1339 ierr = PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);CHKERRQ(ierr); 1340 count++; 1341 } 1342 } 1343 ierr = PetscDrawCollectiveEnd(draw);CHKERRQ(ierr); 1344 } 1345 PetscFunctionReturn(0); 1346 } 1347 1348 #include <petscdraw.h> 1349 PetscErrorCode MatView_SeqSELL_Draw(Mat A,PetscViewer viewer) 1350 { 1351 PetscDraw draw; 1352 PetscReal xr,yr,xl,yl,h,w; 1353 PetscBool isnull; 1354 PetscErrorCode ierr; 1355 1356 PetscFunctionBegin; 1357 ierr = PetscViewerDrawGetDraw(viewer,0,&draw);CHKERRQ(ierr); 1358 ierr = PetscDrawIsNull(draw,&isnull);CHKERRQ(ierr); 1359 if (isnull) PetscFunctionReturn(0); 1360 1361 xr = A->cmap->n; yr = A->rmap->n; h = yr/10.0; w = xr/10.0; 1362 xr += w; yr += h; xl = -w; yl = -h; 1363 ierr = PetscDrawSetCoordinates(draw,xl,yl,xr,yr);CHKERRQ(ierr); 1364 ierr = PetscObjectCompose((PetscObject)A,"Zoomviewer",(PetscObject)viewer);CHKERRQ(ierr); 1365 ierr = PetscDrawZoom(draw,MatView_SeqSELL_Draw_Zoom,A);CHKERRQ(ierr); 1366 ierr = PetscObjectCompose((PetscObject)A,"Zoomviewer",NULL);CHKERRQ(ierr); 1367 ierr = PetscDrawSave(draw);CHKERRQ(ierr); 1368 PetscFunctionReturn(0); 1369 } 1370 1371 PetscErrorCode MatView_SeqSELL(Mat A,PetscViewer viewer) 1372 { 1373 PetscBool iascii,isbinary,isdraw; 1374 PetscErrorCode ierr; 1375 1376 PetscFunctionBegin; 1377 ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);CHKERRQ(ierr); 1378 ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERBINARY,&isbinary);CHKERRQ(ierr); 1379 ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERDRAW,&isdraw);CHKERRQ(ierr); 1380 if (iascii) { 1381 ierr = MatView_SeqSELL_ASCII(A,viewer);CHKERRQ(ierr); 1382 } else if (isbinary) { 1383 /* ierr = MatView_SeqSELL_Binary(A,viewer);CHKERRQ(ierr); */ 1384 } else if (isdraw) { 1385 ierr = MatView_SeqSELL_Draw(A,viewer);CHKERRQ(ierr); 1386 } 1387 PetscFunctionReturn(0); 1388 } 1389 1390 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A,MatAssemblyType mode) 1391 { 1392 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1393 PetscInt i,shift,row_in_slice,row,nrow,*cp,lastcol,j,k; 1394 MatScalar *vp; 1395 PetscErrorCode ierr; 1396 1397 PetscFunctionBegin; 1398 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 1399 /* To do: compress out the unused elements */ 1400 ierr = MatMarkDiagonal_SeqSELL(A);CHKERRQ(ierr); 1401 ierr = PetscInfo6(A,"Matrix size: %D X %D; storage space: %D allocated %D used (%D nonzeros+%D paddedzeros)\n",A->rmap->n,A->cmap->n,a->maxallocmat,a->sliidx[a->totalslices],a->nz,a->sliidx[a->totalslices]-a->nz);CHKERRQ(ierr); 1402 ierr = PetscInfo1(A,"Number of mallocs during MatSetValues() is %D\n",a->reallocs);CHKERRQ(ierr); 1403 ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rlenmax);CHKERRQ(ierr); 1404 /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */ 1405 for (i=0; i<a->totalslices; ++i) { 1406 shift = a->sliidx[i]; /* starting index of the slice */ 1407 cp = a->colidx+shift; /* pointer to the column indices of the slice */ 1408 vp = a->val+shift; /* pointer to the nonzero values of the slice */ 1409 for (row_in_slice=0; row_in_slice<8; ++row_in_slice) { /* loop over rows in the slice */ 1410 row = 8*i + row_in_slice; 1411 nrow = a->rlen[row]; /* number of nonzeros in row */ 1412 /* 1413 Search for the nearest nonzero. Normally setting the index to zero may cause extra communication. 1414 But if the entire slice are empty, it is fine to use 0 since the index will not be loaded. 1415 */ 1416 lastcol = 0; 1417 if (nrow>0) { /* nonempty row */ 1418 lastcol = cp[8*(nrow-1)+row_in_slice]; /* use the index from the last nonzero at current row */ 1419 } else if (!row_in_slice) { /* first row of the currect slice is empty */ 1420 for (j=1;j<8;j++) { 1421 if (a->rlen[8*i+j]) { 1422 lastcol = cp[j]; 1423 break; 1424 } 1425 } 1426 } else { 1427 if (a->sliidx[i+1] != shift) lastcol = cp[row_in_slice-1]; /* use the index from the previous row */ 1428 } 1429 1430 for (k=nrow; k<(a->sliidx[i+1]-shift)/8; ++k) { 1431 cp[8*k+row_in_slice] = lastcol; 1432 vp[8*k+row_in_slice] = (MatScalar)0; 1433 } 1434 } 1435 } 1436 1437 A->info.mallocs += a->reallocs; 1438 a->reallocs = 0; 1439 1440 ierr = MatSeqSELLInvalidateDiagonal(A);CHKERRQ(ierr); 1441 PetscFunctionReturn(0); 1442 } 1443 1444 PetscErrorCode MatGetInfo_SeqSELL(Mat A,MatInfoType flag,MatInfo *info) 1445 { 1446 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1447 1448 PetscFunctionBegin; 1449 info->block_size = 1.0; 1450 info->nz_allocated = a->maxallocmat; 1451 info->nz_used = a->sliidx[a->totalslices]; /* include padding zeros */ 1452 info->nz_unneeded = (a->maxallocmat-a->sliidx[a->totalslices]); 1453 info->assemblies = A->num_ass; 1454 info->mallocs = A->info.mallocs; 1455 info->memory = ((PetscObject)A)->mem; 1456 if (A->factortype) { 1457 info->fill_ratio_given = A->info.fill_ratio_given; 1458 info->fill_ratio_needed = A->info.fill_ratio_needed; 1459 info->factor_mallocs = A->info.factor_mallocs; 1460 } else { 1461 info->fill_ratio_given = 0; 1462 info->fill_ratio_needed = 0; 1463 info->factor_mallocs = 0; 1464 } 1465 PetscFunctionReturn(0); 1466 } 1467 1468 PetscErrorCode MatSetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is) 1469 { 1470 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1471 PetscInt shift,i,k,l,low,high,t,ii,row,col,nrow; 1472 PetscInt *cp,nonew=a->nonew,lastcol=-1; 1473 MatScalar *vp,value; 1474 PetscErrorCode ierr; 1475 1476 PetscFunctionBegin; 1477 for (k=0; k<m; k++) { /* loop over added rows */ 1478 row = im[k]; 1479 if (row < 0) continue; 1480 if (PetscUnlikelyDebug(row >= A->rmap->n)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row too large: row %D max %D",row,A->rmap->n-1); 1481 shift = a->sliidx[row>>3]+(row&0x07); /* starting index of the row */ 1482 cp = a->colidx+shift; /* pointer to the row */ 1483 vp = a->val+shift; /* pointer to the row */ 1484 nrow = a->rlen[row]; 1485 low = 0; 1486 high = nrow; 1487 1488 for (l=0; l<n; l++) { /* loop over added columns */ 1489 col = in[l]; 1490 if (col<0) continue; 1491 if (PetscUnlikelyDebug(col >= A->cmap->n)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Col too large: row %D max %D",col,A->cmap->n-1); 1492 if (a->roworiented) { 1493 value = v[l+k*n]; 1494 } else { 1495 value = v[k+l*m]; 1496 } 1497 if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue; 1498 1499 /* search in this row for the specified colmun, i indicates the column to be set */ 1500 if (col <= lastcol) low = 0; 1501 else high = nrow; 1502 lastcol = col; 1503 while (high-low > 5) { 1504 t = (low+high)/2; 1505 if (*(cp+t*8) > col) high = t; 1506 else low = t; 1507 } 1508 for (i=low; i<high; i++) { 1509 if (*(cp+i*8) > col) break; 1510 if (*(cp+i*8) == col) { 1511 if (is == ADD_VALUES) *(vp+i*8) += value; 1512 else *(vp+i*8) = value; 1513 low = i + 1; 1514 goto noinsert; 1515 } 1516 } 1517 if (value == 0.0 && a->ignorezeroentries) goto noinsert; 1518 if (nonew == 1) goto noinsert; 1519 if (nonew == -1) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Inserting a new nonzero (%D, %D) in the matrix", row, col); 1520 /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */ 1521 MatSeqXSELLReallocateSELL(A,A->rmap->n,1,nrow,a->sliidx,row/8,row,col,a->colidx,a->val,cp,vp,nonew,MatScalar); 1522 /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */ 1523 for (ii=nrow-1; ii>=i; ii--) { 1524 *(cp+(ii+1)*8) = *(cp+ii*8); 1525 *(vp+(ii+1)*8) = *(vp+ii*8); 1526 } 1527 a->rlen[row]++; 1528 *(cp+i*8) = col; 1529 *(vp+i*8) = value; 1530 a->nz++; 1531 A->nonzerostate++; 1532 low = i+1; high++; nrow++; 1533 noinsert:; 1534 } 1535 a->rlen[row] = nrow; 1536 } 1537 PetscFunctionReturn(0); 1538 } 1539 1540 PetscErrorCode MatCopy_SeqSELL(Mat A,Mat B,MatStructure str) 1541 { 1542 PetscErrorCode ierr; 1543 1544 PetscFunctionBegin; 1545 /* If the two matrices have the same copy implementation, use fast copy. */ 1546 if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) { 1547 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1548 Mat_SeqSELL *b=(Mat_SeqSELL*)B->data; 1549 1550 if (a->sliidx[a->totalslices] != b->sliidx[b->totalslices]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Number of nonzeros in two matrices are different"); 1551 ierr = PetscArraycpy(b->val,a->val,a->sliidx[a->totalslices]);CHKERRQ(ierr); 1552 } else { 1553 ierr = MatCopy_Basic(A,B,str);CHKERRQ(ierr); 1554 } 1555 PetscFunctionReturn(0); 1556 } 1557 1558 PetscErrorCode MatSetUp_SeqSELL(Mat A) 1559 { 1560 PetscErrorCode ierr; 1561 1562 PetscFunctionBegin; 1563 ierr = MatSeqSELLSetPreallocation(A,PETSC_DEFAULT,0);CHKERRQ(ierr); 1564 PetscFunctionReturn(0); 1565 } 1566 1567 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A,PetscScalar *array[]) 1568 { 1569 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1570 1571 PetscFunctionBegin; 1572 *array = a->val; 1573 PetscFunctionReturn(0); 1574 } 1575 1576 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A,PetscScalar *array[]) 1577 { 1578 PetscFunctionBegin; 1579 PetscFunctionReturn(0); 1580 } 1581 1582 PetscErrorCode MatRealPart_SeqSELL(Mat A) 1583 { 1584 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1585 PetscInt i; 1586 MatScalar *aval=a->val; 1587 1588 PetscFunctionBegin; 1589 for (i=0; i<a->sliidx[a->totalslices]; i++) aval[i]=PetscRealPart(aval[i]); 1590 PetscFunctionReturn(0); 1591 } 1592 1593 PetscErrorCode MatImaginaryPart_SeqSELL(Mat A) 1594 { 1595 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1596 PetscInt i; 1597 MatScalar *aval=a->val; 1598 PetscErrorCode ierr; 1599 1600 PetscFunctionBegin; 1601 for (i=0; i<a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]); 1602 ierr = MatSeqSELLInvalidateDiagonal(A);CHKERRQ(ierr); 1603 PetscFunctionReturn(0); 1604 } 1605 1606 PetscErrorCode MatScale_SeqSELL(Mat inA,PetscScalar alpha) 1607 { 1608 Mat_SeqSELL *a=(Mat_SeqSELL*)inA->data; 1609 MatScalar *aval=a->val; 1610 PetscScalar oalpha=alpha; 1611 PetscBLASInt one=1,size; 1612 PetscErrorCode ierr; 1613 1614 PetscFunctionBegin; 1615 ierr = PetscBLASIntCast(a->sliidx[a->totalslices],&size);CHKERRQ(ierr); 1616 PetscStackCallBLAS("BLASscal",BLASscal_(&size,&oalpha,aval,&one)); 1617 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 1618 ierr = MatSeqSELLInvalidateDiagonal(inA);CHKERRQ(ierr); 1619 PetscFunctionReturn(0); 1620 } 1621 1622 PetscErrorCode MatShift_SeqSELL(Mat Y,PetscScalar a) 1623 { 1624 Mat_SeqSELL *y=(Mat_SeqSELL*)Y->data; 1625 PetscErrorCode ierr; 1626 1627 PetscFunctionBegin; 1628 if (!Y->preallocated || !y->nz) { 1629 ierr = MatSeqSELLSetPreallocation(Y,1,NULL);CHKERRQ(ierr); 1630 } 1631 ierr = MatShift_Basic(Y,a);CHKERRQ(ierr); 1632 PetscFunctionReturn(0); 1633 } 1634 1635 PetscErrorCode MatSOR_SeqSELL(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 1636 { 1637 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1638 PetscScalar *x,sum,*t; 1639 const MatScalar *idiag=0,*mdiag; 1640 const PetscScalar *b,*xb; 1641 PetscInt n,m=A->rmap->n,i,j,shift; 1642 const PetscInt *diag; 1643 PetscErrorCode ierr; 1644 1645 PetscFunctionBegin; 1646 its = its*lits; 1647 1648 if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */ 1649 if (!a->idiagvalid) {ierr = MatInvertDiagonal_SeqSELL(A,omega,fshift);CHKERRQ(ierr);} 1650 a->fshift = fshift; 1651 a->omega = omega; 1652 1653 diag = a->diag; 1654 t = a->ssor_work; 1655 idiag = a->idiag; 1656 mdiag = a->mdiag; 1657 1658 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1659 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1660 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 1661 if (flag == SOR_APPLY_UPPER) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"SOR_APPLY_UPPER is not implemented"); 1662 if (flag == SOR_APPLY_LOWER) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"SOR_APPLY_LOWER is not implemented"); 1663 if (flag & SOR_EISENSTAT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support yet for Eisenstat"); 1664 1665 if (flag & SOR_ZERO_INITIAL_GUESS) { 1666 if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) { 1667 for (i=0; i<m; i++) { 1668 shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */ 1669 sum = b[i]; 1670 n = (diag[i]-shift)/8; 1671 for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]]; 1672 t[i] = sum; 1673 x[i] = sum*idiag[i]; 1674 } 1675 xb = t; 1676 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 1677 } else xb = b; 1678 if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1679 for (i=m-1; i>=0; i--) { 1680 shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */ 1681 sum = xb[i]; 1682 n = a->rlen[i]-(diag[i]-shift)/8-1; 1683 for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]]; 1684 if (xb == b) { 1685 x[i] = sum*idiag[i]; 1686 } else { 1687 x[i] = (1.-omega)*x[i]+sum*idiag[i]; /* omega in idiag */ 1688 } 1689 } 1690 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); /* assumes 1/2 in upper */ 1691 } 1692 its--; 1693 } 1694 while (its--) { 1695 if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) { 1696 for (i=0; i<m; i++) { 1697 /* lower */ 1698 shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */ 1699 sum = b[i]; 1700 n = (diag[i]-shift)/8; 1701 for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]]; 1702 t[i] = sum; /* save application of the lower-triangular part */ 1703 /* upper */ 1704 n = a->rlen[i]-(diag[i]-shift)/8-1; 1705 for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]]; 1706 x[i] = (1.-omega)*x[i]+sum*idiag[i]; /* omega in idiag */ 1707 } 1708 xb = t; 1709 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1710 } else xb = b; 1711 if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1712 for (i=m-1; i>=0; i--) { 1713 shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */ 1714 sum = xb[i]; 1715 if (xb == b) { 1716 /* whole matrix (no checkpointing available) */ 1717 n = a->rlen[i]; 1718 for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]]; 1719 x[i] = (1.-omega)*x[i]+(sum+mdiag[i]*x[i])*idiag[i]; 1720 } else { /* lower-triangular part has been saved, so only apply upper-triangular */ 1721 n = a->rlen[i]-(diag[i]-shift)/8-1; 1722 for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]]; 1723 x[i] = (1.-omega)*x[i]+sum*idiag[i]; /* omega in idiag */ 1724 } 1725 } 1726 if (xb == b) { 1727 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1728 } else { 1729 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); /* assumes 1/2 in upper */ 1730 } 1731 } 1732 } 1733 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1734 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1735 PetscFunctionReturn(0); 1736 } 1737 1738 /* -------------------------------------------------------------------*/ 1739 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL, 1740 MatGetRow_SeqSELL, 1741 MatRestoreRow_SeqSELL, 1742 MatMult_SeqSELL, 1743 /* 4*/ MatMultAdd_SeqSELL, 1744 MatMultTranspose_SeqSELL, 1745 MatMultTransposeAdd_SeqSELL, 1746 0, 1747 0, 1748 0, 1749 /* 10*/ 0, 1750 0, 1751 0, 1752 MatSOR_SeqSELL, 1753 0, 1754 /* 15*/ MatGetInfo_SeqSELL, 1755 MatEqual_SeqSELL, 1756 MatGetDiagonal_SeqSELL, 1757 MatDiagonalScale_SeqSELL, 1758 0, 1759 /* 20*/ 0, 1760 MatAssemblyEnd_SeqSELL, 1761 MatSetOption_SeqSELL, 1762 MatZeroEntries_SeqSELL, 1763 /* 24*/ 0, 1764 0, 1765 0, 1766 0, 1767 0, 1768 /* 29*/ MatSetUp_SeqSELL, 1769 0, 1770 0, 1771 0, 1772 0, 1773 /* 34*/ MatDuplicate_SeqSELL, 1774 0, 1775 0, 1776 0, 1777 0, 1778 /* 39*/ 0, 1779 0, 1780 0, 1781 MatGetValues_SeqSELL, 1782 MatCopy_SeqSELL, 1783 /* 44*/ 0, 1784 MatScale_SeqSELL, 1785 MatShift_SeqSELL, 1786 0, 1787 0, 1788 /* 49*/ 0, 1789 0, 1790 0, 1791 0, 1792 0, 1793 /* 54*/ MatFDColoringCreate_SeqXAIJ, 1794 0, 1795 0, 1796 0, 1797 0, 1798 /* 59*/ 0, 1799 MatDestroy_SeqSELL, 1800 MatView_SeqSELL, 1801 0, 1802 0, 1803 /* 64*/ 0, 1804 0, 1805 0, 1806 0, 1807 0, 1808 /* 69*/ 0, 1809 0, 1810 0, 1811 0, 1812 0, 1813 /* 74*/ 0, 1814 MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */ 1815 0, 1816 0, 1817 0, 1818 /* 79*/ 0, 1819 0, 1820 0, 1821 0, 1822 0, 1823 /* 84*/ 0, 1824 0, 1825 0, 1826 0, 1827 0, 1828 /* 89*/ 0, 1829 0, 1830 0, 1831 0, 1832 0, 1833 /* 94*/ 0, 1834 0, 1835 0, 1836 0, 1837 0, 1838 /* 99*/ 0, 1839 0, 1840 0, 1841 MatConjugate_SeqSELL, 1842 0, 1843 /*104*/ 0, 1844 0, 1845 0, 1846 0, 1847 0, 1848 /*109*/ 0, 1849 0, 1850 0, 1851 0, 1852 MatMissingDiagonal_SeqSELL, 1853 /*114*/ 0, 1854 0, 1855 0, 1856 0, 1857 0, 1858 /*119*/ 0, 1859 0, 1860 0, 1861 0, 1862 0, 1863 /*124*/ 0, 1864 0, 1865 0, 1866 0, 1867 0, 1868 /*129*/ 0, 1869 0, 1870 0, 1871 0, 1872 0, 1873 /*134*/ 0, 1874 0, 1875 0, 1876 0, 1877 0, 1878 /*139*/ 0, 1879 0, 1880 0, 1881 MatFDColoringSetUp_SeqXAIJ, 1882 0, 1883 /*144*/0 1884 }; 1885 1886 PetscErrorCode MatStoreValues_SeqSELL(Mat mat) 1887 { 1888 Mat_SeqSELL *a=(Mat_SeqSELL*)mat->data; 1889 PetscErrorCode ierr; 1890 1891 PetscFunctionBegin; 1892 if (!a->nonew) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first"); 1893 1894 /* allocate space for values if not already there */ 1895 if (!a->saved_values) { 1896 ierr = PetscMalloc1(a->sliidx[a->totalslices]+1,&a->saved_values);CHKERRQ(ierr); 1897 ierr = PetscLogObjectMemory((PetscObject)mat,(a->sliidx[a->totalslices]+1)*sizeof(PetscScalar));CHKERRQ(ierr); 1898 } 1899 1900 /* copy values over */ 1901 ierr = PetscArraycpy(a->saved_values,a->val,a->sliidx[a->totalslices]);CHKERRQ(ierr); 1902 PetscFunctionReturn(0); 1903 } 1904 1905 PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat) 1906 { 1907 Mat_SeqSELL *a=(Mat_SeqSELL*)mat->data; 1908 PetscErrorCode ierr; 1909 1910 PetscFunctionBegin; 1911 if (!a->nonew) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first"); 1912 if (!a->saved_values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatStoreValues(A);first"); 1913 ierr = PetscArraycpy(a->val,a->saved_values,a->sliidx[a->totalslices]);CHKERRQ(ierr); 1914 PetscFunctionReturn(0); 1915 } 1916 1917 /*@C 1918 MatSeqSELLRestoreArray - returns access to the array where the data for a MATSEQSELL matrix is stored obtained by MatSeqSELLGetArray() 1919 1920 Not Collective 1921 1922 Input Parameters: 1923 . mat - a MATSEQSELL matrix 1924 . array - pointer to the data 1925 1926 Level: intermediate 1927 1928 .seealso: MatSeqSELLGetArray(), MatSeqSELLRestoreArrayF90() 1929 @*/ 1930 PetscErrorCode MatSeqSELLRestoreArray(Mat A,PetscScalar **array) 1931 { 1932 PetscErrorCode ierr; 1933 1934 PetscFunctionBegin; 1935 ierr = PetscUseMethod(A,"MatSeqSELLRestoreArray_C",(Mat,PetscScalar**),(A,array));CHKERRQ(ierr); 1936 PetscFunctionReturn(0); 1937 } 1938 1939 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B) 1940 { 1941 Mat_SeqSELL *b; 1942 PetscMPIInt size; 1943 PetscErrorCode ierr; 1944 1945 PetscFunctionBegin; 1946 ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRQ(ierr); 1947 if (size > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Comm must be of size 1"); 1948 1949 ierr = PetscNewLog(B,&b);CHKERRQ(ierr); 1950 1951 B->data = (void*)b; 1952 1953 ierr = PetscMemcpy(B->ops,&MatOps_Values,sizeof(struct _MatOps));CHKERRQ(ierr); 1954 1955 b->row = 0; 1956 b->col = 0; 1957 b->icol = 0; 1958 b->reallocs = 0; 1959 b->ignorezeroentries = PETSC_FALSE; 1960 b->roworiented = PETSC_TRUE; 1961 b->nonew = 0; 1962 b->diag = 0; 1963 b->solve_work = 0; 1964 B->spptr = 0; 1965 b->saved_values = 0; 1966 b->idiag = 0; 1967 b->mdiag = 0; 1968 b->ssor_work = 0; 1969 b->omega = 1.0; 1970 b->fshift = 0.0; 1971 b->idiagvalid = PETSC_FALSE; 1972 b->keepnonzeropattern = PETSC_FALSE; 1973 1974 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQSELL);CHKERRQ(ierr); 1975 ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLGetArray_C",MatSeqSELLGetArray_SeqSELL);CHKERRQ(ierr); 1976 ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLRestoreArray_C",MatSeqSELLRestoreArray_SeqSELL);CHKERRQ(ierr); 1977 ierr = PetscObjectComposeFunction((PetscObject)B,"MatStoreValues_C",MatStoreValues_SeqSELL);CHKERRQ(ierr); 1978 ierr = PetscObjectComposeFunction((PetscObject)B,"MatRetrieveValues_C",MatRetrieveValues_SeqSELL);CHKERRQ(ierr); 1979 ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLSetPreallocation_C",MatSeqSELLSetPreallocation_SeqSELL);CHKERRQ(ierr); 1980 ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqsell_seqaij_C",MatConvert_SeqSELL_SeqAIJ);CHKERRQ(ierr); 1981 PetscFunctionReturn(0); 1982 } 1983 1984 /* 1985 Given a matrix generated with MatGetFactor() duplicates all the information in A into B 1986 */ 1987 PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C,Mat A,MatDuplicateOption cpvalues,PetscBool mallocmatspace) 1988 { 1989 Mat_SeqSELL *c,*a=(Mat_SeqSELL*)A->data; 1990 PetscInt i,m=A->rmap->n; 1991 PetscInt totalslices=a->totalslices; 1992 PetscErrorCode ierr; 1993 1994 PetscFunctionBegin; 1995 c = (Mat_SeqSELL*)C->data; 1996 1997 C->factortype = A->factortype; 1998 c->row = 0; 1999 c->col = 0; 2000 c->icol = 0; 2001 c->reallocs = 0; 2002 2003 C->assembled = PETSC_TRUE; 2004 2005 ierr = PetscLayoutReference(A->rmap,&C->rmap);CHKERRQ(ierr); 2006 ierr = PetscLayoutReference(A->cmap,&C->cmap);CHKERRQ(ierr); 2007 2008 ierr = PetscMalloc1(8*totalslices,&c->rlen);CHKERRQ(ierr); 2009 ierr = PetscLogObjectMemory((PetscObject)C,m*sizeof(PetscInt));CHKERRQ(ierr); 2010 ierr = PetscMalloc1(totalslices+1,&c->sliidx);CHKERRQ(ierr); 2011 ierr = PetscLogObjectMemory((PetscObject)C, (totalslices+1)*sizeof(PetscInt));CHKERRQ(ierr); 2012 2013 for (i=0; i<m; i++) c->rlen[i] = a->rlen[i]; 2014 for (i=0; i<totalslices+1; i++) c->sliidx[i] = a->sliidx[i]; 2015 2016 /* allocate the matrix space */ 2017 if (mallocmatspace) { 2018 ierr = PetscMalloc2(a->maxallocmat,&c->val,a->maxallocmat,&c->colidx);CHKERRQ(ierr); 2019 ierr = PetscLogObjectMemory((PetscObject)C,a->maxallocmat*(sizeof(PetscScalar)+sizeof(PetscInt)));CHKERRQ(ierr); 2020 2021 c->singlemalloc = PETSC_TRUE; 2022 2023 if (m > 0) { 2024 ierr = PetscArraycpy(c->colidx,a->colidx,a->maxallocmat);CHKERRQ(ierr); 2025 if (cpvalues == MAT_COPY_VALUES) { 2026 ierr = PetscArraycpy(c->val,a->val,a->maxallocmat);CHKERRQ(ierr); 2027 } else { 2028 ierr = PetscArrayzero(c->val,a->maxallocmat);CHKERRQ(ierr); 2029 } 2030 } 2031 } 2032 2033 c->ignorezeroentries = a->ignorezeroentries; 2034 c->roworiented = a->roworiented; 2035 c->nonew = a->nonew; 2036 if (a->diag) { 2037 ierr = PetscMalloc1(m,&c->diag);CHKERRQ(ierr); 2038 ierr = PetscLogObjectMemory((PetscObject)C,m*sizeof(PetscInt));CHKERRQ(ierr); 2039 for (i=0; i<m; i++) { 2040 c->diag[i] = a->diag[i]; 2041 } 2042 } else c->diag = 0; 2043 2044 c->solve_work = 0; 2045 c->saved_values = 0; 2046 c->idiag = 0; 2047 c->ssor_work = 0; 2048 c->keepnonzeropattern = a->keepnonzeropattern; 2049 c->free_val = PETSC_TRUE; 2050 c->free_colidx = PETSC_TRUE; 2051 2052 c->maxallocmat = a->maxallocmat; 2053 c->maxallocrow = a->maxallocrow; 2054 c->rlenmax = a->rlenmax; 2055 c->nz = a->nz; 2056 C->preallocated = PETSC_TRUE; 2057 2058 c->nonzerorowcnt = a->nonzerorowcnt; 2059 C->nonzerostate = A->nonzerostate; 2060 2061 ierr = PetscFunctionListDuplicate(((PetscObject)A)->qlist,&((PetscObject)C)->qlist);CHKERRQ(ierr); 2062 PetscFunctionReturn(0); 2063 } 2064 2065 PetscErrorCode MatDuplicate_SeqSELL(Mat A,MatDuplicateOption cpvalues,Mat *B) 2066 { 2067 PetscErrorCode ierr; 2068 2069 PetscFunctionBegin; 2070 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 2071 ierr = MatSetSizes(*B,A->rmap->n,A->cmap->n,A->rmap->n,A->cmap->n);CHKERRQ(ierr); 2072 if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) { 2073 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 2074 } 2075 ierr = MatSetType(*B,((PetscObject)A)->type_name);CHKERRQ(ierr); 2076 ierr = MatDuplicateNoCreate_SeqSELL(*B,A,cpvalues,PETSC_TRUE);CHKERRQ(ierr); 2077 PetscFunctionReturn(0); 2078 } 2079 2080 /*@C 2081 MatCreateSeqSELL - Creates a sparse matrix in SELL format. 2082 2083 Collective 2084 2085 Input Parameters: 2086 + comm - MPI communicator, set to PETSC_COMM_SELF 2087 . m - number of rows 2088 . n - number of columns 2089 . rlenmax - maximum number of nonzeros in a row 2090 - rlen - array containing the number of nonzeros in the various rows 2091 (possibly different for each row) or NULL 2092 2093 Output Parameter: 2094 . A - the matrix 2095 2096 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 2097 MatXXXXSetPreallocation() paradigm instead of this routine directly. 2098 [MatXXXXSetPreallocation() is, for example, MatSeqSELLSetPreallocation] 2099 2100 Notes: 2101 If nnz is given then nz is ignored 2102 2103 Specify the preallocated storage with either rlenmax or rlen (not both). 2104 Set rlenmax=PETSC_DEFAULT and rlen=NULL for PETSc to control dynamic memory 2105 allocation. For large problems you MUST preallocate memory or you 2106 will get TERRIBLE performance, see the users' manual chapter on matrices. 2107 2108 Level: intermediate 2109 2110 .seealso: MatCreate(), MatCreateSELL(), MatSetValues() 2111 2112 @*/ 2113 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt maxallocrow,const PetscInt rlen[],Mat *A) 2114 { 2115 PetscErrorCode ierr; 2116 2117 PetscFunctionBegin; 2118 ierr = MatCreate(comm,A);CHKERRQ(ierr); 2119 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 2120 ierr = MatSetType(*A,MATSEQSELL);CHKERRQ(ierr); 2121 ierr = MatSeqSELLSetPreallocation_SeqSELL(*A,maxallocrow,rlen);CHKERRQ(ierr); 2122 PetscFunctionReturn(0); 2123 } 2124 2125 PetscErrorCode MatEqual_SeqSELL(Mat A,Mat B,PetscBool * flg) 2126 { 2127 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data,*b=(Mat_SeqSELL*)B->data; 2128 PetscInt totalslices=a->totalslices; 2129 PetscErrorCode ierr; 2130 2131 PetscFunctionBegin; 2132 /* If the matrix dimensions are not equal,or no of nonzeros */ 2133 if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) ||(a->nz != b->nz) || (a->rlenmax != b->rlenmax)) { 2134 *flg = PETSC_FALSE; 2135 PetscFunctionReturn(0); 2136 } 2137 /* if the a->colidx are the same */ 2138 ierr = PetscArraycmp(a->colidx,b->colidx,a->sliidx[totalslices],flg);CHKERRQ(ierr); 2139 if (!*flg) PetscFunctionReturn(0); 2140 /* if a->val are the same */ 2141 ierr = PetscArraycmp(a->val,b->val,a->sliidx[totalslices],flg);CHKERRQ(ierr); 2142 PetscFunctionReturn(0); 2143 } 2144 2145 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A) 2146 { 2147 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 2148 2149 PetscFunctionBegin; 2150 a->idiagvalid = PETSC_FALSE; 2151 PetscFunctionReturn(0); 2152 } 2153 2154 PetscErrorCode MatConjugate_SeqSELL(Mat A) 2155 { 2156 #if defined(PETSC_USE_COMPLEX) 2157 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 2158 PetscInt i; 2159 PetscScalar *val = a->val; 2160 2161 PetscFunctionBegin; 2162 for (i=0; i<a->sliidx[a->totalslices]; i++) { 2163 val[i] = PetscConj(val[i]); 2164 } 2165 #else 2166 PetscFunctionBegin; 2167 #endif 2168 PetscFunctionReturn(0); 2169 } 2170