1 2 /* 3 Defines the basic matrix operations for the SELL matrix storage format. 4 */ 5 #include <../src/mat/impls/sell/seq/sell.h> /*I "petscmat.h" I*/ 6 #include <petscblaslapack.h> 7 #include <petsc/private/kernels/blocktranspose.h> 8 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 9 10 #include <immintrin.h> 11 12 #if !defined(_MM_SCALE_8) 13 #define _MM_SCALE_8 8 14 #endif 15 16 #if defined(__AVX512F__) 17 /* these do not work 18 vec_idx = _mm512_loadunpackhi_epi32(vec_idx,acolidx); 19 vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval); 20 */ 21 #define AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y) \ 22 /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \ 23 vec_idx = _mm256_loadu_si256((__m256i const*)acolidx); \ 24 vec_vals = _mm512_loadu_pd(aval); \ 25 vec_x = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8); \ 26 vec_y = _mm512_fmadd_pd(vec_x,vec_vals,vec_y) 27 #elif defined(__AVX2__) && defined(__FMA__) 28 #define AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y) \ 29 vec_vals = _mm256_loadu_pd(aval); \ 30 vec_idx = _mm_loadu_si128((__m128i const*)acolidx); /* SSE2 */ \ 31 vec_x = _mm256_i32gather_pd(x,vec_idx,_MM_SCALE_8); \ 32 vec_y = _mm256_fmadd_pd(vec_x,vec_vals,vec_y) 33 #endif 34 #endif /* PETSC_HAVE_IMMINTRIN_H */ 35 36 /*@C 37 MatSeqSELLSetPreallocation - For good matrix assembly performance 38 the user should preallocate the matrix storage by setting the parameter nz 39 (or the array nnz). By setting these parameters accurately, performance 40 during matrix assembly can be increased significantly. 41 42 Collective 43 44 Input Parameters: 45 + B - The matrix 46 . nz - number of nonzeros per row (same for all rows) 47 - nnz - array containing the number of nonzeros in the various rows 48 (possibly different for each row) or NULL 49 50 Notes: 51 If nnz is given then nz is ignored. 52 53 Specify the preallocated storage with either nz or nnz (not both). 54 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 55 allocation. For large problems you MUST preallocate memory or you 56 will get TERRIBLE performance, see the users' manual chapter on matrices. 57 58 You can call MatGetInfo() to get information on how effective the preallocation was; 59 for example the fields mallocs,nz_allocated,nz_used,nz_unneeded; 60 You can also run with the option -info and look for messages with the string 61 malloc in them to see if additional memory allocation was needed. 62 63 Developers: Use nz of MAT_SKIP_ALLOCATION to not allocate any space for the matrix 64 entries or columns indices. 65 66 The maximum number of nonzeos in any row should be as accurate as possible. 67 If it is underestimated, you will get bad performance due to reallocation 68 (MatSeqXSELLReallocateSELL). 69 70 Level: intermediate 71 72 .seealso: MatCreate(), MatCreateSELL(), MatSetValues(), MatGetInfo() 73 74 @*/ 75 PetscErrorCode MatSeqSELLSetPreallocation(Mat B,PetscInt rlenmax,const PetscInt rlen[]) 76 { 77 PetscErrorCode ierr; 78 79 PetscFunctionBegin; 80 PetscValidHeaderSpecific(B,MAT_CLASSID,1); 81 PetscValidType(B,1); 82 ierr = PetscTryMethod(B,"MatSeqSELLSetPreallocation_C",(Mat,PetscInt,const PetscInt[]),(B,rlenmax,rlen));CHKERRQ(ierr); 83 PetscFunctionReturn(0); 84 } 85 86 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B,PetscInt maxallocrow,const PetscInt rlen[]) 87 { 88 Mat_SeqSELL *b; 89 PetscInt i,j,totalslices; 90 PetscBool skipallocation=PETSC_FALSE,realalloc=PETSC_FALSE; 91 PetscErrorCode ierr; 92 93 PetscFunctionBegin; 94 if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE; 95 if (maxallocrow == MAT_SKIP_ALLOCATION) { 96 skipallocation = PETSC_TRUE; 97 maxallocrow = 0; 98 } 99 100 ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr); 101 ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr); 102 103 /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */ 104 if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5; 105 if (maxallocrow < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"maxallocrow cannot be less than 0: value %D",maxallocrow); 106 if (rlen) { 107 for (i=0; i<B->rmap->n; i++) { 108 if (rlen[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"rlen cannot be less than 0: local row %D value %D",i,rlen[i]); 109 if (rlen[i] > B->cmap->n) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"rlen cannot be greater than row length: local row %D value %D rowlength %D",i,rlen[i],B->cmap->n); 110 } 111 } 112 113 B->preallocated = PETSC_TRUE; 114 115 b = (Mat_SeqSELL*)B->data; 116 117 totalslices = B->rmap->n/8+((B->rmap->n & 0x07)?1:0); /* ceil(n/8) */ 118 b->totalslices = totalslices; 119 if (!skipallocation) { 120 if (B->rmap->n & 0x07) {ierr = PetscInfo1(B,"Padding rows to the SEQSELL matrix because the number of rows is not the multiple of 8 (value %D)\n",B->rmap->n);CHKERRQ(ierr);} 121 122 if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */ 123 ierr = PetscMalloc1(totalslices+1,&b->sliidx);CHKERRQ(ierr); 124 ierr = PetscLogObjectMemory((PetscObject)B,(totalslices+1)*sizeof(PetscInt));CHKERRQ(ierr); 125 } 126 if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */ 127 if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10; 128 else if (maxallocrow < 0) maxallocrow = 1; 129 for (i=0; i<=totalslices; i++) b->sliidx[i] = i*8*maxallocrow; 130 } else { 131 maxallocrow = 0; 132 b->sliidx[0] = 0; 133 for (i=1; i<totalslices; i++) { 134 b->sliidx[i] = 0; 135 for (j=0;j<8;j++) { 136 b->sliidx[i] = PetscMax(b->sliidx[i],rlen[8*(i-1)+j]); 137 } 138 maxallocrow = PetscMax(b->sliidx[i],maxallocrow); 139 ierr = PetscIntSumError(b->sliidx[i-1],8*b->sliidx[i],&b->sliidx[i]);CHKERRQ(ierr); 140 } 141 /* last slice */ 142 b->sliidx[totalslices] = 0; 143 for (j=(totalslices-1)*8;j<B->rmap->n;j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices],rlen[j]); 144 maxallocrow = PetscMax(b->sliidx[totalslices],maxallocrow); 145 b->sliidx[totalslices] = b->sliidx[totalslices-1] + 8*b->sliidx[totalslices]; 146 } 147 148 /* allocate space for val, colidx, rlen */ 149 /* FIXME: should B's old memory be unlogged? */ 150 ierr = MatSeqXSELLFreeSELL(B,&b->val,&b->colidx);CHKERRQ(ierr); 151 /* FIXME: assuming an element of the bit array takes 8 bits */ 152 ierr = PetscMalloc2(b->sliidx[totalslices],&b->val,b->sliidx[totalslices],&b->colidx);CHKERRQ(ierr); 153 ierr = PetscLogObjectMemory((PetscObject)B,b->sliidx[totalslices]*(sizeof(PetscScalar)+sizeof(PetscInt)));CHKERRQ(ierr); 154 /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */ 155 ierr = PetscCalloc1(8*totalslices,&b->rlen);CHKERRQ(ierr); 156 ierr = PetscLogObjectMemory((PetscObject)B,8*totalslices*sizeof(PetscInt));CHKERRQ(ierr); 157 158 b->singlemalloc = PETSC_TRUE; 159 b->free_val = PETSC_TRUE; 160 b->free_colidx = PETSC_TRUE; 161 } else { 162 b->free_val = PETSC_FALSE; 163 b->free_colidx = PETSC_FALSE; 164 } 165 166 b->nz = 0; 167 b->maxallocrow = maxallocrow; 168 b->rlenmax = maxallocrow; 169 b->maxallocmat = b->sliidx[totalslices]; 170 B->info.nz_unneeded = (double)b->maxallocmat; 171 if (realalloc) { 172 ierr = MatSetOption(B,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr); 173 } 174 PetscFunctionReturn(0); 175 } 176 177 PetscErrorCode MatGetRow_SeqSELL(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v) 178 { 179 Mat_SeqSELL *a = (Mat_SeqSELL*)A->data; 180 PetscInt shift; 181 182 PetscFunctionBegin; 183 if (row < 0 || row >= A->rmap->n) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row %D out of range",row); 184 if (nz) *nz = a->rlen[row]; 185 shift = a->sliidx[row>>3]+(row&0x07); 186 if (!a->getrowcols) { 187 PetscErrorCode ierr; 188 189 ierr = PetscMalloc2(a->rlenmax,&a->getrowcols,a->rlenmax,&a->getrowvals);CHKERRQ(ierr); 190 } 191 if (idx) { 192 PetscInt j; 193 for (j=0; j<a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift+8*j]; 194 *idx = a->getrowcols; 195 } 196 if (v) { 197 PetscInt j; 198 for (j=0; j<a->rlen[row]; j++) a->getrowvals[j] = a->val[shift+8*j]; 199 *v = a->getrowvals; 200 } 201 PetscFunctionReturn(0); 202 } 203 204 PetscErrorCode MatRestoreRow_SeqSELL(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v) 205 { 206 PetscFunctionBegin; 207 PetscFunctionReturn(0); 208 } 209 210 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype,MatReuse reuse,Mat *newmat) 211 { 212 Mat B; 213 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 214 PetscInt i; 215 PetscErrorCode ierr; 216 217 PetscFunctionBegin; 218 if (reuse == MAT_REUSE_MATRIX) { 219 B = *newmat; 220 ierr = MatZeroEntries(B);CHKERRQ(ierr); 221 } else { 222 ierr = MatCreate(PetscObjectComm((PetscObject)A),&B);CHKERRQ(ierr); 223 ierr = MatSetSizes(B,A->rmap->n,A->cmap->n,A->rmap->N,A->cmap->N);CHKERRQ(ierr); 224 ierr = MatSetType(B,MATSEQAIJ);CHKERRQ(ierr); 225 ierr = MatSeqAIJSetPreallocation(B,0,a->rlen);CHKERRQ(ierr); 226 } 227 228 for (i=0; i<A->rmap->n; i++) { 229 PetscInt nz = 0,*cols = NULL; 230 PetscScalar *vals = NULL; 231 232 ierr = MatGetRow_SeqSELL(A,i,&nz,&cols,&vals);CHKERRQ(ierr); 233 ierr = MatSetValues(B,1,&i,nz,cols,vals,INSERT_VALUES);CHKERRQ(ierr); 234 ierr = MatRestoreRow_SeqSELL(A,i,&nz,&cols,&vals);CHKERRQ(ierr); 235 } 236 237 ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 238 ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 239 B->rmap->bs = A->rmap->bs; 240 241 if (reuse == MAT_INPLACE_MATRIX) { 242 ierr = MatHeaderReplace(A,&B);CHKERRQ(ierr); 243 } else { 244 *newmat = B; 245 } 246 PetscFunctionReturn(0); 247 } 248 249 #include <../src/mat/impls/aij/seq/aij.h> 250 251 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A,MatType newtype,MatReuse reuse,Mat *newmat) 252 { 253 Mat B; 254 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 255 PetscInt *ai=a->i,m=A->rmap->N,n=A->cmap->N,i,*rowlengths,row,ncols; 256 const PetscInt *cols; 257 const PetscScalar *vals; 258 PetscErrorCode ierr; 259 260 PetscFunctionBegin; 261 262 if (reuse == MAT_REUSE_MATRIX) { 263 B = *newmat; 264 } else { 265 /* Can we just use ilen? */ 266 ierr = PetscMalloc1(m,&rowlengths);CHKERRQ(ierr); 267 for (i=0; i<m; i++) { 268 rowlengths[i] = ai[i+1] - ai[i]; 269 } 270 271 ierr = MatCreate(PetscObjectComm((PetscObject)A),&B);CHKERRQ(ierr); 272 ierr = MatSetSizes(B,m,n,m,n);CHKERRQ(ierr); 273 ierr = MatSetType(B,MATSEQSELL);CHKERRQ(ierr); 274 ierr = MatSeqSELLSetPreallocation(B,0,rowlengths);CHKERRQ(ierr); 275 ierr = PetscFree(rowlengths);CHKERRQ(ierr); 276 } 277 278 for (row=0; row<m; row++) { 279 ierr = MatGetRow(A,row,&ncols,&cols,&vals);CHKERRQ(ierr); 280 ierr = MatSetValues(B,1,&row,ncols,cols,vals,INSERT_VALUES);CHKERRQ(ierr); 281 ierr = MatRestoreRow(A,row,&ncols,&cols,&vals);CHKERRQ(ierr); 282 } 283 ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 284 ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 285 B->rmap->bs = A->rmap->bs; 286 287 if (reuse == MAT_INPLACE_MATRIX) { 288 ierr = MatHeaderReplace(A,&B);CHKERRQ(ierr); 289 } else { 290 *newmat = B; 291 } 292 PetscFunctionReturn(0); 293 } 294 295 PetscErrorCode MatMult_SeqSELL(Mat A,Vec xx,Vec yy) 296 { 297 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 298 PetscScalar *y; 299 const PetscScalar *x; 300 const MatScalar *aval=a->val; 301 PetscInt totalslices=a->totalslices; 302 const PetscInt *acolidx=a->colidx; 303 PetscInt i,j; 304 PetscErrorCode ierr; 305 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 306 __m512d vec_x,vec_y,vec_vals; 307 __m256i vec_idx; 308 __mmask8 mask; 309 __m512d vec_x2,vec_y2,vec_vals2,vec_x3,vec_y3,vec_vals3,vec_x4,vec_y4,vec_vals4; 310 __m256i vec_idx2,vec_idx3,vec_idx4; 311 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 312 __m128i vec_idx; 313 __m256d vec_x,vec_y,vec_y2,vec_vals; 314 MatScalar yval; 315 PetscInt r,rows_left,row,nnz_in_row; 316 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 317 __m128d vec_x_tmp; 318 __m256d vec_x,vec_y,vec_y2,vec_vals; 319 MatScalar yval; 320 PetscInt r,rows_left,row,nnz_in_row; 321 #else 322 PetscScalar sum[8]; 323 #endif 324 325 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 326 #pragma disjoint(*x,*y,*aval) 327 #endif 328 329 PetscFunctionBegin; 330 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 331 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 332 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 333 for (i=0; i<totalslices; i++) { /* loop over slices */ 334 PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 335 PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 336 337 vec_y = _mm512_setzero_pd(); 338 vec_y2 = _mm512_setzero_pd(); 339 vec_y3 = _mm512_setzero_pd(); 340 vec_y4 = _mm512_setzero_pd(); 341 342 j = a->sliidx[i]>>3; /* 8 bytes are read at each time, corresponding to a slice columnn */ 343 switch ((a->sliidx[i+1]-a->sliidx[i])/8 & 3) { 344 case 3: 345 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 346 acolidx += 8; aval += 8; 347 AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2); 348 acolidx += 8; aval += 8; 349 AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3); 350 acolidx += 8; aval += 8; 351 j += 3; 352 break; 353 case 2: 354 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 355 acolidx += 8; aval += 8; 356 AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2); 357 acolidx += 8; aval += 8; 358 j += 2; 359 break; 360 case 1: 361 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 362 acolidx += 8; aval += 8; 363 j += 1; 364 break; 365 } 366 #pragma novector 367 for (; j<(a->sliidx[i+1]>>3); j+=4) { 368 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 369 acolidx += 8; aval += 8; 370 AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2); 371 acolidx += 8; aval += 8; 372 AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3); 373 acolidx += 8; aval += 8; 374 AVX512_Mult_Private(vec_idx4,vec_x4,vec_vals4,vec_y4); 375 acolidx += 8; aval += 8; 376 } 377 378 vec_y = _mm512_add_pd(vec_y,vec_y2); 379 vec_y = _mm512_add_pd(vec_y,vec_y3); 380 vec_y = _mm512_add_pd(vec_y,vec_y4); 381 if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 382 mask = (__mmask8)(0xff >> (8-(A->rmap->n & 0x07))); 383 _mm512_mask_storeu_pd(&y[8*i],mask,vec_y); 384 } else { 385 _mm512_storeu_pd(&y[8*i],vec_y); 386 } 387 } 388 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 389 for (i=0; i<totalslices; i++) { /* loop over full slices */ 390 PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 391 PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 392 393 /* last slice may have padding rows. Don't use vectorization. */ 394 if (i == totalslices-1 && (A->rmap->n & 0x07)) { 395 rows_left = A->rmap->n - 8*i; 396 for (r=0; r<rows_left; ++r) { 397 yval = (MatScalar)0; 398 row = 8*i + r; 399 nnz_in_row = a->rlen[row]; 400 for (j=0; j<nnz_in_row; ++j) yval += aval[8*j+r] * x[acolidx[8*j+r]]; 401 y[row] = yval; 402 } 403 break; 404 } 405 406 vec_y = _mm256_setzero_pd(); 407 vec_y2 = _mm256_setzero_pd(); 408 409 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 410 #pragma novector 411 #pragma unroll(2) 412 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) { 413 AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 414 aval += 4; acolidx += 4; 415 AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y2); 416 aval += 4; acolidx += 4; 417 } 418 419 _mm256_storeu_pd(y+i*8,vec_y); 420 _mm256_storeu_pd(y+i*8+4,vec_y2); 421 } 422 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 423 for (i=0; i<totalslices; i++) { /* loop over full slices */ 424 PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 425 PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 426 427 vec_y = _mm256_setzero_pd(); 428 vec_y2 = _mm256_setzero_pd(); 429 430 /* last slice may have padding rows. Don't use vectorization. */ 431 if (i == totalslices-1 && (A->rmap->n & 0x07)) { 432 rows_left = A->rmap->n - 8*i; 433 for (r=0; r<rows_left; ++r) { 434 yval = (MatScalar)0; 435 row = 8*i + r; 436 nnz_in_row = a->rlen[row]; 437 for (j=0; j<nnz_in_row; ++j) yval += aval[8*j + r] * x[acolidx[8*j + r]]; 438 y[row] = yval; 439 } 440 break; 441 } 442 443 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 444 #pragma novector 445 #pragma unroll(2) 446 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) { 447 vec_vals = _mm256_loadu_pd(aval); 448 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 449 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 450 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,0); 451 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 452 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 453 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,1); 454 vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y); 455 aval += 4; 456 457 vec_vals = _mm256_loadu_pd(aval); 458 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 459 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 460 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,0); 461 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 462 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 463 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,1); 464 vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y2); 465 aval += 4; 466 } 467 468 _mm256_storeu_pd(y + i*8, vec_y); 469 _mm256_storeu_pd(y + i*8 + 4, vec_y2); 470 } 471 #else 472 for (i=0; i<totalslices; i++) { /* loop over slices */ 473 for (j=0; j<8; j++) sum[j] = 0.0; 474 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) { 475 sum[0] += aval[j] * x[acolidx[j]]; 476 sum[1] += aval[j+1] * x[acolidx[j+1]]; 477 sum[2] += aval[j+2] * x[acolidx[j+2]]; 478 sum[3] += aval[j+3] * x[acolidx[j+3]]; 479 sum[4] += aval[j+4] * x[acolidx[j+4]]; 480 sum[5] += aval[j+5] * x[acolidx[j+5]]; 481 sum[6] += aval[j+6] * x[acolidx[j+6]]; 482 sum[7] += aval[j+7] * x[acolidx[j+7]]; 483 } 484 if (i == totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */ 485 for(j=0; j<(A->rmap->n & 0x07); j++) y[8*i+j] = sum[j]; 486 } else { 487 for(j=0; j<8; j++) y[8*i+j] = sum[j]; 488 } 489 } 490 #endif 491 492 ierr = PetscLogFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr); /* theoretical minimal FLOPs */ 493 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 494 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 495 PetscFunctionReturn(0); 496 } 497 498 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h> 499 PetscErrorCode MatMultAdd_SeqSELL(Mat A,Vec xx,Vec yy,Vec zz) 500 { 501 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 502 PetscScalar *y,*z; 503 const PetscScalar *x; 504 const MatScalar *aval=a->val; 505 PetscInt totalslices=a->totalslices; 506 const PetscInt *acolidx=a->colidx; 507 PetscInt i,j; 508 PetscErrorCode ierr; 509 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 510 __m512d vec_x,vec_y,vec_vals; 511 __m256i vec_idx; 512 __mmask8 mask; 513 __m512d vec_x2,vec_y2,vec_vals2,vec_x3,vec_y3,vec_vals3,vec_x4,vec_y4,vec_vals4; 514 __m256i vec_idx2,vec_idx3,vec_idx4; 515 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 516 __m128d vec_x_tmp; 517 __m256d vec_x,vec_y,vec_y2,vec_vals; 518 MatScalar yval; 519 PetscInt r,row,nnz_in_row; 520 #else 521 PetscScalar sum[8]; 522 #endif 523 524 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 525 #pragma disjoint(*x,*y,*aval) 526 #endif 527 528 PetscFunctionBegin; 529 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 530 ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr); 531 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 532 for (i=0; i<totalslices; i++) { /* loop over slices */ 533 PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 534 PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 535 536 if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 537 mask = (__mmask8)(0xff >> (8-(A->rmap->n & 0x07))); 538 vec_y = _mm512_mask_loadu_pd(vec_y,mask,&y[8*i]); 539 } else { 540 vec_y = _mm512_loadu_pd(&y[8*i]); 541 } 542 vec_y2 = _mm512_setzero_pd(); 543 vec_y3 = _mm512_setzero_pd(); 544 vec_y4 = _mm512_setzero_pd(); 545 546 j = a->sliidx[i]>>3; /* 8 bytes are read at each time, corresponding to a slice columnn */ 547 switch ((a->sliidx[i+1]-a->sliidx[i])/8 & 3) { 548 case 3: 549 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 550 acolidx += 8; aval += 8; 551 AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2); 552 acolidx += 8; aval += 8; 553 AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3); 554 acolidx += 8; aval += 8; 555 j += 3; 556 break; 557 case 2: 558 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 559 acolidx += 8; aval += 8; 560 AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2); 561 acolidx += 8; aval += 8; 562 j += 2; 563 break; 564 case 1: 565 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 566 acolidx += 8; aval += 8; 567 j += 1; 568 break; 569 } 570 #pragma novector 571 for (; j<(a->sliidx[i+1]>>3); j+=4) { 572 AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y); 573 acolidx += 8; aval += 8; 574 AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2); 575 acolidx += 8; aval += 8; 576 AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3); 577 acolidx += 8; aval += 8; 578 AVX512_Mult_Private(vec_idx4,vec_x4,vec_vals4,vec_y4); 579 acolidx += 8; aval += 8; 580 } 581 582 vec_y = _mm512_add_pd(vec_y,vec_y2); 583 vec_y = _mm512_add_pd(vec_y,vec_y3); 584 vec_y = _mm512_add_pd(vec_y,vec_y4); 585 if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 586 _mm512_mask_storeu_pd(&z[8*i],mask,vec_y); 587 } else { 588 _mm512_storeu_pd(&z[8*i],vec_y); 589 } 590 } 591 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 592 for (i=0; i<totalslices; i++) { /* loop over full slices */ 593 PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 594 PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0); 595 596 /* last slice may have padding rows. Don't use vectorization. */ 597 if (i == totalslices-1 && (A->rmap->n & 0x07)) { 598 for (r=0; r<(A->rmap->n & 0x07); ++r) { 599 row = 8*i + r; 600 yval = (MatScalar)0.0; 601 nnz_in_row = a->rlen[row]; 602 for (j=0; j<nnz_in_row; ++j) yval += aval[8*j+r] * x[acolidx[8*j+r]]; 603 z[row] = y[row] + yval; 604 } 605 break; 606 } 607 608 vec_y = _mm256_loadu_pd(y+8*i); 609 vec_y2 = _mm256_loadu_pd(y+8*i+4); 610 611 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 612 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) { 613 vec_vals = _mm256_loadu_pd(aval); 614 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 615 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 616 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,0); 617 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 618 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 619 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,1); 620 vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y); 621 aval += 4; 622 623 vec_vals = _mm256_loadu_pd(aval); 624 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 625 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 626 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,0); 627 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 628 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 629 vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,1); 630 vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y2); 631 aval += 4; 632 } 633 634 _mm256_storeu_pd(z+i*8,vec_y); 635 _mm256_storeu_pd(z+i*8+4,vec_y2); 636 } 637 #else 638 for (i=0; i<totalslices; i++) { /* loop over slices */ 639 for (j=0; j<8; j++) sum[j] = 0.0; 640 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) { 641 sum[0] += aval[j] * x[acolidx[j]]; 642 sum[1] += aval[j+1] * x[acolidx[j+1]]; 643 sum[2] += aval[j+2] * x[acolidx[j+2]]; 644 sum[3] += aval[j+3] * x[acolidx[j+3]]; 645 sum[4] += aval[j+4] * x[acolidx[j+4]]; 646 sum[5] += aval[j+5] * x[acolidx[j+5]]; 647 sum[6] += aval[j+6] * x[acolidx[j+6]]; 648 sum[7] += aval[j+7] * x[acolidx[j+7]]; 649 } 650 if (i == totalslices-1 && (A->rmap->n & 0x07)) { 651 for (j=0; j<(A->rmap->n & 0x07); j++) z[8*i+j] = y[8*i+j] + sum[j]; 652 } else { 653 for (j=0; j<8; j++) z[8*i+j] = y[8*i+j] + sum[j]; 654 } 655 } 656 #endif 657 658 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 659 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 660 ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr); 661 PetscFunctionReturn(0); 662 } 663 664 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A,Vec xx,Vec zz,Vec yy) 665 { 666 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 667 PetscScalar *y; 668 const PetscScalar *x; 669 const MatScalar *aval=a->val; 670 const PetscInt *acolidx=a->colidx; 671 PetscInt i,j,r,row,nnz_in_row,totalslices=a->totalslices; 672 PetscErrorCode ierr; 673 674 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 675 #pragma disjoint(*x,*y,*aval) 676 #endif 677 678 PetscFunctionBegin; 679 if (A->symmetric) { 680 ierr = MatMultAdd_SeqSELL(A,xx,zz,yy);CHKERRQ(ierr); 681 PetscFunctionReturn(0); 682 } 683 if (zz != yy) { ierr = VecCopy(zz,yy);CHKERRQ(ierr); } 684 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 685 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 686 for (i=0; i<a->totalslices; i++) { /* loop over slices */ 687 if (i == totalslices-1 && (A->rmap->n & 0x07)) { 688 for (r=0; r<(A->rmap->n & 0x07); ++r) { 689 row = 8*i + r; 690 nnz_in_row = a->rlen[row]; 691 for (j=0; j<nnz_in_row; ++j) y[acolidx[8*j+r]] += aval[8*j+r] * x[row]; 692 } 693 break; 694 } 695 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) { 696 y[acolidx[j]] += aval[j] * x[8*i]; 697 y[acolidx[j+1]] += aval[j+1] * x[8*i+1]; 698 y[acolidx[j+2]] += aval[j+2] * x[8*i+2]; 699 y[acolidx[j+3]] += aval[j+3] * x[8*i+3]; 700 y[acolidx[j+4]] += aval[j+4] * x[8*i+4]; 701 y[acolidx[j+5]] += aval[j+5] * x[8*i+5]; 702 y[acolidx[j+6]] += aval[j+6] * x[8*i+6]; 703 y[acolidx[j+7]] += aval[j+7] * x[8*i+7]; 704 } 705 } 706 ierr = PetscLogFlops(2.0*a->sliidx[a->totalslices]);CHKERRQ(ierr); 707 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 708 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 709 PetscFunctionReturn(0); 710 } 711 712 PetscErrorCode MatMultTranspose_SeqSELL(Mat A,Vec xx,Vec yy) 713 { 714 PetscErrorCode ierr; 715 716 PetscFunctionBegin; 717 if (A->symmetric) { 718 ierr = MatMult_SeqSELL(A,xx,yy);CHKERRQ(ierr); 719 } else { 720 ierr = VecSet(yy,0.0);CHKERRQ(ierr); 721 ierr = MatMultTransposeAdd_SeqSELL(A,xx,yy,yy);CHKERRQ(ierr); 722 } 723 PetscFunctionReturn(0); 724 } 725 726 /* 727 Checks for missing diagonals 728 */ 729 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A,PetscBool *missing,PetscInt *d) 730 { 731 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 732 PetscInt *diag,i; 733 PetscErrorCode ierr; 734 735 PetscFunctionBegin; 736 *missing = PETSC_FALSE; 737 if (A->rmap->n > 0 && !(a->colidx)) { 738 *missing = PETSC_TRUE; 739 if (d) *d = 0; 740 ierr = PetscInfo(A,"Matrix has no entries therefore is missing diagonal\n");CHKERRQ(ierr); 741 } else { 742 diag = a->diag; 743 for (i=0; i<A->rmap->n; i++) { 744 if (diag[i] == -1) { 745 *missing = PETSC_TRUE; 746 if (d) *d = i; 747 ierr = PetscInfo1(A,"Matrix is missing diagonal number %D\n",i);CHKERRQ(ierr); 748 break; 749 } 750 } 751 } 752 PetscFunctionReturn(0); 753 } 754 755 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A) 756 { 757 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 758 PetscInt i,j,m=A->rmap->n,shift; 759 PetscErrorCode ierr; 760 761 PetscFunctionBegin; 762 if (!a->diag) { 763 ierr = PetscMalloc1(m,&a->diag);CHKERRQ(ierr); 764 ierr = PetscLogObjectMemory((PetscObject)A,m*sizeof(PetscInt));CHKERRQ(ierr); 765 a->free_diag = PETSC_TRUE; 766 } 767 for (i=0; i<m; i++) { /* loop over rows */ 768 shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */ 769 a->diag[i] = -1; 770 for (j=0; j<a->rlen[i]; j++) { 771 if (a->colidx[shift+j*8] == i) { 772 a->diag[i] = shift+j*8; 773 break; 774 } 775 } 776 } 777 PetscFunctionReturn(0); 778 } 779 780 /* 781 Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways 782 */ 783 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A,PetscScalar omega,PetscScalar fshift) 784 { 785 Mat_SeqSELL *a=(Mat_SeqSELL*) A->data; 786 PetscInt i,*diag,m = A->rmap->n; 787 MatScalar *val = a->val; 788 PetscScalar *idiag,*mdiag; 789 PetscErrorCode ierr; 790 791 PetscFunctionBegin; 792 if (a->idiagvalid) PetscFunctionReturn(0); 793 ierr = MatMarkDiagonal_SeqSELL(A);CHKERRQ(ierr); 794 diag = a->diag; 795 if (!a->idiag) { 796 ierr = PetscMalloc3(m,&a->idiag,m,&a->mdiag,m,&a->ssor_work);CHKERRQ(ierr); 797 ierr = PetscLogObjectMemory((PetscObject)A, 3*m*sizeof(PetscScalar));CHKERRQ(ierr); 798 val = a->val; 799 } 800 mdiag = a->mdiag; 801 idiag = a->idiag; 802 803 if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) { 804 for (i=0; i<m; i++) { 805 mdiag[i] = val[diag[i]]; 806 if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */ 807 if (PetscRealPart(fshift)) { 808 ierr = PetscInfo1(A,"Zero diagonal on row %D\n",i);CHKERRQ(ierr); 809 A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 810 A->factorerror_zeropivot_value = 0.0; 811 A->factorerror_zeropivot_row = i; 812 } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Zero diagonal on row %D",i); 813 } 814 idiag[i] = 1.0/val[diag[i]]; 815 } 816 ierr = PetscLogFlops(m);CHKERRQ(ierr); 817 } else { 818 for (i=0; i<m; i++) { 819 mdiag[i] = val[diag[i]]; 820 idiag[i] = omega/(fshift + val[diag[i]]); 821 } 822 ierr = PetscLogFlops(2.0*m);CHKERRQ(ierr); 823 } 824 a->idiagvalid = PETSC_TRUE; 825 PetscFunctionReturn(0); 826 } 827 828 PetscErrorCode MatZeroEntries_SeqSELL(Mat A) 829 { 830 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 831 PetscErrorCode ierr; 832 833 PetscFunctionBegin; 834 ierr = PetscArrayzero(a->val,a->sliidx[a->totalslices]);CHKERRQ(ierr); 835 ierr = MatSeqSELLInvalidateDiagonal(A);CHKERRQ(ierr); 836 PetscFunctionReturn(0); 837 } 838 839 PetscErrorCode MatDestroy_SeqSELL(Mat A) 840 { 841 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 842 PetscErrorCode ierr; 843 844 PetscFunctionBegin; 845 #if defined(PETSC_USE_LOG) 846 PetscLogObjectState((PetscObject)A,"Rows=%D, Cols=%D, NZ=%D",A->rmap->n,A->cmap->n,a->nz); 847 #endif 848 ierr = MatSeqXSELLFreeSELL(A,&a->val,&a->colidx);CHKERRQ(ierr); 849 ierr = ISDestroy(&a->row);CHKERRQ(ierr); 850 ierr = ISDestroy(&a->col);CHKERRQ(ierr); 851 ierr = PetscFree(a->diag);CHKERRQ(ierr); 852 ierr = PetscFree(a->rlen);CHKERRQ(ierr); 853 ierr = PetscFree(a->sliidx);CHKERRQ(ierr); 854 ierr = PetscFree3(a->idiag,a->mdiag,a->ssor_work);CHKERRQ(ierr); 855 ierr = PetscFree(a->solve_work);CHKERRQ(ierr); 856 ierr = ISDestroy(&a->icol);CHKERRQ(ierr); 857 ierr = PetscFree(a->saved_values);CHKERRQ(ierr); 858 ierr = PetscFree2(a->getrowcols,a->getrowvals);CHKERRQ(ierr); 859 860 ierr = PetscFree(A->data);CHKERRQ(ierr); 861 862 ierr = PetscObjectChangeTypeName((PetscObject)A,0);CHKERRQ(ierr); 863 ierr = PetscObjectComposeFunction((PetscObject)A,"MatStoreValues_C",NULL);CHKERRQ(ierr); 864 ierr = PetscObjectComposeFunction((PetscObject)A,"MatRetrieveValues_C",NULL);CHKERRQ(ierr); 865 #if defined(PETSC_HAVE_ELEMENTAL) 866 #endif 867 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqSELLSetPreallocation_C",NULL);CHKERRQ(ierr); 868 PetscFunctionReturn(0); 869 } 870 871 PetscErrorCode MatSetOption_SeqSELL(Mat A,MatOption op,PetscBool flg) 872 { 873 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 874 PetscErrorCode ierr; 875 876 PetscFunctionBegin; 877 switch (op) { 878 case MAT_ROW_ORIENTED: 879 a->roworiented = flg; 880 break; 881 case MAT_KEEP_NONZERO_PATTERN: 882 a->keepnonzeropattern = flg; 883 break; 884 case MAT_NEW_NONZERO_LOCATIONS: 885 a->nonew = (flg ? 0 : 1); 886 break; 887 case MAT_NEW_NONZERO_LOCATION_ERR: 888 a->nonew = (flg ? -1 : 0); 889 break; 890 case MAT_NEW_NONZERO_ALLOCATION_ERR: 891 a->nonew = (flg ? -2 : 0); 892 break; 893 case MAT_UNUSED_NONZERO_LOCATION_ERR: 894 a->nounused = (flg ? -1 : 0); 895 break; 896 case MAT_NEW_DIAGONALS: 897 case MAT_IGNORE_OFF_PROC_ENTRIES: 898 case MAT_USE_HASH_TABLE: 899 case MAT_SORTED_FULL: 900 ierr = PetscInfo1(A,"Option %s ignored\n",MatOptions[op]);CHKERRQ(ierr); 901 break; 902 case MAT_SPD: 903 case MAT_SYMMETRIC: 904 case MAT_STRUCTURALLY_SYMMETRIC: 905 case MAT_HERMITIAN: 906 case MAT_SYMMETRY_ETERNAL: 907 /* These options are handled directly by MatSetOption() */ 908 break; 909 default: 910 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unknown option %d",op); 911 } 912 PetscFunctionReturn(0); 913 } 914 915 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A,Vec v) 916 { 917 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 918 PetscInt i,j,n,shift; 919 PetscScalar *x,zero=0.0; 920 PetscErrorCode ierr; 921 922 PetscFunctionBegin; 923 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 924 if (n != A->rmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 925 926 if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) { 927 PetscInt *diag=a->diag; 928 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 929 for (i=0; i<n; i++) x[i] = 1.0/a->val[diag[i]]; 930 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 931 PetscFunctionReturn(0); 932 } 933 934 ierr = VecSet(v,zero);CHKERRQ(ierr); 935 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 936 for (i=0; i<n; i++) { /* loop over rows */ 937 shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */ 938 x[i] = 0; 939 for (j=0; j<a->rlen[i]; j++) { 940 if (a->colidx[shift+j*8] == i) { 941 x[i] = a->val[shift+j*8]; 942 break; 943 } 944 } 945 } 946 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 947 PetscFunctionReturn(0); 948 } 949 950 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A,Vec ll,Vec rr) 951 { 952 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 953 const PetscScalar *l,*r; 954 PetscInt i,j,m,n,row; 955 PetscErrorCode ierr; 956 957 PetscFunctionBegin; 958 if (ll) { 959 /* The local size is used so that VecMPI can be passed to this routine 960 by MatDiagonalScale_MPISELL */ 961 ierr = VecGetLocalSize(ll,&m);CHKERRQ(ierr); 962 if (m != A->rmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 963 ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr); 964 for (i=0; i<a->totalslices; i++) { /* loop over slices */ 965 if (i == a->totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */ 966 for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) { 967 if (row < (A->rmap->n & 0x07)) a->val[j] *= l[8*i+row]; 968 } 969 } else { 970 for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) { 971 a->val[j] *= l[8*i+row]; 972 } 973 } 974 } 975 ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr); 976 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 977 } 978 if (rr) { 979 ierr = VecGetLocalSize(rr,&n);CHKERRQ(ierr); 980 if (n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length"); 981 ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr); 982 for (i=0; i<a->totalslices; i++) { /* loop over slices */ 983 if (i == a->totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */ 984 for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) { 985 if (row < (A->rmap->n & 0x07)) a->val[j] *= r[a->colidx[j]]; 986 } 987 } else { 988 for (j=a->sliidx[i]; j<a->sliidx[i+1]; j++) { 989 a->val[j] *= r[a->colidx[j]]; 990 } 991 } 992 } 993 ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr); 994 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 995 } 996 ierr = MatSeqSELLInvalidateDiagonal(A);CHKERRQ(ierr); 997 PetscFunctionReturn(0); 998 } 999 1000 extern PetscErrorCode MatSetValues_SeqSELL(Mat,PetscInt,const PetscInt[],PetscInt,const PetscInt[],const PetscScalar[],InsertMode); 1001 1002 PetscErrorCode MatGetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],PetscScalar v[]) 1003 { 1004 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1005 PetscInt *cp,i,k,low,high,t,row,col,l; 1006 PetscInt shift; 1007 MatScalar *vp; 1008 1009 PetscFunctionBegin; 1010 for (k=0; k<m; k++) { /* loop over requested rows */ 1011 row = im[k]; 1012 if (row<0) continue; 1013 #if defined(PETSC_USE_DEBUG) 1014 if (row >= A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row too large: row %D max %D",row,A->rmap->n-1); 1015 #endif 1016 shift = a->sliidx[row>>3]+(row&0x07); /* starting index of the row */ 1017 cp = a->colidx+shift; /* pointer to the row */ 1018 vp = a->val+shift; /* pointer to the row */ 1019 for (l=0; l<n; l++) { /* loop over requested columns */ 1020 col = in[l]; 1021 if (col<0) continue; 1022 #if defined(PETSC_USE_DEBUG) 1023 if (col >= A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Column too large: row %D max %D",col,A->cmap->n-1); 1024 #endif 1025 high = a->rlen[row]; low = 0; /* assume unsorted */ 1026 while (high-low > 5) { 1027 t = (low+high)/2; 1028 if (*(cp+t*8) > col) high = t; 1029 else low = t; 1030 } 1031 for (i=low; i<high; i++) { 1032 if (*(cp+8*i) > col) break; 1033 if (*(cp+8*i) == col) { 1034 *v++ = *(vp+8*i); 1035 goto finished; 1036 } 1037 } 1038 *v++ = 0.0; 1039 finished:; 1040 } 1041 } 1042 PetscFunctionReturn(0); 1043 } 1044 1045 PetscErrorCode MatView_SeqSELL_ASCII(Mat A,PetscViewer viewer) 1046 { 1047 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1048 PetscInt i,j,m=A->rmap->n,shift; 1049 const char *name; 1050 PetscViewerFormat format; 1051 PetscErrorCode ierr; 1052 1053 PetscFunctionBegin; 1054 ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr); 1055 if (format == PETSC_VIEWER_ASCII_MATLAB) { 1056 PetscInt nofinalvalue = 0; 1057 /* 1058 if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) { 1059 nofinalvalue = 1; 1060 } 1061 */ 1062 ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr); 1063 ierr = PetscViewerASCIIPrintf(viewer,"%% Size = %D %D \n",m,A->cmap->n);CHKERRQ(ierr); 1064 ierr = PetscViewerASCIIPrintf(viewer,"%% Nonzeros = %D \n",a->nz);CHKERRQ(ierr); 1065 #if defined(PETSC_USE_COMPLEX) 1066 ierr = PetscViewerASCIIPrintf(viewer,"zzz = zeros(%D,4);\n",a->nz+nofinalvalue);CHKERRQ(ierr); 1067 #else 1068 ierr = PetscViewerASCIIPrintf(viewer,"zzz = zeros(%D,3);\n",a->nz+nofinalvalue);CHKERRQ(ierr); 1069 #endif 1070 ierr = PetscViewerASCIIPrintf(viewer,"zzz = [\n");CHKERRQ(ierr); 1071 1072 for (i=0; i<m; i++) { 1073 shift = a->sliidx[i>>3]+(i&0x07); 1074 for (j=0; j<a->rlen[i]; j++) { 1075 #if defined(PETSC_USE_COMPLEX) 1076 ierr = PetscViewerASCIIPrintf(viewer,"%D %D %18.16e %18.16e\n",i+1,a->colidx[shift+8*j]+1,(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr); 1077 #else 1078 ierr = PetscViewerASCIIPrintf(viewer,"%D %D %18.16e\n",i+1,a->colidx[shift+8*j]+1,(double)a->val[shift+8*j]);CHKERRQ(ierr); 1079 #endif 1080 } 1081 } 1082 /* 1083 if (nofinalvalue) { 1084 #if defined(PETSC_USE_COMPLEX) 1085 ierr = PetscViewerASCIIPrintf(viewer,"%D %D %18.16e %18.16e\n",m,A->cmap->n,0.,0.);CHKERRQ(ierr); 1086 #else 1087 ierr = PetscViewerASCIIPrintf(viewer,"%D %D %18.16e\n",m,A->cmap->n,0.0);CHKERRQ(ierr); 1088 #endif 1089 } 1090 */ 1091 ierr = PetscObjectGetName((PetscObject)A,&name);CHKERRQ(ierr); 1092 ierr = PetscViewerASCIIPrintf(viewer,"];\n %s = spconvert(zzz);\n",name);CHKERRQ(ierr); 1093 ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr); 1094 } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) { 1095 PetscFunctionReturn(0); 1096 } else if (format == PETSC_VIEWER_ASCII_COMMON) { 1097 ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr); 1098 for (i=0; i<m; i++) { 1099 ierr = PetscViewerASCIIPrintf(viewer,"row %D:",i);CHKERRQ(ierr); 1100 shift = a->sliidx[i>>3]+(i&0x07); 1101 for (j=0; j<a->rlen[i]; j++) { 1102 #if defined(PETSC_USE_COMPLEX) 1103 if (PetscImaginaryPart(a->val[shift+8*j]) > 0.0 && PetscRealPart(a->val[shift+8*j]) != 0.0) { 1104 ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr); 1105 } else if (PetscImaginaryPart(a->val[shift+8*j]) < 0.0 && PetscRealPart(a->val[shift+8*j]) != 0.0) { 1106 ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)-PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr); 1107 } else if (PetscRealPart(a->val[shift+8*j]) != 0.0) { 1108 ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]));CHKERRQ(ierr); 1109 } 1110 #else 1111 if (a->val[shift+8*j] != 0.0) {ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[shift+8*j],(double)a->val[shift+8*j]);CHKERRQ(ierr);} 1112 #endif 1113 } 1114 ierr = PetscViewerASCIIPrintf(viewer,"\n");CHKERRQ(ierr); 1115 } 1116 ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr); 1117 } else if (format == PETSC_VIEWER_ASCII_DENSE) { 1118 PetscInt cnt=0,jcnt; 1119 PetscScalar value; 1120 #if defined(PETSC_USE_COMPLEX) 1121 PetscBool realonly=PETSC_TRUE; 1122 for (i=0; i<a->sliidx[a->totalslices]; i++) { 1123 if (PetscImaginaryPart(a->val[i]) != 0.0) { 1124 realonly = PETSC_FALSE; 1125 break; 1126 } 1127 } 1128 #endif 1129 1130 ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr); 1131 for (i=0; i<m; i++) { 1132 jcnt = 0; 1133 shift = a->sliidx[i>>3]+(i&0x07); 1134 for (j=0; j<A->cmap->n; j++) { 1135 if (jcnt < a->rlen[i] && j == a->colidx[shift+8*j]) { 1136 value = a->val[cnt++]; 1137 jcnt++; 1138 } else { 1139 value = 0.0; 1140 } 1141 #if defined(PETSC_USE_COMPLEX) 1142 if (realonly) { 1143 ierr = PetscViewerASCIIPrintf(viewer," %7.5e ",(double)PetscRealPart(value));CHKERRQ(ierr); 1144 } else { 1145 ierr = PetscViewerASCIIPrintf(viewer," %7.5e+%7.5e i ",(double)PetscRealPart(value),(double)PetscImaginaryPart(value));CHKERRQ(ierr); 1146 } 1147 #else 1148 ierr = PetscViewerASCIIPrintf(viewer," %7.5e ",(double)value);CHKERRQ(ierr); 1149 #endif 1150 } 1151 ierr = PetscViewerASCIIPrintf(viewer,"\n");CHKERRQ(ierr); 1152 } 1153 ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr); 1154 } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) { 1155 PetscInt fshift=1; 1156 ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr); 1157 #if defined(PETSC_USE_COMPLEX) 1158 ierr = PetscViewerASCIIPrintf(viewer,"%%%%MatrixMarket matrix coordinate complex general\n");CHKERRQ(ierr); 1159 #else 1160 ierr = PetscViewerASCIIPrintf(viewer,"%%%%MatrixMarket matrix coordinate real general\n");CHKERRQ(ierr); 1161 #endif 1162 ierr = PetscViewerASCIIPrintf(viewer,"%D %D %D\n", m, A->cmap->n, a->nz);CHKERRQ(ierr); 1163 for (i=0; i<m; i++) { 1164 shift = a->sliidx[i>>3]+(i&0x07); 1165 for (j=0; j<a->rlen[i]; j++) { 1166 #if defined(PETSC_USE_COMPLEX) 1167 ierr = PetscViewerASCIIPrintf(viewer,"%D %D %g %g\n",i+fshift,a->colidx[shift+8*j]+fshift,(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr); 1168 #else 1169 ierr = PetscViewerASCIIPrintf(viewer,"%D %D %g\n",i+fshift,a->colidx[shift+8*j]+fshift,(double)a->val[shift+8*j]);CHKERRQ(ierr); 1170 #endif 1171 } 1172 } 1173 ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr); 1174 } else if (format == PETSC_VIEWER_NATIVE) { 1175 for (i=0; i<a->totalslices; i++) { /* loop over slices */ 1176 PetscInt row; 1177 ierr = PetscViewerASCIIPrintf(viewer,"slice %D: %D %D\n",i,a->sliidx[i],a->sliidx[i+1]);CHKERRQ(ierr); 1178 for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) { 1179 #if defined(PETSC_USE_COMPLEX) 1180 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1181 ierr = PetscViewerASCIIPrintf(viewer," %D %D %g + %g i\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]));CHKERRQ(ierr); 1182 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1183 ierr = PetscViewerASCIIPrintf(viewer," %D %D %g - %g i\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]),-(double)PetscImaginaryPart(a->val[j]));CHKERRQ(ierr); 1184 } else { 1185 ierr = PetscViewerASCIIPrintf(viewer," %D %D %g\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]));CHKERRQ(ierr); 1186 } 1187 #else 1188 ierr = PetscViewerASCIIPrintf(viewer," %D %D %g\n",8*i+row,a->colidx[j],(double)a->val[j]);CHKERRQ(ierr); 1189 #endif 1190 } 1191 } 1192 } else { 1193 ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr); 1194 if (A->factortype) { 1195 for (i=0; i<m; i++) { 1196 shift = a->sliidx[i>>3]+(i&0x07); 1197 ierr = PetscViewerASCIIPrintf(viewer,"row %D:",i);CHKERRQ(ierr); 1198 /* L part */ 1199 for (j=shift; j<a->diag[i]; j+=8) { 1200 #if defined(PETSC_USE_COMPLEX) 1201 if (PetscImaginaryPart(a->val[shift+8*j]) > 0.0) { 1202 ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]));CHKERRQ(ierr); 1203 } else if (PetscImaginaryPart(a->val[shift+8*j]) < 0.0) { 1204 ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)(-PetscImaginaryPart(a->val[j])));CHKERRQ(ierr); 1205 } else { 1206 ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)PetscRealPart(a->val[j]));CHKERRQ(ierr); 1207 } 1208 #else 1209 ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)a->val[j]);CHKERRQ(ierr); 1210 #endif 1211 } 1212 /* diagonal */ 1213 j = a->diag[i]; 1214 #if defined(PETSC_USE_COMPLEX) 1215 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1216 ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]),(double)PetscImaginaryPart(1.0/a->val[j]));CHKERRQ(ierr); 1217 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1218 ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]),(double)(-PetscImaginaryPart(1.0/a->val[j])));CHKERRQ(ierr); 1219 } else { 1220 ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]));CHKERRQ(ierr); 1221 } 1222 #else 1223 ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)(1.0/a->val[j]));CHKERRQ(ierr); 1224 #endif 1225 1226 /* U part */ 1227 for (j=a->diag[i]+1; j<shift+8*a->rlen[i]; j+=8) { 1228 #if defined(PETSC_USE_COMPLEX) 1229 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1230 ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]));CHKERRQ(ierr); 1231 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1232 ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)(-PetscImaginaryPart(a->val[j])));CHKERRQ(ierr); 1233 } else { 1234 ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)PetscRealPart(a->val[j]));CHKERRQ(ierr); 1235 } 1236 #else 1237 ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)a->val[j]);CHKERRQ(ierr); 1238 #endif 1239 } 1240 ierr = PetscViewerASCIIPrintf(viewer,"\n");CHKERRQ(ierr); 1241 } 1242 } else { 1243 for (i=0; i<m; i++) { 1244 shift = a->sliidx[i>>3]+(i&0x07); 1245 ierr = PetscViewerASCIIPrintf(viewer,"row %D:",i);CHKERRQ(ierr); 1246 for (j=0; j<a->rlen[i]; j++) { 1247 #if defined(PETSC_USE_COMPLEX) 1248 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1249 ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr); 1250 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1251 ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)-PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr); 1252 } else { 1253 ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]));CHKERRQ(ierr); 1254 } 1255 #else 1256 ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[shift+8*j],(double)a->val[shift+8*j]);CHKERRQ(ierr); 1257 #endif 1258 } 1259 ierr = PetscViewerASCIIPrintf(viewer,"\n");CHKERRQ(ierr); 1260 } 1261 } 1262 ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr); 1263 } 1264 ierr = PetscViewerFlush(viewer);CHKERRQ(ierr); 1265 PetscFunctionReturn(0); 1266 } 1267 1268 #include <petscdraw.h> 1269 PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw,void *Aa) 1270 { 1271 Mat A=(Mat)Aa; 1272 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1273 PetscInt i,j,m=A->rmap->n,shift; 1274 int color; 1275 PetscReal xl,yl,xr,yr,x_l,x_r,y_l,y_r; 1276 PetscViewer viewer; 1277 PetscViewerFormat format; 1278 PetscErrorCode ierr; 1279 1280 PetscFunctionBegin; 1281 ierr = PetscObjectQuery((PetscObject)A,"Zoomviewer",(PetscObject*)&viewer);CHKERRQ(ierr); 1282 ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr); 1283 ierr = PetscDrawGetCoordinates(draw,&xl,&yl,&xr,&yr);CHKERRQ(ierr); 1284 1285 /* loop over matrix elements drawing boxes */ 1286 1287 if (format != PETSC_VIEWER_DRAW_CONTOUR) { 1288 ierr = PetscDrawCollectiveBegin(draw);CHKERRQ(ierr); 1289 /* Blue for negative, Cyan for zero and Red for positive */ 1290 color = PETSC_DRAW_BLUE; 1291 for (i=0; i<m; i++) { 1292 shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */ 1293 y_l = m - i - 1.0; y_r = y_l + 1.0; 1294 for (j=0; j<a->rlen[i]; j++) { 1295 x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0; 1296 if (PetscRealPart(a->val[shift+8*j]) >= 0.) continue; 1297 ierr = PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);CHKERRQ(ierr); 1298 } 1299 } 1300 color = PETSC_DRAW_CYAN; 1301 for (i=0; i<m; i++) { 1302 shift = a->sliidx[i>>3]+(i&0x07); 1303 y_l = m - i - 1.0; y_r = y_l + 1.0; 1304 for (j=0; j<a->rlen[i]; j++) { 1305 x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0; 1306 if (a->val[shift+8*j] != 0.) continue; 1307 ierr = PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);CHKERRQ(ierr); 1308 } 1309 } 1310 color = PETSC_DRAW_RED; 1311 for (i=0; i<m; i++) { 1312 shift = a->sliidx[i>>3]+(i&0x07); 1313 y_l = m - i - 1.0; y_r = y_l + 1.0; 1314 for (j=0; j<a->rlen[i]; j++) { 1315 x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0; 1316 if (PetscRealPart(a->val[shift+8*j]) <= 0.) continue; 1317 ierr = PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);CHKERRQ(ierr); 1318 } 1319 } 1320 ierr = PetscDrawCollectiveEnd(draw);CHKERRQ(ierr); 1321 } else { 1322 /* use contour shading to indicate magnitude of values */ 1323 /* first determine max of all nonzero values */ 1324 PetscReal minv=0.0,maxv=0.0; 1325 PetscInt count=0; 1326 PetscDraw popup; 1327 for (i=0; i<a->sliidx[a->totalslices]; i++) { 1328 if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]); 1329 } 1330 if (minv >= maxv) maxv = minv + PETSC_SMALL; 1331 ierr = PetscDrawGetPopup(draw,&popup);CHKERRQ(ierr); 1332 ierr = PetscDrawScalePopup(popup,minv,maxv);CHKERRQ(ierr); 1333 1334 ierr = PetscDrawCollectiveBegin(draw);CHKERRQ(ierr); 1335 for (i=0; i<m; i++) { 1336 shift = a->sliidx[i>>3]+(i&0x07); 1337 y_l = m - i - 1.0; 1338 y_r = y_l + 1.0; 1339 for (j=0; j<a->rlen[i]; j++) { 1340 x_l = a->colidx[shift+j*8]; 1341 x_r = x_l + 1.0; 1342 color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]),minv,maxv); 1343 ierr = PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);CHKERRQ(ierr); 1344 count++; 1345 } 1346 } 1347 ierr = PetscDrawCollectiveEnd(draw);CHKERRQ(ierr); 1348 } 1349 PetscFunctionReturn(0); 1350 } 1351 1352 #include <petscdraw.h> 1353 PetscErrorCode MatView_SeqSELL_Draw(Mat A,PetscViewer viewer) 1354 { 1355 PetscDraw draw; 1356 PetscReal xr,yr,xl,yl,h,w; 1357 PetscBool isnull; 1358 PetscErrorCode ierr; 1359 1360 PetscFunctionBegin; 1361 ierr = PetscViewerDrawGetDraw(viewer,0,&draw);CHKERRQ(ierr); 1362 ierr = PetscDrawIsNull(draw,&isnull);CHKERRQ(ierr); 1363 if (isnull) PetscFunctionReturn(0); 1364 1365 xr = A->cmap->n; yr = A->rmap->n; h = yr/10.0; w = xr/10.0; 1366 xr += w; yr += h; xl = -w; yl = -h; 1367 ierr = PetscDrawSetCoordinates(draw,xl,yl,xr,yr);CHKERRQ(ierr); 1368 ierr = PetscObjectCompose((PetscObject)A,"Zoomviewer",(PetscObject)viewer);CHKERRQ(ierr); 1369 ierr = PetscDrawZoom(draw,MatView_SeqSELL_Draw_Zoom,A);CHKERRQ(ierr); 1370 ierr = PetscObjectCompose((PetscObject)A,"Zoomviewer",NULL);CHKERRQ(ierr); 1371 ierr = PetscDrawSave(draw);CHKERRQ(ierr); 1372 PetscFunctionReturn(0); 1373 } 1374 1375 PetscErrorCode MatView_SeqSELL(Mat A,PetscViewer viewer) 1376 { 1377 PetscBool iascii,isbinary,isdraw; 1378 PetscErrorCode ierr; 1379 1380 PetscFunctionBegin; 1381 ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);CHKERRQ(ierr); 1382 ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERBINARY,&isbinary);CHKERRQ(ierr); 1383 ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERDRAW,&isdraw);CHKERRQ(ierr); 1384 if (iascii) { 1385 ierr = MatView_SeqSELL_ASCII(A,viewer);CHKERRQ(ierr); 1386 } else if (isbinary) { 1387 /* ierr = MatView_SeqSELL_Binary(A,viewer);CHKERRQ(ierr); */ 1388 } else if (isdraw) { 1389 ierr = MatView_SeqSELL_Draw(A,viewer);CHKERRQ(ierr); 1390 } 1391 PetscFunctionReturn(0); 1392 } 1393 1394 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A,MatAssemblyType mode) 1395 { 1396 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1397 PetscInt i,shift,row_in_slice,row,nrow,*cp,lastcol,j,k; 1398 MatScalar *vp; 1399 PetscErrorCode ierr; 1400 1401 PetscFunctionBegin; 1402 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 1403 /* To do: compress out the unused elements */ 1404 ierr = MatMarkDiagonal_SeqSELL(A);CHKERRQ(ierr); 1405 ierr = PetscInfo6(A,"Matrix size: %D X %D; storage space: %D allocated %D used (%D nonzeros+%D paddedzeros)\n",A->rmap->n,A->cmap->n,a->maxallocmat,a->sliidx[a->totalslices],a->nz,a->sliidx[a->totalslices]-a->nz);CHKERRQ(ierr); 1406 ierr = PetscInfo1(A,"Number of mallocs during MatSetValues() is %D\n",a->reallocs);CHKERRQ(ierr); 1407 ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rlenmax);CHKERRQ(ierr); 1408 /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */ 1409 for (i=0; i<a->totalslices; ++i) { 1410 shift = a->sliidx[i]; /* starting index of the slice */ 1411 cp = a->colidx+shift; /* pointer to the column indices of the slice */ 1412 vp = a->val+shift; /* pointer to the nonzero values of the slice */ 1413 for (row_in_slice=0; row_in_slice<8; ++row_in_slice) { /* loop over rows in the slice */ 1414 row = 8*i + row_in_slice; 1415 nrow = a->rlen[row]; /* number of nonzeros in row */ 1416 /* 1417 Search for the nearest nonzero. Normally setting the index to zero may cause extra communication. 1418 But if the entire slice are empty, it is fine to use 0 since the index will not be loaded. 1419 */ 1420 lastcol = 0; 1421 if (nrow>0) { /* nonempty row */ 1422 lastcol = cp[8*(nrow-1)+row_in_slice]; /* use the index from the last nonzero at current row */ 1423 } else if (!row_in_slice) { /* first row of the currect slice is empty */ 1424 for (j=1;j<8;j++) { 1425 if (a->rlen[8*i+j]) { 1426 lastcol = cp[j]; 1427 break; 1428 } 1429 } 1430 } else { 1431 if (a->sliidx[i+1] != shift) lastcol = cp[row_in_slice-1]; /* use the index from the previous row */ 1432 } 1433 1434 for (k=nrow; k<(a->sliidx[i+1]-shift)/8; ++k) { 1435 cp[8*k+row_in_slice] = lastcol; 1436 vp[8*k+row_in_slice] = (MatScalar)0; 1437 } 1438 } 1439 } 1440 1441 A->info.mallocs += a->reallocs; 1442 a->reallocs = 0; 1443 1444 ierr = MatSeqSELLInvalidateDiagonal(A);CHKERRQ(ierr); 1445 PetscFunctionReturn(0); 1446 } 1447 1448 PetscErrorCode MatGetInfo_SeqSELL(Mat A,MatInfoType flag,MatInfo *info) 1449 { 1450 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1451 1452 PetscFunctionBegin; 1453 info->block_size = 1.0; 1454 info->nz_allocated = a->maxallocmat; 1455 info->nz_used = a->sliidx[a->totalslices]; /* include padding zeros */ 1456 info->nz_unneeded = (a->maxallocmat-a->sliidx[a->totalslices]); 1457 info->assemblies = A->num_ass; 1458 info->mallocs = A->info.mallocs; 1459 info->memory = ((PetscObject)A)->mem; 1460 if (A->factortype) { 1461 info->fill_ratio_given = A->info.fill_ratio_given; 1462 info->fill_ratio_needed = A->info.fill_ratio_needed; 1463 info->factor_mallocs = A->info.factor_mallocs; 1464 } else { 1465 info->fill_ratio_given = 0; 1466 info->fill_ratio_needed = 0; 1467 info->factor_mallocs = 0; 1468 } 1469 PetscFunctionReturn(0); 1470 } 1471 1472 PetscErrorCode MatSetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is) 1473 { 1474 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1475 PetscInt shift,i,k,l,low,high,t,ii,row,col,nrow; 1476 PetscInt *cp,nonew=a->nonew,lastcol=-1; 1477 MatScalar *vp,value; 1478 PetscErrorCode ierr; 1479 1480 PetscFunctionBegin; 1481 for (k=0; k<m; k++) { /* loop over added rows */ 1482 row = im[k]; 1483 if (row < 0) continue; 1484 #if defined(PETSC_USE_DEBUG) 1485 if (row >= A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row too large: row %D max %D",row,A->rmap->n-1); 1486 #endif 1487 shift = a->sliidx[row>>3]+(row&0x07); /* starting index of the row */ 1488 cp = a->colidx+shift; /* pointer to the row */ 1489 vp = a->val+shift; /* pointer to the row */ 1490 nrow = a->rlen[row]; 1491 low = 0; 1492 high = nrow; 1493 1494 for (l=0; l<n; l++) { /* loop over added columns */ 1495 col = in[l]; 1496 if (col<0) continue; 1497 #if defined(PETSC_USE_DEBUG) 1498 if (col >= A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Col too large: row %D max %D",col,A->cmap->n-1); 1499 #endif 1500 if (a->roworiented) { 1501 value = v[l+k*n]; 1502 } else { 1503 value = v[k+l*m]; 1504 } 1505 if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue; 1506 1507 /* search in this row for the specified colmun, i indicates the column to be set */ 1508 if (col <= lastcol) low = 0; 1509 else high = nrow; 1510 lastcol = col; 1511 while (high-low > 5) { 1512 t = (low+high)/2; 1513 if (*(cp+t*8) > col) high = t; 1514 else low = t; 1515 } 1516 for (i=low; i<high; i++) { 1517 if (*(cp+i*8) > col) break; 1518 if (*(cp+i*8) == col) { 1519 if (is == ADD_VALUES) *(vp+i*8) += value; 1520 else *(vp+i*8) = value; 1521 low = i + 1; 1522 goto noinsert; 1523 } 1524 } 1525 if (value == 0.0 && a->ignorezeroentries) goto noinsert; 1526 if (nonew == 1) goto noinsert; 1527 if (nonew == -1) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Inserting a new nonzero (%D, %D) in the matrix", row, col); 1528 /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */ 1529 MatSeqXSELLReallocateSELL(A,A->rmap->n,1,nrow,a->sliidx,row/8,row,col,a->colidx,a->val,cp,vp,nonew,MatScalar); 1530 /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */ 1531 for (ii=nrow-1; ii>=i; ii--) { 1532 *(cp+(ii+1)*8) = *(cp+ii*8); 1533 *(vp+(ii+1)*8) = *(vp+ii*8); 1534 } 1535 a->rlen[row]++; 1536 *(cp+i*8) = col; 1537 *(vp+i*8) = value; 1538 a->nz++; 1539 A->nonzerostate++; 1540 low = i+1; high++; nrow++; 1541 noinsert:; 1542 } 1543 a->rlen[row] = nrow; 1544 } 1545 PetscFunctionReturn(0); 1546 } 1547 1548 PetscErrorCode MatCopy_SeqSELL(Mat A,Mat B,MatStructure str) 1549 { 1550 PetscErrorCode ierr; 1551 1552 PetscFunctionBegin; 1553 /* If the two matrices have the same copy implementation, use fast copy. */ 1554 if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) { 1555 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1556 Mat_SeqSELL *b=(Mat_SeqSELL*)B->data; 1557 1558 if (a->sliidx[a->totalslices] != b->sliidx[b->totalslices]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Number of nonzeros in two matrices are different"); 1559 ierr = PetscArraycpy(b->val,a->val,a->sliidx[a->totalslices]);CHKERRQ(ierr); 1560 } else { 1561 ierr = MatCopy_Basic(A,B,str);CHKERRQ(ierr); 1562 } 1563 PetscFunctionReturn(0); 1564 } 1565 1566 PetscErrorCode MatSetUp_SeqSELL(Mat A) 1567 { 1568 PetscErrorCode ierr; 1569 1570 PetscFunctionBegin; 1571 ierr = MatSeqSELLSetPreallocation(A,PETSC_DEFAULT,0);CHKERRQ(ierr); 1572 PetscFunctionReturn(0); 1573 } 1574 1575 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A,PetscScalar *array[]) 1576 { 1577 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1578 1579 PetscFunctionBegin; 1580 *array = a->val; 1581 PetscFunctionReturn(0); 1582 } 1583 1584 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A,PetscScalar *array[]) 1585 { 1586 PetscFunctionBegin; 1587 PetscFunctionReturn(0); 1588 } 1589 1590 PetscErrorCode MatRealPart_SeqSELL(Mat A) 1591 { 1592 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1593 PetscInt i; 1594 MatScalar *aval=a->val; 1595 1596 PetscFunctionBegin; 1597 for (i=0; i<a->sliidx[a->totalslices]; i++) aval[i]=PetscRealPart(aval[i]); 1598 PetscFunctionReturn(0); 1599 } 1600 1601 PetscErrorCode MatImaginaryPart_SeqSELL(Mat A) 1602 { 1603 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1604 PetscInt i; 1605 MatScalar *aval=a->val; 1606 PetscErrorCode ierr; 1607 1608 PetscFunctionBegin; 1609 for (i=0; i<a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]); 1610 ierr = MatSeqSELLInvalidateDiagonal(A);CHKERRQ(ierr); 1611 PetscFunctionReturn(0); 1612 } 1613 1614 PetscErrorCode MatScale_SeqSELL(Mat inA,PetscScalar alpha) 1615 { 1616 Mat_SeqSELL *a=(Mat_SeqSELL*)inA->data; 1617 MatScalar *aval=a->val; 1618 PetscScalar oalpha=alpha; 1619 PetscBLASInt one=1,size; 1620 PetscErrorCode ierr; 1621 1622 PetscFunctionBegin; 1623 ierr = PetscBLASIntCast(a->sliidx[a->totalslices],&size);CHKERRQ(ierr); 1624 PetscStackCallBLAS("BLASscal",BLASscal_(&size,&oalpha,aval,&one)); 1625 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 1626 ierr = MatSeqSELLInvalidateDiagonal(inA);CHKERRQ(ierr); 1627 PetscFunctionReturn(0); 1628 } 1629 1630 PetscErrorCode MatShift_SeqSELL(Mat Y,PetscScalar a) 1631 { 1632 Mat_SeqSELL *y=(Mat_SeqSELL*)Y->data; 1633 PetscErrorCode ierr; 1634 1635 PetscFunctionBegin; 1636 if (!Y->preallocated || !y->nz) { 1637 ierr = MatSeqSELLSetPreallocation(Y,1,NULL);CHKERRQ(ierr); 1638 } 1639 ierr = MatShift_Basic(Y,a);CHKERRQ(ierr); 1640 PetscFunctionReturn(0); 1641 } 1642 1643 PetscErrorCode MatSOR_SeqSELL(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 1644 { 1645 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 1646 PetscScalar *x,sum,*t; 1647 const MatScalar *idiag=0,*mdiag; 1648 const PetscScalar *b,*xb; 1649 PetscInt n,m=A->rmap->n,i,j,shift; 1650 const PetscInt *diag; 1651 PetscErrorCode ierr; 1652 1653 PetscFunctionBegin; 1654 its = its*lits; 1655 1656 if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */ 1657 if (!a->idiagvalid) {ierr = MatInvertDiagonal_SeqSELL(A,omega,fshift);CHKERRQ(ierr);} 1658 a->fshift = fshift; 1659 a->omega = omega; 1660 1661 diag = a->diag; 1662 t = a->ssor_work; 1663 idiag = a->idiag; 1664 mdiag = a->mdiag; 1665 1666 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1667 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1668 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 1669 if (flag == SOR_APPLY_UPPER) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"SOR_APPLY_UPPER is not implemented"); 1670 if (flag == SOR_APPLY_LOWER) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"SOR_APPLY_LOWER is not implemented"); 1671 if (flag & SOR_EISENSTAT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support yet for Eisenstat"); 1672 1673 if (flag & SOR_ZERO_INITIAL_GUESS) { 1674 if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) { 1675 for (i=0; i<m; i++) { 1676 shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */ 1677 sum = b[i]; 1678 n = (diag[i]-shift)/8; 1679 for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]]; 1680 t[i] = sum; 1681 x[i] = sum*idiag[i]; 1682 } 1683 xb = t; 1684 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 1685 } else xb = b; 1686 if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1687 for (i=m-1; i>=0; i--) { 1688 shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */ 1689 sum = xb[i]; 1690 n = a->rlen[i]-(diag[i]-shift)/8-1; 1691 for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]]; 1692 if (xb == b) { 1693 x[i] = sum*idiag[i]; 1694 } else { 1695 x[i] = (1.-omega)*x[i]+sum*idiag[i]; /* omega in idiag */ 1696 } 1697 } 1698 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); /* assumes 1/2 in upper */ 1699 } 1700 its--; 1701 } 1702 while (its--) { 1703 if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) { 1704 for (i=0; i<m; i++) { 1705 /* lower */ 1706 shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */ 1707 sum = b[i]; 1708 n = (diag[i]-shift)/8; 1709 for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]]; 1710 t[i] = sum; /* save application of the lower-triangular part */ 1711 /* upper */ 1712 n = a->rlen[i]-(diag[i]-shift)/8-1; 1713 for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]]; 1714 x[i] = (1.-omega)*x[i]+sum*idiag[i]; /* omega in idiag */ 1715 } 1716 xb = t; 1717 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1718 } else xb = b; 1719 if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1720 for (i=m-1; i>=0; i--) { 1721 shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */ 1722 sum = xb[i]; 1723 if (xb == b) { 1724 /* whole matrix (no checkpointing available) */ 1725 n = a->rlen[i]; 1726 for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]]; 1727 x[i] = (1.-omega)*x[i]+(sum+mdiag[i]*x[i])*idiag[i]; 1728 } else { /* lower-triangular part has been saved, so only apply upper-triangular */ 1729 n = a->rlen[i]-(diag[i]-shift)/8-1; 1730 for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]]; 1731 x[i] = (1.-omega)*x[i]+sum*idiag[i]; /* omega in idiag */ 1732 } 1733 } 1734 if (xb == b) { 1735 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1736 } else { 1737 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); /* assumes 1/2 in upper */ 1738 } 1739 } 1740 } 1741 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1742 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1743 PetscFunctionReturn(0); 1744 } 1745 1746 /* -------------------------------------------------------------------*/ 1747 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL, 1748 MatGetRow_SeqSELL, 1749 MatRestoreRow_SeqSELL, 1750 MatMult_SeqSELL, 1751 /* 4*/ MatMultAdd_SeqSELL, 1752 MatMultTranspose_SeqSELL, 1753 MatMultTransposeAdd_SeqSELL, 1754 0, 1755 0, 1756 0, 1757 /* 10*/ 0, 1758 0, 1759 0, 1760 MatSOR_SeqSELL, 1761 0, 1762 /* 15*/ MatGetInfo_SeqSELL, 1763 MatEqual_SeqSELL, 1764 MatGetDiagonal_SeqSELL, 1765 MatDiagonalScale_SeqSELL, 1766 0, 1767 /* 20*/ 0, 1768 MatAssemblyEnd_SeqSELL, 1769 MatSetOption_SeqSELL, 1770 MatZeroEntries_SeqSELL, 1771 /* 24*/ 0, 1772 0, 1773 0, 1774 0, 1775 0, 1776 /* 29*/ MatSetUp_SeqSELL, 1777 0, 1778 0, 1779 0, 1780 0, 1781 /* 34*/ MatDuplicate_SeqSELL, 1782 0, 1783 0, 1784 0, 1785 0, 1786 /* 39*/ 0, 1787 0, 1788 0, 1789 MatGetValues_SeqSELL, 1790 MatCopy_SeqSELL, 1791 /* 44*/ 0, 1792 MatScale_SeqSELL, 1793 MatShift_SeqSELL, 1794 0, 1795 0, 1796 /* 49*/ 0, 1797 0, 1798 0, 1799 0, 1800 0, 1801 /* 54*/ MatFDColoringCreate_SeqXAIJ, 1802 0, 1803 0, 1804 0, 1805 0, 1806 /* 59*/ 0, 1807 MatDestroy_SeqSELL, 1808 MatView_SeqSELL, 1809 0, 1810 0, 1811 /* 64*/ 0, 1812 0, 1813 0, 1814 0, 1815 0, 1816 /* 69*/ 0, 1817 0, 1818 0, 1819 0, 1820 0, 1821 /* 74*/ 0, 1822 MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */ 1823 0, 1824 0, 1825 0, 1826 /* 79*/ 0, 1827 0, 1828 0, 1829 0, 1830 0, 1831 /* 84*/ 0, 1832 0, 1833 0, 1834 0, 1835 0, 1836 /* 89*/ 0, 1837 0, 1838 0, 1839 0, 1840 0, 1841 /* 94*/ 0, 1842 0, 1843 0, 1844 0, 1845 0, 1846 /* 99*/ 0, 1847 0, 1848 0, 1849 MatConjugate_SeqSELL, 1850 0, 1851 /*104*/ 0, 1852 0, 1853 0, 1854 0, 1855 0, 1856 /*109*/ 0, 1857 0, 1858 0, 1859 0, 1860 MatMissingDiagonal_SeqSELL, 1861 /*114*/ 0, 1862 0, 1863 0, 1864 0, 1865 0, 1866 /*119*/ 0, 1867 0, 1868 0, 1869 0, 1870 0, 1871 /*124*/ 0, 1872 0, 1873 0, 1874 0, 1875 0, 1876 /*129*/ 0, 1877 0, 1878 0, 1879 0, 1880 0, 1881 /*134*/ 0, 1882 0, 1883 0, 1884 0, 1885 0, 1886 /*139*/ 0, 1887 0, 1888 0, 1889 MatFDColoringSetUp_SeqXAIJ, 1890 0, 1891 /*144*/0 1892 }; 1893 1894 PetscErrorCode MatStoreValues_SeqSELL(Mat mat) 1895 { 1896 Mat_SeqSELL *a=(Mat_SeqSELL*)mat->data; 1897 PetscErrorCode ierr; 1898 1899 PetscFunctionBegin; 1900 if (!a->nonew) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first"); 1901 1902 /* allocate space for values if not already there */ 1903 if (!a->saved_values) { 1904 ierr = PetscMalloc1(a->sliidx[a->totalslices]+1,&a->saved_values);CHKERRQ(ierr); 1905 ierr = PetscLogObjectMemory((PetscObject)mat,(a->sliidx[a->totalslices]+1)*sizeof(PetscScalar));CHKERRQ(ierr); 1906 } 1907 1908 /* copy values over */ 1909 ierr = PetscArraycpy(a->saved_values,a->val,a->sliidx[a->totalslices]);CHKERRQ(ierr); 1910 PetscFunctionReturn(0); 1911 } 1912 1913 PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat) 1914 { 1915 Mat_SeqSELL *a=(Mat_SeqSELL*)mat->data; 1916 PetscErrorCode ierr; 1917 1918 PetscFunctionBegin; 1919 if (!a->nonew) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first"); 1920 if (!a->saved_values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatStoreValues(A);first"); 1921 ierr = PetscArraycpy(a->val,a->saved_values,a->sliidx[a->totalslices]);CHKERRQ(ierr); 1922 PetscFunctionReturn(0); 1923 } 1924 1925 /*@C 1926 MatSeqSELLRestoreArray - returns access to the array where the data for a MATSEQSELL matrix is stored obtained by MatSeqSELLGetArray() 1927 1928 Not Collective 1929 1930 Input Parameters: 1931 . mat - a MATSEQSELL matrix 1932 . array - pointer to the data 1933 1934 Level: intermediate 1935 1936 .seealso: MatSeqSELLGetArray(), MatSeqSELLRestoreArrayF90() 1937 @*/ 1938 PetscErrorCode MatSeqSELLRestoreArray(Mat A,PetscScalar **array) 1939 { 1940 PetscErrorCode ierr; 1941 1942 PetscFunctionBegin; 1943 ierr = PetscUseMethod(A,"MatSeqSELLRestoreArray_C",(Mat,PetscScalar**),(A,array));CHKERRQ(ierr); 1944 PetscFunctionReturn(0); 1945 } 1946 1947 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B) 1948 { 1949 Mat_SeqSELL *b; 1950 PetscMPIInt size; 1951 PetscErrorCode ierr; 1952 1953 PetscFunctionBegin; 1954 ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRQ(ierr); 1955 if (size > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Comm must be of size 1"); 1956 1957 ierr = PetscNewLog(B,&b);CHKERRQ(ierr); 1958 1959 B->data = (void*)b; 1960 1961 ierr = PetscMemcpy(B->ops,&MatOps_Values,sizeof(struct _MatOps));CHKERRQ(ierr); 1962 1963 b->row = 0; 1964 b->col = 0; 1965 b->icol = 0; 1966 b->reallocs = 0; 1967 b->ignorezeroentries = PETSC_FALSE; 1968 b->roworiented = PETSC_TRUE; 1969 b->nonew = 0; 1970 b->diag = 0; 1971 b->solve_work = 0; 1972 B->spptr = 0; 1973 b->saved_values = 0; 1974 b->idiag = 0; 1975 b->mdiag = 0; 1976 b->ssor_work = 0; 1977 b->omega = 1.0; 1978 b->fshift = 0.0; 1979 b->idiagvalid = PETSC_FALSE; 1980 b->keepnonzeropattern = PETSC_FALSE; 1981 1982 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQSELL);CHKERRQ(ierr); 1983 ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLGetArray_C",MatSeqSELLGetArray_SeqSELL);CHKERRQ(ierr); 1984 ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLRestoreArray_C",MatSeqSELLRestoreArray_SeqSELL);CHKERRQ(ierr); 1985 ierr = PetscObjectComposeFunction((PetscObject)B,"MatStoreValues_C",MatStoreValues_SeqSELL);CHKERRQ(ierr); 1986 ierr = PetscObjectComposeFunction((PetscObject)B,"MatRetrieveValues_C",MatRetrieveValues_SeqSELL);CHKERRQ(ierr); 1987 ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLSetPreallocation_C",MatSeqSELLSetPreallocation_SeqSELL);CHKERRQ(ierr); 1988 ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqsell_seqaij_C",MatConvert_SeqSELL_SeqAIJ);CHKERRQ(ierr); 1989 PetscFunctionReturn(0); 1990 } 1991 1992 /* 1993 Given a matrix generated with MatGetFactor() duplicates all the information in A into B 1994 */ 1995 PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C,Mat A,MatDuplicateOption cpvalues,PetscBool mallocmatspace) 1996 { 1997 Mat_SeqSELL *c,*a=(Mat_SeqSELL*)A->data; 1998 PetscInt i,m=A->rmap->n; 1999 PetscInt totalslices=a->totalslices; 2000 PetscErrorCode ierr; 2001 2002 PetscFunctionBegin; 2003 c = (Mat_SeqSELL*)C->data; 2004 2005 C->factortype = A->factortype; 2006 c->row = 0; 2007 c->col = 0; 2008 c->icol = 0; 2009 c->reallocs = 0; 2010 2011 C->assembled = PETSC_TRUE; 2012 2013 ierr = PetscLayoutReference(A->rmap,&C->rmap);CHKERRQ(ierr); 2014 ierr = PetscLayoutReference(A->cmap,&C->cmap);CHKERRQ(ierr); 2015 2016 ierr = PetscMalloc1(8*totalslices,&c->rlen);CHKERRQ(ierr); 2017 ierr = PetscLogObjectMemory((PetscObject)C,m*sizeof(PetscInt));CHKERRQ(ierr); 2018 ierr = PetscMalloc1(totalslices+1,&c->sliidx);CHKERRQ(ierr); 2019 ierr = PetscLogObjectMemory((PetscObject)C, (totalslices+1)*sizeof(PetscInt));CHKERRQ(ierr); 2020 2021 for (i=0; i<m; i++) c->rlen[i] = a->rlen[i]; 2022 for (i=0; i<totalslices+1; i++) c->sliidx[i] = a->sliidx[i]; 2023 2024 /* allocate the matrix space */ 2025 if (mallocmatspace) { 2026 ierr = PetscMalloc2(a->maxallocmat,&c->val,a->maxallocmat,&c->colidx);CHKERRQ(ierr); 2027 ierr = PetscLogObjectMemory((PetscObject)C,a->maxallocmat*(sizeof(PetscScalar)+sizeof(PetscInt)));CHKERRQ(ierr); 2028 2029 c->singlemalloc = PETSC_TRUE; 2030 2031 if (m > 0) { 2032 ierr = PetscArraycpy(c->colidx,a->colidx,a->maxallocmat);CHKERRQ(ierr); 2033 if (cpvalues == MAT_COPY_VALUES) { 2034 ierr = PetscArraycpy(c->val,a->val,a->maxallocmat);CHKERRQ(ierr); 2035 } else { 2036 ierr = PetscArrayzero(c->val,a->maxallocmat);CHKERRQ(ierr); 2037 } 2038 } 2039 } 2040 2041 c->ignorezeroentries = a->ignorezeroentries; 2042 c->roworiented = a->roworiented; 2043 c->nonew = a->nonew; 2044 if (a->diag) { 2045 ierr = PetscMalloc1(m,&c->diag);CHKERRQ(ierr); 2046 ierr = PetscLogObjectMemory((PetscObject)C,m*sizeof(PetscInt));CHKERRQ(ierr); 2047 for (i=0; i<m; i++) { 2048 c->diag[i] = a->diag[i]; 2049 } 2050 } else c->diag = 0; 2051 2052 c->solve_work = 0; 2053 c->saved_values = 0; 2054 c->idiag = 0; 2055 c->ssor_work = 0; 2056 c->keepnonzeropattern = a->keepnonzeropattern; 2057 c->free_val = PETSC_TRUE; 2058 c->free_colidx = PETSC_TRUE; 2059 2060 c->maxallocmat = a->maxallocmat; 2061 c->maxallocrow = a->maxallocrow; 2062 c->rlenmax = a->rlenmax; 2063 c->nz = a->nz; 2064 C->preallocated = PETSC_TRUE; 2065 2066 c->nonzerorowcnt = a->nonzerorowcnt; 2067 C->nonzerostate = A->nonzerostate; 2068 2069 ierr = PetscFunctionListDuplicate(((PetscObject)A)->qlist,&((PetscObject)C)->qlist);CHKERRQ(ierr); 2070 PetscFunctionReturn(0); 2071 } 2072 2073 PetscErrorCode MatDuplicate_SeqSELL(Mat A,MatDuplicateOption cpvalues,Mat *B) 2074 { 2075 PetscErrorCode ierr; 2076 2077 PetscFunctionBegin; 2078 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 2079 ierr = MatSetSizes(*B,A->rmap->n,A->cmap->n,A->rmap->n,A->cmap->n);CHKERRQ(ierr); 2080 if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) { 2081 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 2082 } 2083 ierr = MatSetType(*B,((PetscObject)A)->type_name);CHKERRQ(ierr); 2084 ierr = MatDuplicateNoCreate_SeqSELL(*B,A,cpvalues,PETSC_TRUE);CHKERRQ(ierr); 2085 PetscFunctionReturn(0); 2086 } 2087 2088 /*@C 2089 MatCreateSeqSELL - Creates a sparse matrix in SELL format. 2090 2091 Collective 2092 2093 Input Parameters: 2094 + comm - MPI communicator, set to PETSC_COMM_SELF 2095 . m - number of rows 2096 . n - number of columns 2097 . rlenmax - maximum number of nonzeros in a row 2098 - rlen - array containing the number of nonzeros in the various rows 2099 (possibly different for each row) or NULL 2100 2101 Output Parameter: 2102 . A - the matrix 2103 2104 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 2105 MatXXXXSetPreallocation() paradigm instead of this routine directly. 2106 [MatXXXXSetPreallocation() is, for example, MatSeqSELLSetPreallocation] 2107 2108 Notes: 2109 If nnz is given then nz is ignored 2110 2111 Specify the preallocated storage with either rlenmax or rlen (not both). 2112 Set rlenmax=PETSC_DEFAULT and rlen=NULL for PETSc to control dynamic memory 2113 allocation. For large problems you MUST preallocate memory or you 2114 will get TERRIBLE performance, see the users' manual chapter on matrices. 2115 2116 Level: intermediate 2117 2118 .seealso: MatCreate(), MatCreateSELL(), MatSetValues(), MatCreateSeqSELLWithArrays() 2119 2120 @*/ 2121 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt maxallocrow,const PetscInt rlen[],Mat *A) 2122 { 2123 PetscErrorCode ierr; 2124 2125 PetscFunctionBegin; 2126 ierr = MatCreate(comm,A);CHKERRQ(ierr); 2127 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 2128 ierr = MatSetType(*A,MATSEQSELL);CHKERRQ(ierr); 2129 ierr = MatSeqSELLSetPreallocation_SeqSELL(*A,maxallocrow,rlen);CHKERRQ(ierr); 2130 PetscFunctionReturn(0); 2131 } 2132 2133 PetscErrorCode MatEqual_SeqSELL(Mat A,Mat B,PetscBool * flg) 2134 { 2135 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data,*b=(Mat_SeqSELL*)B->data; 2136 PetscInt totalslices=a->totalslices; 2137 PetscErrorCode ierr; 2138 2139 PetscFunctionBegin; 2140 /* If the matrix dimensions are not equal,or no of nonzeros */ 2141 if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) ||(a->nz != b->nz) || (a->rlenmax != b->rlenmax)) { 2142 *flg = PETSC_FALSE; 2143 PetscFunctionReturn(0); 2144 } 2145 /* if the a->colidx are the same */ 2146 ierr = PetscArraycmp(a->colidx,b->colidx,a->sliidx[totalslices],flg);CHKERRQ(ierr); 2147 if (!*flg) PetscFunctionReturn(0); 2148 /* if a->val are the same */ 2149 ierr = PetscArraycmp(a->val,b->val,a->sliidx[totalslices],flg);CHKERRQ(ierr); 2150 PetscFunctionReturn(0); 2151 } 2152 2153 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A) 2154 { 2155 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 2156 2157 PetscFunctionBegin; 2158 a->idiagvalid = PETSC_FALSE; 2159 PetscFunctionReturn(0); 2160 } 2161 2162 PetscErrorCode MatConjugate_SeqSELL(Mat A) 2163 { 2164 #if defined(PETSC_USE_COMPLEX) 2165 Mat_SeqSELL *a=(Mat_SeqSELL*)A->data; 2166 PetscInt i; 2167 PetscScalar *val = a->val; 2168 2169 PetscFunctionBegin; 2170 for (i=0; i<a->sliidx[a->totalslices]; i++) { 2171 val[i] = PetscConj(val[i]); 2172 } 2173 #else 2174 PetscFunctionBegin; 2175 #endif 2176 PetscFunctionReturn(0); 2177 } 2178