1 2 #include <../src/mat/impls/baij/seq/baij.h> 3 #include <petsc/private/kernels/blockinvert.h> 4 #include <petscbt.h> 5 #include <petscblaslapack.h> 6 7 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 8 #include <immintrin.h> 9 #endif 10 11 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 12 { 13 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 14 PetscErrorCode ierr; 15 PetscInt row,i,j,k,l,m,n,*nidx,isz,val,ival; 16 const PetscInt *idx; 17 PetscInt start,end,*ai,*aj,bs,*nidx2; 18 PetscBT table; 19 20 PetscFunctionBegin; 21 m = a->mbs; 22 ai = a->i; 23 aj = a->j; 24 bs = A->rmap->bs; 25 26 if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 27 28 ierr = PetscBTCreate(m,&table);CHKERRQ(ierr); 29 ierr = PetscMalloc1(m+1,&nidx);CHKERRQ(ierr); 30 ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr); 31 32 for (i=0; i<is_max; i++) { 33 /* Initialise the two local arrays */ 34 isz = 0; 35 ierr = PetscBTMemzero(m,table);CHKERRQ(ierr); 36 37 /* Extract the indices, assume there can be duplicate entries */ 38 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 39 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 40 41 /* Enter these into the temp arrays i.e mark table[row], enter row into new index */ 42 for (j=0; j<n; ++j) { 43 ival = idx[j]/bs; /* convert the indices into block indices */ 44 if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 45 if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival; 46 } 47 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 48 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 49 50 k = 0; 51 for (j=0; j<ov; j++) { /* for each overlap*/ 52 n = isz; 53 for (; k<n; k++) { /* do only those rows in nidx[k], which are not done yet */ 54 row = nidx[k]; 55 start = ai[row]; 56 end = ai[row+1]; 57 for (l = start; l<end; l++) { 58 val = aj[l]; 59 if (!PetscBTLookupSet(table,val)) nidx[isz++] = val; 60 } 61 } 62 } 63 /* expand the Index Set */ 64 for (j=0; j<isz; j++) { 65 for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k; 66 } 67 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 68 } 69 ierr = PetscBTDestroy(&table);CHKERRQ(ierr); 70 ierr = PetscFree(nidx);CHKERRQ(ierr); 71 ierr = PetscFree(nidx2);CHKERRQ(ierr); 72 PetscFunctionReturn(0); 73 } 74 75 PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 76 { 77 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c; 78 PetscErrorCode ierr; 79 PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens; 80 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 81 const PetscInt *irow,*icol; 82 PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 83 PetscInt *aj = a->j,*ai = a->i; 84 MatScalar *mat_a; 85 Mat C; 86 PetscBool flag; 87 88 PetscFunctionBegin; 89 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 90 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 91 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 92 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 93 94 ierr = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr); 95 ssmap = smap; 96 ierr = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr); 97 for (i=0; i<ncols; i++) smap[icol[i]] = i+1; 98 /* determine lens of each row */ 99 for (i=0; i<nrows; i++) { 100 kstart = ai[irow[i]]; 101 kend = kstart + a->ilen[irow[i]]; 102 lens[i] = 0; 103 for (k=kstart; k<kend; k++) { 104 if (ssmap[aj[k]]) lens[i]++; 105 } 106 } 107 /* Create and fill new matrix */ 108 if (scall == MAT_REUSE_MATRIX) { 109 c = (Mat_SeqBAIJ*)((*B)->data); 110 111 if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 112 ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr); 113 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 114 ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr); 115 C = *B; 116 } else { 117 ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr); 118 ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 119 ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 120 ierr = MatSeqBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr); 121 } 122 c = (Mat_SeqBAIJ*)(C->data); 123 for (i=0; i<nrows; i++) { 124 row = irow[i]; 125 kstart = ai[row]; 126 kend = kstart + a->ilen[row]; 127 mat_i = c->i[i]; 128 mat_j = c->j + mat_i; 129 mat_a = c->a + mat_i*bs2; 130 mat_ilen = c->ilen + i; 131 for (k=kstart; k<kend; k++) { 132 if ((tcol=ssmap[a->j[k]])) { 133 *mat_j++ = tcol - 1; 134 ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 135 mat_a += bs2; 136 (*mat_ilen)++; 137 } 138 } 139 } 140 /* sort */ 141 { 142 MatScalar *work; 143 ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr); 144 for (i=0; i<nrows; i++) { 145 PetscInt ilen; 146 mat_i = c->i[i]; 147 mat_j = c->j + mat_i; 148 mat_a = c->a + mat_i*bs2; 149 ilen = c->ilen[i]; 150 ierr = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr); 151 } 152 ierr = PetscFree(work);CHKERRQ(ierr); 153 } 154 155 /* Free work space */ 156 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 157 ierr = PetscFree(smap);CHKERRQ(ierr); 158 ierr = PetscFree(lens);CHKERRQ(ierr); 159 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 160 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 161 162 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 163 *B = C; 164 PetscFunctionReturn(0); 165 } 166 167 PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 168 { 169 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 170 IS is1,is2; 171 PetscErrorCode ierr; 172 PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs,j; 173 const PetscInt *irow,*icol; 174 175 PetscFunctionBegin; 176 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 177 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 178 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 179 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 180 181 /* Verify if the indices corespond to each element in a block 182 and form the IS with compressed IS */ 183 maxmnbs = PetscMax(a->mbs,a->nbs); 184 ierr = PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);CHKERRQ(ierr); 185 ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr); 186 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 187 for (i=0; i<a->mbs; i++) { 188 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks"); 189 } 190 count = 0; 191 for (i=0; i<nrows; i++) { 192 j = irow[i] / bs; 193 if ((vary[j]--)==bs) iary[count++] = j; 194 } 195 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr); 196 197 ierr = PetscMemzero(vary,(a->nbs)*sizeof(PetscInt));CHKERRQ(ierr); 198 for (i=0; i<ncols; i++) vary[icol[i]/bs]++; 199 for (i=0; i<a->nbs; i++) { 200 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc"); 201 } 202 count = 0; 203 for (i=0; i<ncols; i++) { 204 j = icol[i] / bs; 205 if ((vary[j]--)==bs) iary[count++] = j; 206 } 207 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr); 208 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 209 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 210 ierr = PetscFree2(vary,iary);CHKERRQ(ierr); 211 212 ierr = MatCreateSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr); 213 ierr = ISDestroy(&is1);CHKERRQ(ierr); 214 ierr = ISDestroy(&is2);CHKERRQ(ierr); 215 PetscFunctionReturn(0); 216 } 217 218 PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C) 219 { 220 PetscErrorCode ierr; 221 Mat_SeqBAIJ *c = (Mat_SeqBAIJ*)C->data; 222 Mat_SubSppt *submatj = c->submatis1; 223 224 PetscFunctionBegin; 225 ierr = (*submatj->destroy)(C);CHKERRQ(ierr); 226 ierr = MatDestroySubMatrix_Private(submatj);CHKERRQ(ierr); 227 PetscFunctionReturn(0); 228 } 229 230 PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n,Mat *mat[]) 231 { 232 PetscErrorCode ierr; 233 PetscInt i; 234 Mat C; 235 Mat_SeqBAIJ *c; 236 Mat_SubSppt *submatj; 237 238 PetscFunctionBegin; 239 for (i=0; i<n; i++) { 240 C = (*mat)[i]; 241 c = (Mat_SeqBAIJ*)C->data; 242 submatj = c->submatis1; 243 if (submatj) { 244 ierr = (*submatj->destroy)(C);CHKERRQ(ierr); 245 ierr = MatDestroySubMatrix_Private(submatj);CHKERRQ(ierr); 246 ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr); 247 ierr = PetscLayoutDestroy(&C->rmap);CHKERRQ(ierr); 248 ierr = PetscLayoutDestroy(&C->cmap);CHKERRQ(ierr); 249 ierr = PetscHeaderDestroy(&C);CHKERRQ(ierr); 250 } else { 251 ierr = MatDestroy(&C);CHKERRQ(ierr); 252 } 253 } 254 ierr = PetscFree(*mat);CHKERRQ(ierr); 255 PetscFunctionReturn(0); 256 } 257 258 PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 259 { 260 PetscErrorCode ierr; 261 PetscInt i; 262 263 PetscFunctionBegin; 264 if (scall == MAT_INITIAL_MATRIX) { 265 ierr = PetscCalloc1(n+1,B);CHKERRQ(ierr); 266 } 267 268 for (i=0; i<n; i++) { 269 ierr = MatCreateSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr); 270 } 271 PetscFunctionReturn(0); 272 } 273 274 /* -------------------------------------------------------*/ 275 /* Should check that shapes of vectors and matrices match */ 276 /* -------------------------------------------------------*/ 277 278 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz) 279 { 280 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 281 PetscScalar *z,sum; 282 const PetscScalar *x; 283 const MatScalar *v; 284 PetscErrorCode ierr; 285 PetscInt mbs,i,n; 286 const PetscInt *idx,*ii,*ridx=NULL; 287 PetscBool usecprow=a->compressedrow.use; 288 289 PetscFunctionBegin; 290 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 291 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 292 293 if (usecprow) { 294 mbs = a->compressedrow.nrows; 295 ii = a->compressedrow.i; 296 ridx = a->compressedrow.rindex; 297 ierr = PetscMemzero(z,a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 298 } else { 299 mbs = a->mbs; 300 ii = a->i; 301 } 302 303 for (i=0; i<mbs; i++) { 304 n = ii[1] - ii[0]; 305 v = a->a + ii[0]; 306 idx = a->j + ii[0]; 307 ii++; 308 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 309 PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 310 sum = 0.0; 311 PetscSparseDensePlusDot(sum,x,v,idx,n); 312 if (usecprow) { 313 z[ridx[i]] = sum; 314 } else { 315 z[i] = sum; 316 } 317 } 318 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 319 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 320 ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr); 321 PetscFunctionReturn(0); 322 } 323 324 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz) 325 { 326 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 327 PetscScalar *z = 0,sum1,sum2,*zarray; 328 const PetscScalar *x,*xb; 329 PetscScalar x1,x2; 330 const MatScalar *v; 331 PetscErrorCode ierr; 332 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 333 PetscBool usecprow=a->compressedrow.use; 334 335 PetscFunctionBegin; 336 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 337 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 338 339 idx = a->j; 340 v = a->a; 341 if (usecprow) { 342 mbs = a->compressedrow.nrows; 343 ii = a->compressedrow.i; 344 ridx = a->compressedrow.rindex; 345 ierr = PetscMemzero(zarray,2*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 346 } else { 347 mbs = a->mbs; 348 ii = a->i; 349 z = zarray; 350 } 351 352 for (i=0; i<mbs; i++) { 353 n = ii[1] - ii[0]; ii++; 354 sum1 = 0.0; sum2 = 0.0; 355 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 356 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 357 for (j=0; j<n; j++) { 358 xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1]; 359 sum1 += v[0]*x1 + v[2]*x2; 360 sum2 += v[1]*x1 + v[3]*x2; 361 v += 4; 362 } 363 if (usecprow) z = zarray + 2*ridx[i]; 364 z[0] = sum1; z[1] = sum2; 365 if (!usecprow) z += 2; 366 } 367 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 368 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 369 ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr); 370 PetscFunctionReturn(0); 371 } 372 373 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz) 374 { 375 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 376 PetscScalar *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray; 377 const PetscScalar *x,*xb; 378 const MatScalar *v; 379 PetscErrorCode ierr; 380 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 381 PetscBool usecprow=a->compressedrow.use; 382 383 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 384 #pragma disjoint(*v,*z,*xb) 385 #endif 386 387 PetscFunctionBegin; 388 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 389 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 390 391 idx = a->j; 392 v = a->a; 393 if (usecprow) { 394 mbs = a->compressedrow.nrows; 395 ii = a->compressedrow.i; 396 ridx = a->compressedrow.rindex; 397 ierr = PetscMemzero(zarray,3*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 398 } else { 399 mbs = a->mbs; 400 ii = a->i; 401 z = zarray; 402 } 403 404 for (i=0; i<mbs; i++) { 405 n = ii[1] - ii[0]; ii++; 406 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 407 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 408 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 409 for (j=0; j<n; j++) { 410 xb = x + 3*(*idx++); 411 x1 = xb[0]; 412 x2 = xb[1]; 413 x3 = xb[2]; 414 415 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 416 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 417 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 418 v += 9; 419 } 420 if (usecprow) z = zarray + 3*ridx[i]; 421 z[0] = sum1; z[1] = sum2; z[2] = sum3; 422 if (!usecprow) z += 3; 423 } 424 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 425 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 426 ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr); 427 PetscFunctionReturn(0); 428 } 429 430 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz) 431 { 432 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 433 PetscScalar *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray; 434 const PetscScalar *x,*xb; 435 const MatScalar *v; 436 PetscErrorCode ierr; 437 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 438 PetscBool usecprow=a->compressedrow.use; 439 440 PetscFunctionBegin; 441 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 442 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 443 444 idx = a->j; 445 v = a->a; 446 if (usecprow) { 447 mbs = a->compressedrow.nrows; 448 ii = a->compressedrow.i; 449 ridx = a->compressedrow.rindex; 450 ierr = PetscMemzero(zarray,4*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 451 } else { 452 mbs = a->mbs; 453 ii = a->i; 454 z = zarray; 455 } 456 457 for (i=0; i<mbs; i++) { 458 n = ii[1] - ii[0]; 459 ii++; 460 sum1 = 0.0; 461 sum2 = 0.0; 462 sum3 = 0.0; 463 sum4 = 0.0; 464 465 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 466 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 467 for (j=0; j<n; j++) { 468 xb = x + 4*(*idx++); 469 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 470 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 471 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 472 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 473 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 474 v += 16; 475 } 476 if (usecprow) z = zarray + 4*ridx[i]; 477 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 478 if (!usecprow) z += 4; 479 } 480 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 481 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 482 ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr); 483 PetscFunctionReturn(0); 484 } 485 486 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz) 487 { 488 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 489 PetscScalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray; 490 const PetscScalar *xb,*x; 491 const MatScalar *v; 492 PetscErrorCode ierr; 493 const PetscInt *idx,*ii,*ridx=NULL; 494 PetscInt mbs,i,j,n; 495 PetscBool usecprow=a->compressedrow.use; 496 497 PetscFunctionBegin; 498 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 499 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 500 501 idx = a->j; 502 v = a->a; 503 if (usecprow) { 504 mbs = a->compressedrow.nrows; 505 ii = a->compressedrow.i; 506 ridx = a->compressedrow.rindex; 507 ierr = PetscMemzero(zarray,5*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 508 } else { 509 mbs = a->mbs; 510 ii = a->i; 511 z = zarray; 512 } 513 514 for (i=0; i<mbs; i++) { 515 n = ii[1] - ii[0]; ii++; 516 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 517 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 518 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 519 for (j=0; j<n; j++) { 520 xb = x + 5*(*idx++); 521 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 522 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 523 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 524 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 525 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 526 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 527 v += 25; 528 } 529 if (usecprow) z = zarray + 5*ridx[i]; 530 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 531 if (!usecprow) z += 5; 532 } 533 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 534 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 535 ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr); 536 PetscFunctionReturn(0); 537 } 538 539 540 541 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz) 542 { 543 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 544 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6; 545 const PetscScalar *x,*xb; 546 PetscScalar x1,x2,x3,x4,x5,x6,*zarray; 547 const MatScalar *v; 548 PetscErrorCode ierr; 549 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 550 PetscBool usecprow=a->compressedrow.use; 551 552 PetscFunctionBegin; 553 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 554 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 555 556 idx = a->j; 557 v = a->a; 558 if (usecprow) { 559 mbs = a->compressedrow.nrows; 560 ii = a->compressedrow.i; 561 ridx = a->compressedrow.rindex; 562 ierr = PetscMemzero(zarray,6*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 563 } else { 564 mbs = a->mbs; 565 ii = a->i; 566 z = zarray; 567 } 568 569 for (i=0; i<mbs; i++) { 570 n = ii[1] - ii[0]; 571 ii++; 572 sum1 = 0.0; 573 sum2 = 0.0; 574 sum3 = 0.0; 575 sum4 = 0.0; 576 sum5 = 0.0; 577 sum6 = 0.0; 578 579 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 580 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 581 for (j=0; j<n; j++) { 582 xb = x + 6*(*idx++); 583 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 584 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 585 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 586 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 587 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 588 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 589 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 590 v += 36; 591 } 592 if (usecprow) z = zarray + 6*ridx[i]; 593 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 594 if (!usecprow) z += 6; 595 } 596 597 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 598 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 599 ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr); 600 PetscFunctionReturn(0); 601 } 602 603 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz) 604 { 605 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 606 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 607 const PetscScalar *x,*xb; 608 PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray; 609 const MatScalar *v; 610 PetscErrorCode ierr; 611 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 612 PetscBool usecprow=a->compressedrow.use; 613 614 PetscFunctionBegin; 615 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 616 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 617 618 idx = a->j; 619 v = a->a; 620 if (usecprow) { 621 mbs = a->compressedrow.nrows; 622 ii = a->compressedrow.i; 623 ridx = a->compressedrow.rindex; 624 ierr = PetscMemzero(zarray,7*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 625 } else { 626 mbs = a->mbs; 627 ii = a->i; 628 z = zarray; 629 } 630 631 for (i=0; i<mbs; i++) { 632 n = ii[1] - ii[0]; 633 ii++; 634 sum1 = 0.0; 635 sum2 = 0.0; 636 sum3 = 0.0; 637 sum4 = 0.0; 638 sum5 = 0.0; 639 sum6 = 0.0; 640 sum7 = 0.0; 641 642 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 643 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 644 for (j=0; j<n; j++) { 645 xb = x + 7*(*idx++); 646 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 647 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 648 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 649 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 650 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 651 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 652 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 653 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 654 v += 49; 655 } 656 if (usecprow) z = zarray + 7*ridx[i]; 657 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 658 if (!usecprow) z += 7; 659 } 660 661 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 662 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 663 ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr); 664 PetscFunctionReturn(0); 665 } 666 667 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 668 PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec zz) 669 { 670 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 671 PetscScalar *z = 0,*work,*workt,*zarray; 672 const PetscScalar *x,*xb; 673 const MatScalar *v; 674 PetscErrorCode ierr; 675 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 676 const PetscInt *idx,*ii,*ridx=NULL; 677 PetscInt k; 678 PetscBool usecprow=a->compressedrow.use; 679 680 __m256d a0,a1,a2,a3,a4,a5; 681 __m256d w0,w1,w2,w3; 682 __m256d z0,z1,z2; 683 __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63); 684 685 PetscFunctionBegin; 686 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 687 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 688 689 idx = a->j; 690 v = a->a; 691 if (usecprow) { 692 mbs = a->compressedrow.nrows; 693 ii = a->compressedrow.i; 694 ridx = a->compressedrow.rindex; 695 ierr = PetscMemzero(zarray,bs*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 696 } else { 697 mbs = a->mbs; 698 ii = a->i; 699 z = zarray; 700 } 701 702 if (!a->mult_work) { 703 k = PetscMax(A->rmap->n,A->cmap->n); 704 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 705 } 706 707 work = a->mult_work; 708 for (i=0; i<mbs; i++) { 709 n = ii[1] - ii[0]; ii++; 710 workt = work; 711 for (j=0; j<n; j++) { 712 xb = x + bs*(*idx++); 713 for (k=0; k<bs; k++) workt[k] = xb[k]; 714 workt += bs; 715 } 716 if (usecprow) z = zarray + bs*ridx[i]; 717 718 z0 = _mm256_setzero_pd(); z1 = _mm256_setzero_pd(); z2 = _mm256_setzero_pd(); 719 720 for (j=0; j<n; j++) { 721 // first column of a 722 w0 = _mm256_set1_pd(work[j*9 ]); 723 a0 = _mm256_loadu_pd(&v[j*81 ]); z0 = _mm256_fmadd_pd(a0,w0,z0); 724 a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1); 725 a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2); 726 727 // second column of a 728 w1 = _mm256_set1_pd(work[j*9+ 1]); 729 a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0); 730 a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1); 731 a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2); 732 733 // third column of a 734 w2 = _mm256_set1_pd(work[j*9 +2]); 735 a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0); 736 a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1); 737 a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2); 738 739 // fourth column of a 740 w3 = _mm256_set1_pd(work[j*9+ 3]); 741 a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0); 742 a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1); 743 a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2); 744 745 // fifth column of a 746 w0 = _mm256_set1_pd(work[j*9+ 4]); 747 a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0); 748 a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1); 749 a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2); 750 751 // sixth column of a 752 w1 = _mm256_set1_pd(work[j*9+ 5]); 753 a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0); 754 a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1); 755 a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2); 756 757 // seventh column of a 758 w2 = _mm256_set1_pd(work[j*9+ 6]); 759 a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0); 760 a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1); 761 a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2); 762 763 // eigth column of a 764 w3 = _mm256_set1_pd(work[j*9+ 7]); 765 a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0); 766 a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1); 767 a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2); 768 769 // ninth column of a 770 w0 = _mm256_set1_pd(work[j*9+ 8]); 771 a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0); 772 a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1); 773 a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2); 774 } 775 776 _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2); 777 778 v += n*bs2; 779 if (!usecprow) z += bs; 780 } 781 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 782 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 783 ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr); 784 PetscFunctionReturn(0); 785 } 786 #endif 787 788 PetscErrorCode MatMult_SeqBAIJ_11(Mat A,Vec xx,Vec zz) 789 { 790 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 791 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11; 792 const PetscScalar *x,*xb; 793 PetscScalar *zarray,xv; 794 const MatScalar *v; 795 PetscErrorCode ierr; 796 const PetscInt *ii,*ij=a->j,*idx; 797 PetscInt mbs,i,j,k,n,*ridx=NULL; 798 PetscBool usecprow=a->compressedrow.use; 799 800 PetscFunctionBegin; 801 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 802 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 803 804 v = a->a; 805 if (usecprow) { 806 mbs = a->compressedrow.nrows; 807 ii = a->compressedrow.i; 808 ridx = a->compressedrow.rindex; 809 ierr = PetscMemzero(zarray,11*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 810 } else { 811 mbs = a->mbs; 812 ii = a->i; 813 z = zarray; 814 } 815 816 for (i=0; i<mbs; i++) { 817 n = ii[i+1] - ii[i]; 818 idx = ij + ii[i]; 819 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 820 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; 821 822 for (j=0; j<n; j++) { 823 xb = x + 11*(idx[j]); 824 825 for (k=0; k<11; k++) { 826 xv = xb[k]; 827 sum1 += v[0]*xv; 828 sum2 += v[1]*xv; 829 sum3 += v[2]*xv; 830 sum4 += v[3]*xv; 831 sum5 += v[4]*xv; 832 sum6 += v[5]*xv; 833 sum7 += v[6]*xv; 834 sum8 += v[7]*xv; 835 sum9 += v[8]*xv; 836 sum10 += v[9]*xv; 837 sum11 += v[10]*xv; 838 v += 11; 839 } 840 } 841 if (usecprow) z = zarray + 11*ridx[i]; 842 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 843 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; 844 845 if (!usecprow) z += 11; 846 } 847 848 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 849 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 850 ierr = PetscLogFlops(242.0*a->nz - 11.0*a->nonzerorowcnt);CHKERRQ(ierr); 851 PetscFunctionReturn(0); 852 } 853 854 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */ 855 /* Default MatMult for block size 15 */ 856 857 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz) 858 { 859 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 860 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 861 const PetscScalar *x,*xb; 862 PetscScalar *zarray,xv; 863 const MatScalar *v; 864 PetscErrorCode ierr; 865 const PetscInt *ii,*ij=a->j,*idx; 866 PetscInt mbs,i,j,k,n,*ridx=NULL; 867 PetscBool usecprow=a->compressedrow.use; 868 869 PetscFunctionBegin; 870 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 871 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 872 873 v = a->a; 874 if (usecprow) { 875 mbs = a->compressedrow.nrows; 876 ii = a->compressedrow.i; 877 ridx = a->compressedrow.rindex; 878 ierr = PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 879 } else { 880 mbs = a->mbs; 881 ii = a->i; 882 z = zarray; 883 } 884 885 for (i=0; i<mbs; i++) { 886 n = ii[i+1] - ii[i]; 887 idx = ij + ii[i]; 888 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 889 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 890 891 for (j=0; j<n; j++) { 892 xb = x + 15*(idx[j]); 893 894 for (k=0; k<15; k++) { 895 xv = xb[k]; 896 sum1 += v[0]*xv; 897 sum2 += v[1]*xv; 898 sum3 += v[2]*xv; 899 sum4 += v[3]*xv; 900 sum5 += v[4]*xv; 901 sum6 += v[5]*xv; 902 sum7 += v[6]*xv; 903 sum8 += v[7]*xv; 904 sum9 += v[8]*xv; 905 sum10 += v[9]*xv; 906 sum11 += v[10]*xv; 907 sum12 += v[11]*xv; 908 sum13 += v[12]*xv; 909 sum14 += v[13]*xv; 910 sum15 += v[14]*xv; 911 v += 15; 912 } 913 } 914 if (usecprow) z = zarray + 15*ridx[i]; 915 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 916 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 917 918 if (!usecprow) z += 15; 919 } 920 921 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 922 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 923 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 924 PetscFunctionReturn(0); 925 } 926 927 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */ 928 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz) 929 { 930 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 931 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 932 const PetscScalar *x,*xb; 933 PetscScalar x1,x2,x3,x4,*zarray; 934 const MatScalar *v; 935 PetscErrorCode ierr; 936 const PetscInt *ii,*ij=a->j,*idx; 937 PetscInt mbs,i,j,n,*ridx=NULL; 938 PetscBool usecprow=a->compressedrow.use; 939 940 PetscFunctionBegin; 941 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 942 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 943 944 v = a->a; 945 if (usecprow) { 946 mbs = a->compressedrow.nrows; 947 ii = a->compressedrow.i; 948 ridx = a->compressedrow.rindex; 949 ierr = PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 950 } else { 951 mbs = a->mbs; 952 ii = a->i; 953 z = zarray; 954 } 955 956 for (i=0; i<mbs; i++) { 957 n = ii[i+1] - ii[i]; 958 idx = ij + ii[i]; 959 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 960 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 961 962 for (j=0; j<n; j++) { 963 xb = x + 15*(idx[j]); 964 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 965 966 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 967 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 968 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 969 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 970 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 971 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 972 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 973 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 974 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 975 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 976 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 977 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 978 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 979 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 980 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 981 982 v += 60; 983 984 x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7]; 985 986 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 987 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 988 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 989 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 990 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 991 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 992 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 993 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 994 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 995 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 996 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 997 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 998 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 999 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 1000 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 1001 v += 60; 1002 1003 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; 1004 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 1005 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 1006 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 1007 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 1008 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 1009 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 1010 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 1011 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 1012 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 1013 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 1014 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 1015 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 1016 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 1017 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 1018 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 1019 v += 60; 1020 1021 x1 = xb[12]; x2 = xb[13]; x3 = xb[14]; 1022 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3; 1023 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3; 1024 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3; 1025 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3; 1026 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3; 1027 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3; 1028 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3; 1029 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3; 1030 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3; 1031 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3; 1032 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3; 1033 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3; 1034 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3; 1035 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3; 1036 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3; 1037 v += 45; 1038 } 1039 if (usecprow) z = zarray + 15*ridx[i]; 1040 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1041 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1042 1043 if (!usecprow) z += 15; 1044 } 1045 1046 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1047 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1048 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 1049 PetscFunctionReturn(0); 1050 } 1051 1052 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */ 1053 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz) 1054 { 1055 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1056 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1057 const PetscScalar *x,*xb; 1058 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,*zarray; 1059 const MatScalar *v; 1060 PetscErrorCode ierr; 1061 const PetscInt *ii,*ij=a->j,*idx; 1062 PetscInt mbs,i,j,n,*ridx=NULL; 1063 PetscBool usecprow=a->compressedrow.use; 1064 1065 PetscFunctionBegin; 1066 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1067 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1068 1069 v = a->a; 1070 if (usecprow) { 1071 mbs = a->compressedrow.nrows; 1072 ii = a->compressedrow.i; 1073 ridx = a->compressedrow.rindex; 1074 ierr = PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1075 } else { 1076 mbs = a->mbs; 1077 ii = a->i; 1078 z = zarray; 1079 } 1080 1081 for (i=0; i<mbs; i++) { 1082 n = ii[i+1] - ii[i]; 1083 idx = ij + ii[i]; 1084 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1085 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1086 1087 for (j=0; j<n; j++) { 1088 xb = x + 15*(idx[j]); 1089 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1090 x8 = xb[7]; 1091 1092 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8; 1093 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8; 1094 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8; 1095 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8; 1096 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8; 1097 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8; 1098 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8; 1099 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8; 1100 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8; 1101 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8; 1102 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8; 1103 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8; 1104 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8; 1105 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8; 1106 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8; 1107 v += 120; 1108 1109 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14]; 1110 1111 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7; 1112 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7; 1113 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7; 1114 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7; 1115 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7; 1116 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7; 1117 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7; 1118 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7; 1119 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7; 1120 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7; 1121 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7; 1122 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7; 1123 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7; 1124 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7; 1125 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7; 1126 v += 105; 1127 } 1128 if (usecprow) z = zarray + 15*ridx[i]; 1129 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1130 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1131 1132 if (!usecprow) z += 15; 1133 } 1134 1135 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1136 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1137 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 1138 PetscFunctionReturn(0); 1139 } 1140 1141 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */ 1142 1143 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz) 1144 { 1145 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1146 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1147 const PetscScalar *x,*xb; 1148 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray; 1149 const MatScalar *v; 1150 PetscErrorCode ierr; 1151 const PetscInt *ii,*ij=a->j,*idx; 1152 PetscInt mbs,i,j,n,*ridx=NULL; 1153 PetscBool usecprow=a->compressedrow.use; 1154 1155 PetscFunctionBegin; 1156 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1157 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1158 1159 v = a->a; 1160 if (usecprow) { 1161 mbs = a->compressedrow.nrows; 1162 ii = a->compressedrow.i; 1163 ridx = a->compressedrow.rindex; 1164 ierr = PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1165 } else { 1166 mbs = a->mbs; 1167 ii = a->i; 1168 z = zarray; 1169 } 1170 1171 for (i=0; i<mbs; i++) { 1172 n = ii[i+1] - ii[i]; 1173 idx = ij + ii[i]; 1174 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1175 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1176 1177 for (j=0; j<n; j++) { 1178 xb = x + 15*(idx[j]); 1179 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1180 x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14]; 1181 1182 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 1183 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 1184 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 1185 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 1186 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 1187 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 1188 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 1189 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 1190 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 1191 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 1192 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 1193 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 1194 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 1195 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 1196 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 1197 v += 225; 1198 } 1199 if (usecprow) z = zarray + 15*ridx[i]; 1200 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1201 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1202 1203 if (!usecprow) z += 15; 1204 } 1205 1206 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1207 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1208 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 1209 PetscFunctionReturn(0); 1210 } 1211 1212 1213 /* 1214 This will not work with MatScalar == float because it calls the BLAS 1215 */ 1216 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz) 1217 { 1218 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1219 PetscScalar *z = 0,*work,*workt,*zarray; 1220 const PetscScalar *x,*xb; 1221 const MatScalar *v; 1222 PetscErrorCode ierr; 1223 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 1224 const PetscInt *idx,*ii,*ridx=NULL; 1225 PetscInt ncols,k; 1226 PetscBool usecprow=a->compressedrow.use; 1227 1228 PetscFunctionBegin; 1229 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1230 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1231 1232 idx = a->j; 1233 v = a->a; 1234 if (usecprow) { 1235 mbs = a->compressedrow.nrows; 1236 ii = a->compressedrow.i; 1237 ridx = a->compressedrow.rindex; 1238 ierr = PetscMemzero(zarray,bs*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1239 } else { 1240 mbs = a->mbs; 1241 ii = a->i; 1242 z = zarray; 1243 } 1244 1245 if (!a->mult_work) { 1246 k = PetscMax(A->rmap->n,A->cmap->n); 1247 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 1248 } 1249 work = a->mult_work; 1250 for (i=0; i<mbs; i++) { 1251 n = ii[1] - ii[0]; ii++; 1252 ncols = n*bs; 1253 workt = work; 1254 for (j=0; j<n; j++) { 1255 xb = x + bs*(*idx++); 1256 for (k=0; k<bs; k++) workt[k] = xb[k]; 1257 workt += bs; 1258 } 1259 if (usecprow) z = zarray + bs*ridx[i]; 1260 PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z); 1261 /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */ 1262 v += n*bs2; 1263 if (!usecprow) z += bs; 1264 } 1265 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1266 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1267 ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr); 1268 PetscFunctionReturn(0); 1269 } 1270 1271 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 1272 { 1273 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1274 const PetscScalar *x; 1275 PetscScalar *y,*z,sum; 1276 const MatScalar *v; 1277 PetscErrorCode ierr; 1278 PetscInt mbs=a->mbs,i,n,*ridx=NULL; 1279 const PetscInt *idx,*ii; 1280 PetscBool usecprow=a->compressedrow.use; 1281 1282 PetscFunctionBegin; 1283 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1284 ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr); 1285 1286 idx = a->j; 1287 v = a->a; 1288 if (usecprow) { 1289 if (zz != yy) { 1290 ierr = PetscMemcpy(z,y,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1291 } 1292 mbs = a->compressedrow.nrows; 1293 ii = a->compressedrow.i; 1294 ridx = a->compressedrow.rindex; 1295 } else { 1296 ii = a->i; 1297 } 1298 1299 for (i=0; i<mbs; i++) { 1300 n = ii[1] - ii[0]; 1301 ii++; 1302 if (!usecprow) { 1303 sum = y[i]; 1304 } else { 1305 sum = y[ridx[i]]; 1306 } 1307 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1308 PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1309 PetscSparseDensePlusDot(sum,x,v,idx,n); 1310 v += n; 1311 idx += n; 1312 if (usecprow) { 1313 z[ridx[i]] = sum; 1314 } else { 1315 z[i] = sum; 1316 } 1317 } 1318 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1319 ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr); 1320 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1321 PetscFunctionReturn(0); 1322 } 1323 1324 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 1325 { 1326 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1327 PetscScalar *y = 0,*z = 0,sum1,sum2; 1328 const PetscScalar *x,*xb; 1329 PetscScalar x1,x2,*yarray,*zarray; 1330 const MatScalar *v; 1331 PetscErrorCode ierr; 1332 PetscInt mbs = a->mbs,i,n,j; 1333 const PetscInt *idx,*ii,*ridx = NULL; 1334 PetscBool usecprow = a->compressedrow.use; 1335 1336 PetscFunctionBegin; 1337 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1338 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1339 1340 idx = a->j; 1341 v = a->a; 1342 if (usecprow) { 1343 if (zz != yy) { 1344 ierr = PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1345 } 1346 mbs = a->compressedrow.nrows; 1347 ii = a->compressedrow.i; 1348 ridx = a->compressedrow.rindex; 1349 } else { 1350 ii = a->i; 1351 y = yarray; 1352 z = zarray; 1353 } 1354 1355 for (i=0; i<mbs; i++) { 1356 n = ii[1] - ii[0]; ii++; 1357 if (usecprow) { 1358 z = zarray + 2*ridx[i]; 1359 y = yarray + 2*ridx[i]; 1360 } 1361 sum1 = y[0]; sum2 = y[1]; 1362 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1363 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1364 for (j=0; j<n; j++) { 1365 xb = x + 2*(*idx++); 1366 x1 = xb[0]; 1367 x2 = xb[1]; 1368 1369 sum1 += v[0]*x1 + v[2]*x2; 1370 sum2 += v[1]*x1 + v[3]*x2; 1371 v += 4; 1372 } 1373 z[0] = sum1; z[1] = sum2; 1374 if (!usecprow) { 1375 z += 2; y += 2; 1376 } 1377 } 1378 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1379 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1380 ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr); 1381 PetscFunctionReturn(0); 1382 } 1383 1384 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 1385 { 1386 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1387 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray; 1388 const PetscScalar *x,*xb; 1389 const MatScalar *v; 1390 PetscErrorCode ierr; 1391 PetscInt mbs = a->mbs,i,j,n; 1392 const PetscInt *idx,*ii,*ridx = NULL; 1393 PetscBool usecprow = a->compressedrow.use; 1394 1395 PetscFunctionBegin; 1396 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1397 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1398 1399 idx = a->j; 1400 v = a->a; 1401 if (usecprow) { 1402 if (zz != yy) { 1403 ierr = PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1404 } 1405 mbs = a->compressedrow.nrows; 1406 ii = a->compressedrow.i; 1407 ridx = a->compressedrow.rindex; 1408 } else { 1409 ii = a->i; 1410 y = yarray; 1411 z = zarray; 1412 } 1413 1414 for (i=0; i<mbs; i++) { 1415 n = ii[1] - ii[0]; ii++; 1416 if (usecprow) { 1417 z = zarray + 3*ridx[i]; 1418 y = yarray + 3*ridx[i]; 1419 } 1420 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; 1421 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1422 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1423 for (j=0; j<n; j++) { 1424 xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1425 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 1426 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 1427 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 1428 v += 9; 1429 } 1430 z[0] = sum1; z[1] = sum2; z[2] = sum3; 1431 if (!usecprow) { 1432 z += 3; y += 3; 1433 } 1434 } 1435 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1436 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1437 ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr); 1438 PetscFunctionReturn(0); 1439 } 1440 1441 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 1442 { 1443 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1444 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray; 1445 const PetscScalar *x,*xb; 1446 const MatScalar *v; 1447 PetscErrorCode ierr; 1448 PetscInt mbs = a->mbs,i,j,n; 1449 const PetscInt *idx,*ii,*ridx=NULL; 1450 PetscBool usecprow=a->compressedrow.use; 1451 1452 PetscFunctionBegin; 1453 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1454 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1455 1456 idx = a->j; 1457 v = a->a; 1458 if (usecprow) { 1459 if (zz != yy) { 1460 ierr = PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1461 } 1462 mbs = a->compressedrow.nrows; 1463 ii = a->compressedrow.i; 1464 ridx = a->compressedrow.rindex; 1465 } else { 1466 ii = a->i; 1467 y = yarray; 1468 z = zarray; 1469 } 1470 1471 for (i=0; i<mbs; i++) { 1472 n = ii[1] - ii[0]; ii++; 1473 if (usecprow) { 1474 z = zarray + 4*ridx[i]; 1475 y = yarray + 4*ridx[i]; 1476 } 1477 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; 1478 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1479 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1480 for (j=0; j<n; j++) { 1481 xb = x + 4*(*idx++); 1482 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1483 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1484 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1485 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1486 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1487 v += 16; 1488 } 1489 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 1490 if (!usecprow) { 1491 z += 4; y += 4; 1492 } 1493 } 1494 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1495 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1496 ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr); 1497 PetscFunctionReturn(0); 1498 } 1499 1500 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 1501 { 1502 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1503 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5; 1504 const PetscScalar *x,*xb; 1505 PetscScalar *yarray,*zarray; 1506 const MatScalar *v; 1507 PetscErrorCode ierr; 1508 PetscInt mbs = a->mbs,i,j,n; 1509 const PetscInt *idx,*ii,*ridx = NULL; 1510 PetscBool usecprow=a->compressedrow.use; 1511 1512 PetscFunctionBegin; 1513 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1514 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1515 1516 idx = a->j; 1517 v = a->a; 1518 if (usecprow) { 1519 if (zz != yy) { 1520 ierr = PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1521 } 1522 mbs = a->compressedrow.nrows; 1523 ii = a->compressedrow.i; 1524 ridx = a->compressedrow.rindex; 1525 } else { 1526 ii = a->i; 1527 y = yarray; 1528 z = zarray; 1529 } 1530 1531 for (i=0; i<mbs; i++) { 1532 n = ii[1] - ii[0]; ii++; 1533 if (usecprow) { 1534 z = zarray + 5*ridx[i]; 1535 y = yarray + 5*ridx[i]; 1536 } 1537 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; 1538 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1539 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1540 for (j=0; j<n; j++) { 1541 xb = x + 5*(*idx++); 1542 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 1543 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1544 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1545 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1546 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1547 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1548 v += 25; 1549 } 1550 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 1551 if (!usecprow) { 1552 z += 5; y += 5; 1553 } 1554 } 1555 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1556 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1557 ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr); 1558 PetscFunctionReturn(0); 1559 } 1560 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 1561 { 1562 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1563 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6; 1564 const PetscScalar *x,*xb; 1565 PetscScalar x1,x2,x3,x4,x5,x6,*yarray,*zarray; 1566 const MatScalar *v; 1567 PetscErrorCode ierr; 1568 PetscInt mbs = a->mbs,i,j,n; 1569 const PetscInt *idx,*ii,*ridx=NULL; 1570 PetscBool usecprow=a->compressedrow.use; 1571 1572 PetscFunctionBegin; 1573 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1574 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1575 1576 idx = a->j; 1577 v = a->a; 1578 if (usecprow) { 1579 if (zz != yy) { 1580 ierr = PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1581 } 1582 mbs = a->compressedrow.nrows; 1583 ii = a->compressedrow.i; 1584 ridx = a->compressedrow.rindex; 1585 } else { 1586 ii = a->i; 1587 y = yarray; 1588 z = zarray; 1589 } 1590 1591 for (i=0; i<mbs; i++) { 1592 n = ii[1] - ii[0]; ii++; 1593 if (usecprow) { 1594 z = zarray + 6*ridx[i]; 1595 y = yarray + 6*ridx[i]; 1596 } 1597 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; 1598 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1599 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1600 for (j=0; j<n; j++) { 1601 xb = x + 6*(*idx++); 1602 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 1603 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1604 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1605 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1606 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1607 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1608 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1609 v += 36; 1610 } 1611 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 1612 if (!usecprow) { 1613 z += 6; y += 6; 1614 } 1615 } 1616 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1617 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1618 ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr); 1619 PetscFunctionReturn(0); 1620 } 1621 1622 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 1623 { 1624 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1625 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 1626 const PetscScalar *x,*xb; 1627 PetscScalar x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray; 1628 const MatScalar *v; 1629 PetscErrorCode ierr; 1630 PetscInt mbs = a->mbs,i,j,n; 1631 const PetscInt *idx,*ii,*ridx = NULL; 1632 PetscBool usecprow=a->compressedrow.use; 1633 1634 PetscFunctionBegin; 1635 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1636 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1637 1638 idx = a->j; 1639 v = a->a; 1640 if (usecprow) { 1641 if (zz != yy) { 1642 ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1643 } 1644 mbs = a->compressedrow.nrows; 1645 ii = a->compressedrow.i; 1646 ridx = a->compressedrow.rindex; 1647 } else { 1648 ii = a->i; 1649 y = yarray; 1650 z = zarray; 1651 } 1652 1653 for (i=0; i<mbs; i++) { 1654 n = ii[1] - ii[0]; ii++; 1655 if (usecprow) { 1656 z = zarray + 7*ridx[i]; 1657 y = yarray + 7*ridx[i]; 1658 } 1659 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 1660 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1661 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1662 for (j=0; j<n; j++) { 1663 xb = x + 7*(*idx++); 1664 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1665 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1666 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1667 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1668 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1669 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1670 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1671 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1672 v += 49; 1673 } 1674 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1675 if (!usecprow) { 1676 z += 7; y += 7; 1677 } 1678 } 1679 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1680 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1681 ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr); 1682 PetscFunctionReturn(0); 1683 } 1684 1685 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 1686 PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec yy,Vec zz) 1687 { 1688 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1689 PetscScalar *z = 0,*work,*workt,*zarray; 1690 const PetscScalar *x,*xb; 1691 const MatScalar *v; 1692 PetscErrorCode ierr; 1693 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 1694 PetscInt k; 1695 PetscBool usecprow=a->compressedrow.use; 1696 const PetscInt *idx,*ii,*ridx=NULL; 1697 1698 __m256d a0,a1,a2,a3,a4,a5; 1699 __m256d w0,w1,w2,w3; 1700 __m256d z0,z1,z2; 1701 __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63); 1702 1703 PetscFunctionBegin; 1704 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 1705 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1706 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1707 1708 idx = a->j; 1709 v = a->a; 1710 if (usecprow) { 1711 mbs = a->compressedrow.nrows; 1712 ii = a->compressedrow.i; 1713 ridx = a->compressedrow.rindex; 1714 } else { 1715 mbs = a->mbs; 1716 ii = a->i; 1717 z = zarray; 1718 } 1719 1720 if (!a->mult_work) { 1721 k = PetscMax(A->rmap->n,A->cmap->n); 1722 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 1723 } 1724 1725 work = a->mult_work; 1726 for (i=0; i<mbs; i++) { 1727 n = ii[1] - ii[0]; ii++; 1728 workt = work; 1729 for (j=0; j<n; j++) { 1730 xb = x + bs*(*idx++); 1731 for (k=0; k<bs; k++) workt[k] = xb[k]; 1732 workt += bs; 1733 } 1734 if (usecprow) z = zarray + bs*ridx[i]; 1735 1736 z0 = _mm256_loadu_pd(&z[ 0]); z1 = _mm256_loadu_pd(&z[ 4]); z2 = _mm256_set1_pd(z[ 8]); 1737 1738 for (j=0; j<n; j++) { 1739 // first column of a 1740 w0 = _mm256_set1_pd(work[j*9 ]); 1741 a0 = _mm256_loadu_pd(&v[j*81 ]); z0 = _mm256_fmadd_pd(a0,w0,z0); 1742 a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1); 1743 a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2); 1744 1745 // second column of a 1746 w1 = _mm256_set1_pd(work[j*9+ 1]); 1747 a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0); 1748 a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1); 1749 a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2); 1750 1751 // third column of a 1752 w2 = _mm256_set1_pd(work[j*9+ 2]); 1753 a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0); 1754 a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1); 1755 a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2); 1756 1757 // fourth column of a 1758 w3 = _mm256_set1_pd(work[j*9+ 3]); 1759 a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0); 1760 a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1); 1761 a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2); 1762 1763 // fifth column of a 1764 w0 = _mm256_set1_pd(work[j*9+ 4]); 1765 a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0); 1766 a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1); 1767 a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2); 1768 1769 // sixth column of a 1770 w1 = _mm256_set1_pd(work[j*9+ 5]); 1771 a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0); 1772 a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1); 1773 a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2); 1774 1775 // seventh column of a 1776 w2 = _mm256_set1_pd(work[j*9+ 6]); 1777 a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0); 1778 a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1); 1779 a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2); 1780 1781 // eigth column of a 1782 w3 = _mm256_set1_pd(work[j*9+ 7]); 1783 a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0); 1784 a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1); 1785 a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2); 1786 1787 // ninth column of a 1788 w0 = _mm256_set1_pd(work[j*9+ 8]); 1789 a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0); 1790 a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1); 1791 a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2); 1792 } 1793 1794 _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2); 1795 1796 v += n*bs2; 1797 if (!usecprow) z += bs; 1798 } 1799 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1800 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1801 ierr = PetscLogFlops(162.0*a->nz);CHKERRQ(ierr); 1802 PetscFunctionReturn(0); 1803 } 1804 #endif 1805 1806 PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A,Vec xx,Vec yy,Vec zz) 1807 { 1808 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1809 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11; 1810 const PetscScalar *x,*xb; 1811 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,*yarray,*zarray; 1812 const MatScalar *v; 1813 PetscErrorCode ierr; 1814 PetscInt mbs = a->mbs,i,j,n; 1815 const PetscInt *idx,*ii,*ridx = NULL; 1816 PetscBool usecprow=a->compressedrow.use; 1817 1818 PetscFunctionBegin; 1819 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1820 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1821 1822 idx = a->j; 1823 v = a->a; 1824 if (usecprow) { 1825 if (zz != yy) { 1826 ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1827 } 1828 mbs = a->compressedrow.nrows; 1829 ii = a->compressedrow.i; 1830 ridx = a->compressedrow.rindex; 1831 } else { 1832 ii = a->i; 1833 y = yarray; 1834 z = zarray; 1835 } 1836 1837 for (i=0; i<mbs; i++) { 1838 n = ii[1] - ii[0]; ii++; 1839 if (usecprow) { 1840 z = zarray + 11*ridx[i]; 1841 y = yarray + 11*ridx[i]; 1842 } 1843 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 1844 sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10]; 1845 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1846 PetscPrefetchBlock(v+121*n,121*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1847 for (j=0; j<n; j++) { 1848 xb = x + 11*(*idx++); 1849 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; 1850 sum1 += v[ 0]*x1 + v[ 11]*x2 + v[2*11]*x3 + v[3*11]*x4 + v[4*11]*x5 + v[5*11]*x6 + v[6*11]*x7+ v[7*11]*x8 + v[8*11]*x9 + v[9*11]*x10 + v[10*11]*x11; 1851 sum2 += v[1+0]*x1 + v[1+11]*x2 + v[1+2*11]*x3 + v[1+3*11]*x4 + v[1+4*11]*x5 + v[1+5*11]*x6 + v[1+6*11]*x7+ v[1+7*11]*x8 + v[1+8*11]*x9 + v[1+9*11]*x10 + v[1+10*11]*x11; 1852 sum3 += v[2+0]*x1 + v[2+11]*x2 + v[2+2*11]*x3 + v[2+3*11]*x4 + v[2+4*11]*x5 + v[2+5*11]*x6 + v[2+6*11]*x7+ v[2+7*11]*x8 + v[2+8*11]*x9 + v[2+9*11]*x10 + v[2+10*11]*x11; 1853 sum4 += v[3+0]*x1 + v[3+11]*x2 + v[3+2*11]*x3 + v[3+3*11]*x4 + v[3+4*11]*x5 + v[3+5*11]*x6 + v[3+6*11]*x7+ v[3+7*11]*x8 + v[3+8*11]*x9 + v[3+9*11]*x10 + v[3+10*11]*x11; 1854 sum5 += v[4+0]*x1 + v[4+11]*x2 + v[4+2*11]*x3 + v[4+3*11]*x4 + v[4+4*11]*x5 + v[4+5*11]*x6 + v[4+6*11]*x7+ v[4+7*11]*x8 + v[4+8*11]*x9 + v[4+9*11]*x10 + v[4+10*11]*x11; 1855 sum6 += v[5+0]*x1 + v[5+11]*x2 + v[5+2*11]*x3 + v[5+3*11]*x4 + v[5+4*11]*x5 + v[5+5*11]*x6 + v[5+6*11]*x7+ v[5+7*11]*x8 + v[5+8*11]*x9 + v[5+9*11]*x10 + v[5+10*11]*x11; 1856 sum7 += v[6+0]*x1 + v[6+11]*x2 + v[6+2*11]*x3 + v[6+3*11]*x4 + v[6+4*11]*x5 + v[6+5*11]*x6 + v[6+6*11]*x7+ v[6+7*11]*x8 + v[6+8*11]*x9 + v[6+9*11]*x10 + v[6+10*11]*x11; 1857 sum8 += v[7+0]*x1 + v[7+11]*x2 + v[7+2*11]*x3 + v[7+3*11]*x4 + v[7+4*11]*x5 + v[7+5*11]*x6 + v[7+6*11]*x7+ v[7+7*11]*x8 + v[7+8*11]*x9 + v[7+9*11]*x10 + v[7+10*11]*x11; 1858 sum9 += v[8+0]*x1 + v[8+11]*x2 + v[8+2*11]*x3 + v[8+3*11]*x4 + v[8+4*11]*x5 + v[8+5*11]*x6 + v[8+6*11]*x7+ v[8+7*11]*x8 + v[8+8*11]*x9 + v[8+9*11]*x10 + v[8+10*11]*x11; 1859 sum10 += v[9+0]*x1 + v[9+11]*x2 + v[9+2*11]*x3 + v[9+3*11]*x4 + v[9+4*11]*x5 + v[9+5*11]*x6 + v[9+6*11]*x7+ v[9+7*11]*x8 + v[9+8*11]*x9 + v[9+9*11]*x10 + v[9+10*11]*x11; 1860 sum11 += v[10+0]*x1 + v[10+11]*x2 + v[10+2*11]*x3 + v[10+3*11]*x4 + v[10+4*11]*x5 + v[10+5*11]*x6 + v[10+6*11]*x7+ v[10+7*11]*x8 + v[10+8*11]*x9 + v[10+9*11]*x10 + v[10+10*11]*x11; 1861 v += 121; 1862 } 1863 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1864 z[7] = sum6; z[8] = sum7; z[9] = sum8; z[10] = sum9; z[11] = sum10; 1865 if (!usecprow) { 1866 z += 11; y += 11; 1867 } 1868 } 1869 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1870 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1871 ierr = PetscLogFlops(242.0*a->nz);CHKERRQ(ierr); 1872 PetscFunctionReturn(0); 1873 } 1874 1875 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1876 { 1877 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1878 PetscScalar *z = 0,*work,*workt,*zarray; 1879 const PetscScalar *x,*xb; 1880 const MatScalar *v; 1881 PetscErrorCode ierr; 1882 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 1883 PetscInt ncols,k; 1884 const PetscInt *ridx = NULL,*idx,*ii; 1885 PetscBool usecprow = a->compressedrow.use; 1886 1887 PetscFunctionBegin; 1888 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 1889 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1890 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1891 1892 idx = a->j; 1893 v = a->a; 1894 if (usecprow) { 1895 mbs = a->compressedrow.nrows; 1896 ii = a->compressedrow.i; 1897 ridx = a->compressedrow.rindex; 1898 } else { 1899 mbs = a->mbs; 1900 ii = a->i; 1901 z = zarray; 1902 } 1903 1904 if (!a->mult_work) { 1905 k = PetscMax(A->rmap->n,A->cmap->n); 1906 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 1907 } 1908 work = a->mult_work; 1909 for (i=0; i<mbs; i++) { 1910 n = ii[1] - ii[0]; ii++; 1911 ncols = n*bs; 1912 workt = work; 1913 for (j=0; j<n; j++) { 1914 xb = x + bs*(*idx++); 1915 for (k=0; k<bs; k++) workt[k] = xb[k]; 1916 workt += bs; 1917 } 1918 if (usecprow) z = zarray + bs*ridx[i]; 1919 PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1920 /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */ 1921 v += n*bs2; 1922 if (!usecprow) z += bs; 1923 } 1924 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1925 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1926 ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr); 1927 PetscFunctionReturn(0); 1928 } 1929 1930 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 1931 { 1932 PetscScalar zero = 0.0; 1933 PetscErrorCode ierr; 1934 1935 PetscFunctionBegin; 1936 ierr = VecSet(zz,zero);CHKERRQ(ierr); 1937 ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 1938 PetscFunctionReturn(0); 1939 } 1940 1941 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 1942 { 1943 PetscScalar zero = 0.0; 1944 PetscErrorCode ierr; 1945 1946 PetscFunctionBegin; 1947 ierr = VecSet(zz,zero);CHKERRQ(ierr); 1948 ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 1949 PetscFunctionReturn(0); 1950 } 1951 1952 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 1953 { 1954 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1955 PetscScalar *z,x1,x2,x3,x4,x5; 1956 const PetscScalar *x,*xb = NULL; 1957 const MatScalar *v; 1958 PetscErrorCode ierr; 1959 PetscInt mbs,i,rval,bs=A->rmap->bs,j,n; 1960 const PetscInt *idx,*ii,*ib,*ridx = NULL; 1961 Mat_CompressedRow cprow = a->compressedrow; 1962 PetscBool usecprow = cprow.use; 1963 1964 PetscFunctionBegin; 1965 if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); } 1966 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1967 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1968 1969 idx = a->j; 1970 v = a->a; 1971 if (usecprow) { 1972 mbs = cprow.nrows; 1973 ii = cprow.i; 1974 ridx = cprow.rindex; 1975 } else { 1976 mbs=a->mbs; 1977 ii = a->i; 1978 xb = x; 1979 } 1980 1981 switch (bs) { 1982 case 1: 1983 for (i=0; i<mbs; i++) { 1984 if (usecprow) xb = x + ridx[i]; 1985 x1 = xb[0]; 1986 ib = idx + ii[0]; 1987 n = ii[1] - ii[0]; ii++; 1988 for (j=0; j<n; j++) { 1989 rval = ib[j]; 1990 z[rval] += PetscConj(*v) * x1; 1991 v++; 1992 } 1993 if (!usecprow) xb++; 1994 } 1995 break; 1996 case 2: 1997 for (i=0; i<mbs; i++) { 1998 if (usecprow) xb = x + 2*ridx[i]; 1999 x1 = xb[0]; x2 = xb[1]; 2000 ib = idx + ii[0]; 2001 n = ii[1] - ii[0]; ii++; 2002 for (j=0; j<n; j++) { 2003 rval = ib[j]*2; 2004 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2; 2005 z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2; 2006 v += 4; 2007 } 2008 if (!usecprow) xb += 2; 2009 } 2010 break; 2011 case 3: 2012 for (i=0; i<mbs; i++) { 2013 if (usecprow) xb = x + 3*ridx[i]; 2014 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2015 ib = idx + ii[0]; 2016 n = ii[1] - ii[0]; ii++; 2017 for (j=0; j<n; j++) { 2018 rval = ib[j]*3; 2019 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3; 2020 z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3; 2021 z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3; 2022 v += 9; 2023 } 2024 if (!usecprow) xb += 3; 2025 } 2026 break; 2027 case 4: 2028 for (i=0; i<mbs; i++) { 2029 if (usecprow) xb = x + 4*ridx[i]; 2030 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 2031 ib = idx + ii[0]; 2032 n = ii[1] - ii[0]; ii++; 2033 for (j=0; j<n; j++) { 2034 rval = ib[j]*4; 2035 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4; 2036 z[rval++] += PetscConj(v[4])*x1 + PetscConj(v[5])*x2 + PetscConj(v[6])*x3 + PetscConj(v[7])*x4; 2037 z[rval++] += PetscConj(v[8])*x1 + PetscConj(v[9])*x2 + PetscConj(v[10])*x3 + PetscConj(v[11])*x4; 2038 z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4; 2039 v += 16; 2040 } 2041 if (!usecprow) xb += 4; 2042 } 2043 break; 2044 case 5: 2045 for (i=0; i<mbs; i++) { 2046 if (usecprow) xb = x + 5*ridx[i]; 2047 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2048 x4 = xb[3]; x5 = xb[4]; 2049 ib = idx + ii[0]; 2050 n = ii[1] - ii[0]; ii++; 2051 for (j=0; j<n; j++) { 2052 rval = ib[j]*5; 2053 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4 + PetscConj(v[4])*x5; 2054 z[rval++] += PetscConj(v[5])*x1 + PetscConj(v[6])*x2 + PetscConj(v[7])*x3 + PetscConj(v[8])*x4 + PetscConj(v[9])*x5; 2055 z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5; 2056 z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5; 2057 z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5; 2058 v += 25; 2059 } 2060 if (!usecprow) xb += 5; 2061 } 2062 break; 2063 default: /* block sizes larger than 5 by 5 are handled by BLAS */ 2064 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet"); 2065 #if 0 2066 { 2067 PetscInt ncols,k,bs2=a->bs2; 2068 PetscScalar *work,*workt,zb; 2069 const PetscScalar *xtmp; 2070 if (!a->mult_work) { 2071 k = PetscMax(A->rmap->n,A->cmap->n); 2072 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 2073 } 2074 work = a->mult_work; 2075 xtmp = x; 2076 for (i=0; i<mbs; i++) { 2077 n = ii[1] - ii[0]; ii++; 2078 ncols = n*bs; 2079 ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 2080 if (usecprow) xtmp = x + bs*ridx[i]; 2081 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 2082 /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */ 2083 v += n*bs2; 2084 if (!usecprow) xtmp += bs; 2085 workt = work; 2086 for (j=0; j<n; j++) { 2087 zb = z + bs*(*idx++); 2088 for (k=0; k<bs; k++) zb[k] += workt[k] ; 2089 workt += bs; 2090 } 2091 } 2092 } 2093 #endif 2094 } 2095 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 2096 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 2097 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 2098 PetscFunctionReturn(0); 2099 } 2100 2101 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 2102 { 2103 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2104 PetscScalar *zb,*z,x1,x2,x3,x4,x5; 2105 const PetscScalar *x,*xb = 0; 2106 const MatScalar *v; 2107 PetscErrorCode ierr; 2108 PetscInt mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2; 2109 const PetscInt *idx,*ii,*ib,*ridx = NULL; 2110 Mat_CompressedRow cprow = a->compressedrow; 2111 PetscBool usecprow=cprow.use; 2112 2113 PetscFunctionBegin; 2114 if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); } 2115 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 2116 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 2117 2118 idx = a->j; 2119 v = a->a; 2120 if (usecprow) { 2121 mbs = cprow.nrows; 2122 ii = cprow.i; 2123 ridx = cprow.rindex; 2124 } else { 2125 mbs=a->mbs; 2126 ii = a->i; 2127 xb = x; 2128 } 2129 2130 switch (bs) { 2131 case 1: 2132 for (i=0; i<mbs; i++) { 2133 if (usecprow) xb = x + ridx[i]; 2134 x1 = xb[0]; 2135 ib = idx + ii[0]; 2136 n = ii[1] - ii[0]; ii++; 2137 for (j=0; j<n; j++) { 2138 rval = ib[j]; 2139 z[rval] += *v * x1; 2140 v++; 2141 } 2142 if (!usecprow) xb++; 2143 } 2144 break; 2145 case 2: 2146 for (i=0; i<mbs; i++) { 2147 if (usecprow) xb = x + 2*ridx[i]; 2148 x1 = xb[0]; x2 = xb[1]; 2149 ib = idx + ii[0]; 2150 n = ii[1] - ii[0]; ii++; 2151 for (j=0; j<n; j++) { 2152 rval = ib[j]*2; 2153 z[rval++] += v[0]*x1 + v[1]*x2; 2154 z[rval++] += v[2]*x1 + v[3]*x2; 2155 v += 4; 2156 } 2157 if (!usecprow) xb += 2; 2158 } 2159 break; 2160 case 3: 2161 for (i=0; i<mbs; i++) { 2162 if (usecprow) xb = x + 3*ridx[i]; 2163 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2164 ib = idx + ii[0]; 2165 n = ii[1] - ii[0]; ii++; 2166 for (j=0; j<n; j++) { 2167 rval = ib[j]*3; 2168 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3; 2169 z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3; 2170 z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3; 2171 v += 9; 2172 } 2173 if (!usecprow) xb += 3; 2174 } 2175 break; 2176 case 4: 2177 for (i=0; i<mbs; i++) { 2178 if (usecprow) xb = x + 4*ridx[i]; 2179 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 2180 ib = idx + ii[0]; 2181 n = ii[1] - ii[0]; ii++; 2182 for (j=0; j<n; j++) { 2183 rval = ib[j]*4; 2184 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 2185 z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 2186 z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 2187 z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 2188 v += 16; 2189 } 2190 if (!usecprow) xb += 4; 2191 } 2192 break; 2193 case 5: 2194 for (i=0; i<mbs; i++) { 2195 if (usecprow) xb = x + 5*ridx[i]; 2196 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2197 x4 = xb[3]; x5 = xb[4]; 2198 ib = idx + ii[0]; 2199 n = ii[1] - ii[0]; ii++; 2200 for (j=0; j<n; j++) { 2201 rval = ib[j]*5; 2202 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 2203 z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 2204 z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 2205 z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 2206 z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 2207 v += 25; 2208 } 2209 if (!usecprow) xb += 5; 2210 } 2211 break; 2212 default: { /* block sizes larger then 5 by 5 are handled by BLAS */ 2213 PetscInt ncols,k; 2214 PetscScalar *work,*workt; 2215 const PetscScalar *xtmp; 2216 if (!a->mult_work) { 2217 k = PetscMax(A->rmap->n,A->cmap->n); 2218 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 2219 } 2220 work = a->mult_work; 2221 xtmp = x; 2222 for (i=0; i<mbs; i++) { 2223 n = ii[1] - ii[0]; ii++; 2224 ncols = n*bs; 2225 ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 2226 if (usecprow) xtmp = x + bs*ridx[i]; 2227 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 2228 /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */ 2229 v += n*bs2; 2230 if (!usecprow) xtmp += bs; 2231 workt = work; 2232 for (j=0; j<n; j++) { 2233 zb = z + bs*(*idx++); 2234 for (k=0; k<bs; k++) zb[k] += workt[k]; 2235 workt += bs; 2236 } 2237 } 2238 } 2239 } 2240 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 2241 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 2242 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 2243 PetscFunctionReturn(0); 2244 } 2245 2246 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha) 2247 { 2248 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data; 2249 PetscInt totalnz = a->bs2*a->nz; 2250 PetscScalar oalpha = alpha; 2251 PetscErrorCode ierr; 2252 PetscBLASInt one = 1,tnz; 2253 2254 PetscFunctionBegin; 2255 ierr = PetscBLASIntCast(totalnz,&tnz);CHKERRQ(ierr); 2256 PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one)); 2257 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 2258 PetscFunctionReturn(0); 2259 } 2260 2261 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm) 2262 { 2263 PetscErrorCode ierr; 2264 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2265 MatScalar *v = a->a; 2266 PetscReal sum = 0.0; 2267 PetscInt i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1; 2268 2269 PetscFunctionBegin; 2270 if (type == NORM_FROBENIUS) { 2271 #if defined(PETSC_USE_REAL___FP16) 2272 PetscBLASInt one = 1,cnt = bs2*nz; 2273 *norm = BLASnrm2_(&cnt,v,&one); 2274 #else 2275 for (i=0; i<bs2*nz; i++) { 2276 sum += PetscRealPart(PetscConj(*v)*(*v)); v++; 2277 } 2278 #endif 2279 *norm = PetscSqrtReal(sum); 2280 ierr = PetscLogFlops(2*bs2*nz);CHKERRQ(ierr); 2281 } else if (type == NORM_1) { /* maximum column sum */ 2282 PetscReal *tmp; 2283 PetscInt *bcol = a->j; 2284 ierr = PetscCalloc1(A->cmap->n+1,&tmp);CHKERRQ(ierr); 2285 for (i=0; i<nz; i++) { 2286 for (j=0; j<bs; j++) { 2287 k1 = bs*(*bcol) + j; /* column index */ 2288 for (k=0; k<bs; k++) { 2289 tmp[k1] += PetscAbsScalar(*v); v++; 2290 } 2291 } 2292 bcol++; 2293 } 2294 *norm = 0.0; 2295 for (j=0; j<A->cmap->n; j++) { 2296 if (tmp[j] > *norm) *norm = tmp[j]; 2297 } 2298 ierr = PetscFree(tmp);CHKERRQ(ierr); 2299 ierr = PetscLogFlops(PetscMax(bs2*nz-1,0));CHKERRQ(ierr); 2300 } else if (type == NORM_INFINITY) { /* maximum row sum */ 2301 *norm = 0.0; 2302 for (k=0; k<bs; k++) { 2303 for (j=0; j<a->mbs; j++) { 2304 v = a->a + bs2*a->i[j] + k; 2305 sum = 0.0; 2306 for (i=0; i<a->i[j+1]-a->i[j]; i++) { 2307 for (k1=0; k1<bs; k1++) { 2308 sum += PetscAbsScalar(*v); 2309 v += bs; 2310 } 2311 } 2312 if (sum > *norm) *norm = sum; 2313 } 2314 } 2315 ierr = PetscLogFlops(PetscMax(bs2*nz-1,0));CHKERRQ(ierr); 2316 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet"); 2317 PetscFunctionReturn(0); 2318 } 2319 2320 2321 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg) 2322 { 2323 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data; 2324 PetscErrorCode ierr; 2325 2326 PetscFunctionBegin; 2327 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 2328 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 2329 *flg = PETSC_FALSE; 2330 PetscFunctionReturn(0); 2331 } 2332 2333 /* if the a->i are the same */ 2334 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr); 2335 if (!*flg) PetscFunctionReturn(0); 2336 2337 /* if a->j are the same */ 2338 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr); 2339 if (!*flg) PetscFunctionReturn(0); 2340 2341 /* if a->a are the same */ 2342 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 2343 PetscFunctionReturn(0); 2344 2345 } 2346 2347 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v) 2348 { 2349 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2350 PetscErrorCode ierr; 2351 PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 2352 PetscScalar *x,zero = 0.0; 2353 MatScalar *aa,*aa_j; 2354 2355 PetscFunctionBegin; 2356 if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 2357 bs = A->rmap->bs; 2358 aa = a->a; 2359 ai = a->i; 2360 aj = a->j; 2361 ambs = a->mbs; 2362 bs2 = a->bs2; 2363 2364 ierr = VecSet(v,zero);CHKERRQ(ierr); 2365 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 2366 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 2367 if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 2368 for (i=0; i<ambs; i++) { 2369 for (j=ai[i]; j<ai[i+1]; j++) { 2370 if (aj[j] == i) { 2371 row = i*bs; 2372 aa_j = aa+j*bs2; 2373 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 2374 break; 2375 } 2376 } 2377 } 2378 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 2379 PetscFunctionReturn(0); 2380 } 2381 2382 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr) 2383 { 2384 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2385 const PetscScalar *l,*r,*li,*ri; 2386 PetscScalar x; 2387 MatScalar *aa, *v; 2388 PetscErrorCode ierr; 2389 PetscInt i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai; 2390 const PetscInt *ai,*aj; 2391 2392 PetscFunctionBegin; 2393 ai = a->i; 2394 aj = a->j; 2395 aa = a->a; 2396 m = A->rmap->n; 2397 n = A->cmap->n; 2398 bs = A->rmap->bs; 2399 mbs = a->mbs; 2400 bs2 = a->bs2; 2401 if (ll) { 2402 ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr); 2403 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 2404 if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 2405 for (i=0; i<mbs; i++) { /* for each block row */ 2406 M = ai[i+1] - ai[i]; 2407 li = l + i*bs; 2408 v = aa + bs2*ai[i]; 2409 for (j=0; j<M; j++) { /* for each block */ 2410 for (k=0; k<bs2; k++) { 2411 (*v++) *= li[k%bs]; 2412 } 2413 } 2414 } 2415 ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr); 2416 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2417 } 2418 2419 if (rr) { 2420 ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr); 2421 ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr); 2422 if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length"); 2423 for (i=0; i<mbs; i++) { /* for each block row */ 2424 iai = ai[i]; 2425 M = ai[i+1] - iai; 2426 v = aa + bs2*iai; 2427 for (j=0; j<M; j++) { /* for each block */ 2428 ri = r + bs*aj[iai+j]; 2429 for (k=0; k<bs; k++) { 2430 x = ri[k]; 2431 for (tmp=0; tmp<bs; tmp++) v[tmp] *= x; 2432 v += bs; 2433 } 2434 } 2435 } 2436 ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr); 2437 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2438 } 2439 PetscFunctionReturn(0); 2440 } 2441 2442 2443 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info) 2444 { 2445 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2446 2447 PetscFunctionBegin; 2448 info->block_size = a->bs2; 2449 info->nz_allocated = a->bs2*a->maxnz; 2450 info->nz_used = a->bs2*a->nz; 2451 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 2452 info->assemblies = A->num_ass; 2453 info->mallocs = A->info.mallocs; 2454 info->memory = ((PetscObject)A)->mem; 2455 if (A->factortype) { 2456 info->fill_ratio_given = A->info.fill_ratio_given; 2457 info->fill_ratio_needed = A->info.fill_ratio_needed; 2458 info->factor_mallocs = A->info.factor_mallocs; 2459 } else { 2460 info->fill_ratio_given = 0; 2461 info->fill_ratio_needed = 0; 2462 info->factor_mallocs = 0; 2463 } 2464 PetscFunctionReturn(0); 2465 } 2466 2467 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) 2468 { 2469 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2470 PetscErrorCode ierr; 2471 2472 PetscFunctionBegin; 2473 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 2474 PetscFunctionReturn(0); 2475 } 2476