1 2 #include <../src/mat/impls/baij/seq/baij.h> 3 #include <petsc/private/kernels/blockinvert.h> 4 #include <petscbt.h> 5 #include <petscblaslapack.h> 6 7 #if defined(PETSC_HAVE_IMMINTRIN_H) 8 #include <immintrin.h> 9 #endif 10 11 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 12 { 13 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 14 PetscErrorCode ierr; 15 PetscInt row,i,j,k,l,m,n,*nidx,isz,val,ival; 16 const PetscInt *idx; 17 PetscInt start,end,*ai,*aj,bs,*nidx2; 18 PetscBT table; 19 20 PetscFunctionBegin; 21 m = a->mbs; 22 ai = a->i; 23 aj = a->j; 24 bs = A->rmap->bs; 25 26 if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 27 28 ierr = PetscBTCreate(m,&table);CHKERRQ(ierr); 29 ierr = PetscMalloc1(m+1,&nidx);CHKERRQ(ierr); 30 ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr); 31 32 for (i=0; i<is_max; i++) { 33 /* Initialise the two local arrays */ 34 isz = 0; 35 ierr = PetscBTMemzero(m,table);CHKERRQ(ierr); 36 37 /* Extract the indices, assume there can be duplicate entries */ 38 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 39 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 40 41 /* Enter these into the temp arrays i.e mark table[row], enter row into new index */ 42 for (j=0; j<n; ++j) { 43 ival = idx[j]/bs; /* convert the indices into block indices */ 44 if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 45 if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival; 46 } 47 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 48 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 49 50 k = 0; 51 for (j=0; j<ov; j++) { /* for each overlap*/ 52 n = isz; 53 for (; k<n; k++) { /* do only those rows in nidx[k], which are not done yet */ 54 row = nidx[k]; 55 start = ai[row]; 56 end = ai[row+1]; 57 for (l = start; l<end; l++) { 58 val = aj[l]; 59 if (!PetscBTLookupSet(table,val)) nidx[isz++] = val; 60 } 61 } 62 } 63 /* expand the Index Set */ 64 for (j=0; j<isz; j++) { 65 for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k; 66 } 67 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 68 } 69 ierr = PetscBTDestroy(&table);CHKERRQ(ierr); 70 ierr = PetscFree(nidx);CHKERRQ(ierr); 71 ierr = PetscFree(nidx2);CHKERRQ(ierr); 72 PetscFunctionReturn(0); 73 } 74 75 PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 76 { 77 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c; 78 PetscErrorCode ierr; 79 PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens; 80 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 81 const PetscInt *irow,*icol; 82 PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 83 PetscInt *aj = a->j,*ai = a->i; 84 MatScalar *mat_a; 85 Mat C; 86 PetscBool flag; 87 88 PetscFunctionBegin; 89 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 90 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 91 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 92 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 93 94 ierr = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr); 95 ssmap = smap; 96 ierr = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr); 97 for (i=0; i<ncols; i++) smap[icol[i]] = i+1; 98 /* determine lens of each row */ 99 for (i=0; i<nrows; i++) { 100 kstart = ai[irow[i]]; 101 kend = kstart + a->ilen[irow[i]]; 102 lens[i] = 0; 103 for (k=kstart; k<kend; k++) { 104 if (ssmap[aj[k]]) lens[i]++; 105 } 106 } 107 /* Create and fill new matrix */ 108 if (scall == MAT_REUSE_MATRIX) { 109 c = (Mat_SeqBAIJ*)((*B)->data); 110 111 if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 112 ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr); 113 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 114 ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr); 115 C = *B; 116 } else { 117 ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr); 118 ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 119 ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 120 ierr = MatSeqBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr); 121 } 122 c = (Mat_SeqBAIJ*)(C->data); 123 for (i=0; i<nrows; i++) { 124 row = irow[i]; 125 kstart = ai[row]; 126 kend = kstart + a->ilen[row]; 127 mat_i = c->i[i]; 128 mat_j = c->j + mat_i; 129 mat_a = c->a + mat_i*bs2; 130 mat_ilen = c->ilen + i; 131 for (k=kstart; k<kend; k++) { 132 if ((tcol=ssmap[a->j[k]])) { 133 *mat_j++ = tcol - 1; 134 ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 135 mat_a += bs2; 136 (*mat_ilen)++; 137 } 138 } 139 } 140 /* sort */ 141 { 142 MatScalar *work; 143 ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr); 144 for (i=0; i<nrows; i++) { 145 PetscInt ilen; 146 mat_i = c->i[i]; 147 mat_j = c->j + mat_i; 148 mat_a = c->a + mat_i*bs2; 149 ilen = c->ilen[i]; 150 ierr = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr); 151 } 152 ierr = PetscFree(work);CHKERRQ(ierr); 153 } 154 155 /* Free work space */ 156 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 157 ierr = PetscFree(smap);CHKERRQ(ierr); 158 ierr = PetscFree(lens);CHKERRQ(ierr); 159 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 160 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 161 162 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 163 *B = C; 164 PetscFunctionReturn(0); 165 } 166 167 PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 168 { 169 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 170 IS is1,is2; 171 PetscErrorCode ierr; 172 PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs,j; 173 const PetscInt *irow,*icol; 174 175 PetscFunctionBegin; 176 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 177 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 178 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 179 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 180 181 /* Verify if the indices corespond to each element in a block 182 and form the IS with compressed IS */ 183 maxmnbs = PetscMax(a->mbs,a->nbs); 184 ierr = PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);CHKERRQ(ierr); 185 ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr); 186 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 187 for (i=0; i<a->mbs; i++) { 188 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks"); 189 } 190 count = 0; 191 for (i=0; i<nrows; i++) { 192 j = irow[i] / bs; 193 if ((vary[j]--)==bs) iary[count++] = j; 194 } 195 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr); 196 197 ierr = PetscMemzero(vary,(a->nbs)*sizeof(PetscInt));CHKERRQ(ierr); 198 for (i=0; i<ncols; i++) vary[icol[i]/bs]++; 199 for (i=0; i<a->nbs; i++) { 200 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc"); 201 } 202 count = 0; 203 for (i=0; i<ncols; i++) { 204 j = icol[i] / bs; 205 if ((vary[j]--)==bs) iary[count++] = j; 206 } 207 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr); 208 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 209 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 210 ierr = PetscFree2(vary,iary);CHKERRQ(ierr); 211 212 ierr = MatCreateSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr); 213 ierr = ISDestroy(&is1);CHKERRQ(ierr); 214 ierr = ISDestroy(&is2);CHKERRQ(ierr); 215 PetscFunctionReturn(0); 216 } 217 218 PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C) 219 { 220 PetscErrorCode ierr; 221 Mat_SeqBAIJ *c = (Mat_SeqBAIJ*)C->data; 222 Mat_SubSppt *submatj = c->submatis1; 223 224 PetscFunctionBegin; 225 ierr = submatj->destroy(C);CHKERRQ(ierr); 226 ierr = MatDestroySubMatrix_Private(submatj);CHKERRQ(ierr); 227 PetscFunctionReturn(0); 228 } 229 230 PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n,Mat *mat[]) 231 { 232 PetscErrorCode ierr; 233 PetscInt i; 234 Mat C; 235 Mat_SeqBAIJ *c; 236 Mat_SubSppt *submatj; 237 238 PetscFunctionBegin; 239 for (i=0; i<n; i++) { 240 C = (*mat)[i]; 241 c = (Mat_SeqBAIJ*)C->data; 242 submatj = c->submatis1; 243 if (submatj) { 244 ierr = submatj->destroy(C);CHKERRQ(ierr); 245 ierr = MatDestroySubMatrix_Private(submatj);CHKERRQ(ierr); 246 ierr = PetscLayoutDestroy(&C->rmap);CHKERRQ(ierr); 247 ierr = PetscLayoutDestroy(&C->cmap);CHKERRQ(ierr); 248 ierr = PetscHeaderDestroy(&C);CHKERRQ(ierr); 249 } else { 250 ierr = MatDestroy(&C);CHKERRQ(ierr); 251 } 252 } 253 ierr = PetscFree(*mat);CHKERRQ(ierr); 254 PetscFunctionReturn(0); 255 } 256 257 PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 258 { 259 PetscErrorCode ierr; 260 PetscInt i; 261 262 PetscFunctionBegin; 263 if (scall == MAT_INITIAL_MATRIX) { 264 ierr = PetscCalloc1(n+1,B);CHKERRQ(ierr); 265 } 266 267 for (i=0; i<n; i++) { 268 ierr = MatCreateSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr); 269 } 270 PetscFunctionReturn(0); 271 } 272 273 /* -------------------------------------------------------*/ 274 /* Should check that shapes of vectors and matrices match */ 275 /* -------------------------------------------------------*/ 276 277 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz) 278 { 279 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 280 PetscScalar *z,sum; 281 const PetscScalar *x; 282 const MatScalar *v; 283 PetscErrorCode ierr; 284 PetscInt mbs,i,n; 285 const PetscInt *idx,*ii,*ridx=NULL; 286 PetscBool usecprow=a->compressedrow.use; 287 288 PetscFunctionBegin; 289 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 290 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 291 292 if (usecprow) { 293 mbs = a->compressedrow.nrows; 294 ii = a->compressedrow.i; 295 ridx = a->compressedrow.rindex; 296 ierr = PetscMemzero(z,a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 297 } else { 298 mbs = a->mbs; 299 ii = a->i; 300 } 301 302 for (i=0; i<mbs; i++) { 303 n = ii[1] - ii[0]; 304 v = a->a + ii[0]; 305 idx = a->j + ii[0]; 306 ii++; 307 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 308 PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 309 sum = 0.0; 310 PetscSparseDensePlusDot(sum,x,v,idx,n); 311 if (usecprow) { 312 z[ridx[i]] = sum; 313 } else { 314 z[i] = sum; 315 } 316 } 317 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 318 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 319 ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr); 320 PetscFunctionReturn(0); 321 } 322 323 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz) 324 { 325 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 326 PetscScalar *z = 0,sum1,sum2,*zarray; 327 const PetscScalar *x,*xb; 328 PetscScalar x1,x2; 329 const MatScalar *v; 330 PetscErrorCode ierr; 331 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 332 PetscBool usecprow=a->compressedrow.use; 333 334 PetscFunctionBegin; 335 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 336 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 337 338 idx = a->j; 339 v = a->a; 340 if (usecprow) { 341 mbs = a->compressedrow.nrows; 342 ii = a->compressedrow.i; 343 ridx = a->compressedrow.rindex; 344 ierr = PetscMemzero(zarray,2*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 345 } else { 346 mbs = a->mbs; 347 ii = a->i; 348 z = zarray; 349 } 350 351 for (i=0; i<mbs; i++) { 352 n = ii[1] - ii[0]; ii++; 353 sum1 = 0.0; sum2 = 0.0; 354 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 355 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 356 for (j=0; j<n; j++) { 357 xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1]; 358 sum1 += v[0]*x1 + v[2]*x2; 359 sum2 += v[1]*x1 + v[3]*x2; 360 v += 4; 361 } 362 if (usecprow) z = zarray + 2*ridx[i]; 363 z[0] = sum1; z[1] = sum2; 364 if (!usecprow) z += 2; 365 } 366 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 367 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 368 ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr); 369 PetscFunctionReturn(0); 370 } 371 372 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz) 373 { 374 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 375 PetscScalar *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray; 376 const PetscScalar *x,*xb; 377 const MatScalar *v; 378 PetscErrorCode ierr; 379 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 380 PetscBool usecprow=a->compressedrow.use; 381 382 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 383 #pragma disjoint(*v,*z,*xb) 384 #endif 385 386 PetscFunctionBegin; 387 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 388 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 389 390 idx = a->j; 391 v = a->a; 392 if (usecprow) { 393 mbs = a->compressedrow.nrows; 394 ii = a->compressedrow.i; 395 ridx = a->compressedrow.rindex; 396 ierr = PetscMemzero(zarray,3*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 397 } else { 398 mbs = a->mbs; 399 ii = a->i; 400 z = zarray; 401 } 402 403 for (i=0; i<mbs; i++) { 404 n = ii[1] - ii[0]; ii++; 405 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 406 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 407 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 408 for (j=0; j<n; j++) { 409 xb = x + 3*(*idx++); 410 x1 = xb[0]; 411 x2 = xb[1]; 412 x3 = xb[2]; 413 414 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 415 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 416 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 417 v += 9; 418 } 419 if (usecprow) z = zarray + 3*ridx[i]; 420 z[0] = sum1; z[1] = sum2; z[2] = sum3; 421 if (!usecprow) z += 3; 422 } 423 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 424 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 425 ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr); 426 PetscFunctionReturn(0); 427 } 428 429 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz) 430 { 431 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 432 PetscScalar *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray; 433 const PetscScalar *x,*xb; 434 const MatScalar *v; 435 PetscErrorCode ierr; 436 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 437 PetscBool usecprow=a->compressedrow.use; 438 439 PetscFunctionBegin; 440 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 441 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 442 443 idx = a->j; 444 v = a->a; 445 if (usecprow) { 446 mbs = a->compressedrow.nrows; 447 ii = a->compressedrow.i; 448 ridx = a->compressedrow.rindex; 449 ierr = PetscMemzero(zarray,4*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 450 } else { 451 mbs = a->mbs; 452 ii = a->i; 453 z = zarray; 454 } 455 456 for (i=0; i<mbs; i++) { 457 n = ii[1] - ii[0]; 458 ii++; 459 sum1 = 0.0; 460 sum2 = 0.0; 461 sum3 = 0.0; 462 sum4 = 0.0; 463 464 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 465 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 466 for (j=0; j<n; j++) { 467 xb = x + 4*(*idx++); 468 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 469 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 470 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 471 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 472 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 473 v += 16; 474 } 475 if (usecprow) z = zarray + 4*ridx[i]; 476 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 477 if (!usecprow) z += 4; 478 } 479 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 480 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 481 ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr); 482 PetscFunctionReturn(0); 483 } 484 485 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz) 486 { 487 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 488 PetscScalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray; 489 const PetscScalar *xb,*x; 490 const MatScalar *v; 491 PetscErrorCode ierr; 492 const PetscInt *idx,*ii,*ridx=NULL; 493 PetscInt mbs,i,j,n; 494 PetscBool usecprow=a->compressedrow.use; 495 496 PetscFunctionBegin; 497 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 498 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 499 500 idx = a->j; 501 v = a->a; 502 if (usecprow) { 503 mbs = a->compressedrow.nrows; 504 ii = a->compressedrow.i; 505 ridx = a->compressedrow.rindex; 506 ierr = PetscMemzero(zarray,5*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 507 } else { 508 mbs = a->mbs; 509 ii = a->i; 510 z = zarray; 511 } 512 513 for (i=0; i<mbs; i++) { 514 n = ii[1] - ii[0]; ii++; 515 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 516 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 517 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 518 for (j=0; j<n; j++) { 519 xb = x + 5*(*idx++); 520 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 521 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 522 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 523 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 524 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 525 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 526 v += 25; 527 } 528 if (usecprow) z = zarray + 5*ridx[i]; 529 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 530 if (!usecprow) z += 5; 531 } 532 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 533 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 534 ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr); 535 PetscFunctionReturn(0); 536 } 537 538 539 540 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz) 541 { 542 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 543 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6; 544 const PetscScalar *x,*xb; 545 PetscScalar x1,x2,x3,x4,x5,x6,*zarray; 546 const MatScalar *v; 547 PetscErrorCode ierr; 548 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 549 PetscBool usecprow=a->compressedrow.use; 550 551 PetscFunctionBegin; 552 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 553 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 554 555 idx = a->j; 556 v = a->a; 557 if (usecprow) { 558 mbs = a->compressedrow.nrows; 559 ii = a->compressedrow.i; 560 ridx = a->compressedrow.rindex; 561 ierr = PetscMemzero(zarray,6*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 562 } else { 563 mbs = a->mbs; 564 ii = a->i; 565 z = zarray; 566 } 567 568 for (i=0; i<mbs; i++) { 569 n = ii[1] - ii[0]; 570 ii++; 571 sum1 = 0.0; 572 sum2 = 0.0; 573 sum3 = 0.0; 574 sum4 = 0.0; 575 sum5 = 0.0; 576 sum6 = 0.0; 577 578 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 579 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 580 for (j=0; j<n; j++) { 581 xb = x + 6*(*idx++); 582 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 583 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 584 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 585 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 586 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 587 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 588 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 589 v += 36; 590 } 591 if (usecprow) z = zarray + 6*ridx[i]; 592 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 593 if (!usecprow) z += 6; 594 } 595 596 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 597 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 598 ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr); 599 PetscFunctionReturn(0); 600 } 601 602 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz) 603 { 604 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 605 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 606 const PetscScalar *x,*xb; 607 PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray; 608 const MatScalar *v; 609 PetscErrorCode ierr; 610 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 611 PetscBool usecprow=a->compressedrow.use; 612 613 PetscFunctionBegin; 614 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 615 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 616 617 idx = a->j; 618 v = a->a; 619 if (usecprow) { 620 mbs = a->compressedrow.nrows; 621 ii = a->compressedrow.i; 622 ridx = a->compressedrow.rindex; 623 ierr = PetscMemzero(zarray,7*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 624 } else { 625 mbs = a->mbs; 626 ii = a->i; 627 z = zarray; 628 } 629 630 for (i=0; i<mbs; i++) { 631 n = ii[1] - ii[0]; 632 ii++; 633 sum1 = 0.0; 634 sum2 = 0.0; 635 sum3 = 0.0; 636 sum4 = 0.0; 637 sum5 = 0.0; 638 sum6 = 0.0; 639 sum7 = 0.0; 640 641 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 642 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 643 for (j=0; j<n; j++) { 644 xb = x + 7*(*idx++); 645 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 646 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 647 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 648 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 649 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 650 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 651 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 652 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 653 v += 49; 654 } 655 if (usecprow) z = zarray + 7*ridx[i]; 656 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 657 if (!usecprow) z += 7; 658 } 659 660 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 661 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 662 ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr); 663 PetscFunctionReturn(0); 664 } 665 666 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) 667 PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec zz) 668 { 669 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 670 PetscScalar *z = 0,*work,*workt,*zarray; 671 const PetscScalar *x,*xb; 672 const MatScalar *v; 673 PetscErrorCode ierr; 674 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 675 const PetscInt *idx,*ii,*ridx=NULL; 676 PetscInt k; 677 PetscBool usecprow=a->compressedrow.use; 678 679 __m256d a0,a1,a2,a3,a4,a5; 680 __m256d w0,w1,w2,w3; 681 __m256d z0,z1,z2; 682 __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63); 683 684 PetscFunctionBegin; 685 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 686 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 687 688 idx = a->j; 689 v = a->a; 690 if (usecprow) { 691 mbs = a->compressedrow.nrows; 692 ii = a->compressedrow.i; 693 ridx = a->compressedrow.rindex; 694 ierr = PetscMemzero(zarray,bs*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 695 } else { 696 mbs = a->mbs; 697 ii = a->i; 698 z = zarray; 699 } 700 701 if (!a->mult_work) { 702 k = PetscMax(A->rmap->n,A->cmap->n); 703 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 704 } 705 706 work = a->mult_work; 707 for (i=0; i<mbs; i++) { 708 n = ii[1] - ii[0]; ii++; 709 workt = work; 710 for (j=0; j<n; j++) { 711 xb = x + bs*(*idx++); 712 for (k=0; k<bs; k++) workt[k] = xb[k]; 713 workt += bs; 714 } 715 if (usecprow) z = zarray + bs*ridx[i]; 716 717 z0 = _mm256_setzero_pd(); z1 = _mm256_setzero_pd(); z2 = _mm256_setzero_pd(); 718 719 for (j=0; j<n; j++) { 720 // first column of a 721 w0 = _mm256_set1_pd(work[j*9 ]); 722 a0 = _mm256_loadu_pd(&v[j*81 ]); z0 = _mm256_fmadd_pd(a0,w0,z0); 723 a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1); 724 a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2); 725 726 // second column of a 727 w1 = _mm256_set1_pd(work[j*9+ 1]); 728 a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0); 729 a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1); 730 a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2); 731 732 // third column of a 733 w2 = _mm256_set1_pd(work[j*9 +2]); 734 a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0); 735 a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1); 736 a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2); 737 738 // fourth column of a 739 w3 = _mm256_set1_pd(work[j*9+ 3]); 740 a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0); 741 a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1); 742 a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2); 743 744 // fifth column of a 745 w0 = _mm256_set1_pd(work[j*9+ 4]); 746 a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0); 747 a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1); 748 a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2); 749 750 // sixth column of a 751 w1 = _mm256_set1_pd(work[j*9+ 5]); 752 a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0); 753 a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1); 754 a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2); 755 756 // seventh column of a 757 w2 = _mm256_set1_pd(work[j*9+ 6]); 758 a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0); 759 a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1); 760 a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2); 761 762 // eigth column of a 763 w3 = _mm256_set1_pd(work[j*9+ 7]); 764 a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0); 765 a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1); 766 a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2); 767 768 // ninth column of a 769 w0 = _mm256_set1_pd(work[j*9+ 8]); 770 a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0); 771 a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1); 772 a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2); 773 } 774 775 _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2); 776 777 v += n*bs2; 778 if (!usecprow) z += bs; 779 } 780 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 781 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 782 ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr); 783 PetscFunctionReturn(0); 784 } 785 #endif 786 787 PetscErrorCode MatMult_SeqBAIJ_11(Mat A,Vec xx,Vec zz) 788 { 789 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 790 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11; 791 const PetscScalar *x,*xb; 792 PetscScalar *zarray,xv; 793 const MatScalar *v; 794 PetscErrorCode ierr; 795 const PetscInt *ii,*ij=a->j,*idx; 796 PetscInt mbs,i,j,k,n,*ridx=NULL; 797 PetscBool usecprow=a->compressedrow.use; 798 799 PetscFunctionBegin; 800 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 801 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 802 803 v = a->a; 804 if (usecprow) { 805 mbs = a->compressedrow.nrows; 806 ii = a->compressedrow.i; 807 ridx = a->compressedrow.rindex; 808 ierr = PetscMemzero(zarray,11*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 809 } else { 810 mbs = a->mbs; 811 ii = a->i; 812 z = zarray; 813 } 814 815 for (i=0; i<mbs; i++) { 816 n = ii[i+1] - ii[i]; 817 idx = ij + ii[i]; 818 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 819 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; 820 821 for (j=0; j<n; j++) { 822 xb = x + 11*(idx[j]); 823 824 for (k=0; k<11; k++) { 825 xv = xb[k]; 826 sum1 += v[0]*xv; 827 sum2 += v[1]*xv; 828 sum3 += v[2]*xv; 829 sum4 += v[3]*xv; 830 sum5 += v[4]*xv; 831 sum6 += v[5]*xv; 832 sum7 += v[6]*xv; 833 sum8 += v[7]*xv; 834 sum9 += v[8]*xv; 835 sum10 += v[9]*xv; 836 sum11 += v[10]*xv; 837 v += 11; 838 } 839 } 840 if (usecprow) z = zarray + 11*ridx[i]; 841 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 842 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; 843 844 if (!usecprow) z += 11; 845 } 846 847 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 848 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 849 ierr = PetscLogFlops(242.0*a->nz - 11.0*a->nonzerorowcnt);CHKERRQ(ierr); 850 PetscFunctionReturn(0); 851 } 852 853 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */ 854 /* Default MatMult for block size 15 */ 855 856 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz) 857 { 858 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 859 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 860 const PetscScalar *x,*xb; 861 PetscScalar *zarray,xv; 862 const MatScalar *v; 863 PetscErrorCode ierr; 864 const PetscInt *ii,*ij=a->j,*idx; 865 PetscInt mbs,i,j,k,n,*ridx=NULL; 866 PetscBool usecprow=a->compressedrow.use; 867 868 PetscFunctionBegin; 869 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 870 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 871 872 v = a->a; 873 if (usecprow) { 874 mbs = a->compressedrow.nrows; 875 ii = a->compressedrow.i; 876 ridx = a->compressedrow.rindex; 877 ierr = PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 878 } else { 879 mbs = a->mbs; 880 ii = a->i; 881 z = zarray; 882 } 883 884 for (i=0; i<mbs; i++) { 885 n = ii[i+1] - ii[i]; 886 idx = ij + ii[i]; 887 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 888 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 889 890 for (j=0; j<n; j++) { 891 xb = x + 15*(idx[j]); 892 893 for (k=0; k<15; k++) { 894 xv = xb[k]; 895 sum1 += v[0]*xv; 896 sum2 += v[1]*xv; 897 sum3 += v[2]*xv; 898 sum4 += v[3]*xv; 899 sum5 += v[4]*xv; 900 sum6 += v[5]*xv; 901 sum7 += v[6]*xv; 902 sum8 += v[7]*xv; 903 sum9 += v[8]*xv; 904 sum10 += v[9]*xv; 905 sum11 += v[10]*xv; 906 sum12 += v[11]*xv; 907 sum13 += v[12]*xv; 908 sum14 += v[13]*xv; 909 sum15 += v[14]*xv; 910 v += 15; 911 } 912 } 913 if (usecprow) z = zarray + 15*ridx[i]; 914 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 915 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 916 917 if (!usecprow) z += 15; 918 } 919 920 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 921 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 922 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 923 PetscFunctionReturn(0); 924 } 925 926 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */ 927 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz) 928 { 929 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 930 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 931 const PetscScalar *x,*xb; 932 PetscScalar x1,x2,x3,x4,*zarray; 933 const MatScalar *v; 934 PetscErrorCode ierr; 935 const PetscInt *ii,*ij=a->j,*idx; 936 PetscInt mbs,i,j,n,*ridx=NULL; 937 PetscBool usecprow=a->compressedrow.use; 938 939 PetscFunctionBegin; 940 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 941 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 942 943 v = a->a; 944 if (usecprow) { 945 mbs = a->compressedrow.nrows; 946 ii = a->compressedrow.i; 947 ridx = a->compressedrow.rindex; 948 ierr = PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 949 } else { 950 mbs = a->mbs; 951 ii = a->i; 952 z = zarray; 953 } 954 955 for (i=0; i<mbs; i++) { 956 n = ii[i+1] - ii[i]; 957 idx = ij + ii[i]; 958 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 959 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 960 961 for (j=0; j<n; j++) { 962 xb = x + 15*(idx[j]); 963 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 964 965 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 966 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 967 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 968 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 969 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 970 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 971 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 972 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 973 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 974 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 975 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 976 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 977 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 978 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 979 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 980 981 v += 60; 982 983 x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7]; 984 985 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 986 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 987 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 988 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 989 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 990 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 991 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 992 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 993 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 994 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 995 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 996 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 997 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 998 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 999 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 1000 v += 60; 1001 1002 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; 1003 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 1004 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 1005 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 1006 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 1007 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 1008 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 1009 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 1010 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 1011 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 1012 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 1013 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 1014 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 1015 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 1016 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 1017 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 1018 v += 60; 1019 1020 x1 = xb[12]; x2 = xb[13]; x3 = xb[14]; 1021 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3; 1022 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3; 1023 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3; 1024 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3; 1025 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3; 1026 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3; 1027 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3; 1028 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3; 1029 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3; 1030 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3; 1031 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3; 1032 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3; 1033 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3; 1034 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3; 1035 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3; 1036 v += 45; 1037 } 1038 if (usecprow) z = zarray + 15*ridx[i]; 1039 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1040 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1041 1042 if (!usecprow) z += 15; 1043 } 1044 1045 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1046 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1047 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 1048 PetscFunctionReturn(0); 1049 } 1050 1051 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */ 1052 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz) 1053 { 1054 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1055 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1056 const PetscScalar *x,*xb; 1057 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,*zarray; 1058 const MatScalar *v; 1059 PetscErrorCode ierr; 1060 const PetscInt *ii,*ij=a->j,*idx; 1061 PetscInt mbs,i,j,n,*ridx=NULL; 1062 PetscBool usecprow=a->compressedrow.use; 1063 1064 PetscFunctionBegin; 1065 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1066 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1067 1068 v = a->a; 1069 if (usecprow) { 1070 mbs = a->compressedrow.nrows; 1071 ii = a->compressedrow.i; 1072 ridx = a->compressedrow.rindex; 1073 ierr = PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1074 } else { 1075 mbs = a->mbs; 1076 ii = a->i; 1077 z = zarray; 1078 } 1079 1080 for (i=0; i<mbs; i++) { 1081 n = ii[i+1] - ii[i]; 1082 idx = ij + ii[i]; 1083 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1084 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1085 1086 for (j=0; j<n; j++) { 1087 xb = x + 15*(idx[j]); 1088 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1089 x8 = xb[7]; 1090 1091 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8; 1092 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8; 1093 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8; 1094 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8; 1095 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8; 1096 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8; 1097 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8; 1098 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8; 1099 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8; 1100 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8; 1101 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8; 1102 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8; 1103 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8; 1104 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8; 1105 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8; 1106 v += 120; 1107 1108 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14]; 1109 1110 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7; 1111 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7; 1112 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7; 1113 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7; 1114 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7; 1115 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7; 1116 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7; 1117 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7; 1118 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7; 1119 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7; 1120 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7; 1121 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7; 1122 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7; 1123 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7; 1124 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7; 1125 v += 105; 1126 } 1127 if (usecprow) z = zarray + 15*ridx[i]; 1128 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1129 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1130 1131 if (!usecprow) z += 15; 1132 } 1133 1134 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1135 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1136 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 1137 PetscFunctionReturn(0); 1138 } 1139 1140 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */ 1141 1142 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz) 1143 { 1144 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1145 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1146 const PetscScalar *x,*xb; 1147 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray; 1148 const MatScalar *v; 1149 PetscErrorCode ierr; 1150 const PetscInt *ii,*ij=a->j,*idx; 1151 PetscInt mbs,i,j,n,*ridx=NULL; 1152 PetscBool usecprow=a->compressedrow.use; 1153 1154 PetscFunctionBegin; 1155 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1156 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1157 1158 v = a->a; 1159 if (usecprow) { 1160 mbs = a->compressedrow.nrows; 1161 ii = a->compressedrow.i; 1162 ridx = a->compressedrow.rindex; 1163 ierr = PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1164 } else { 1165 mbs = a->mbs; 1166 ii = a->i; 1167 z = zarray; 1168 } 1169 1170 for (i=0; i<mbs; i++) { 1171 n = ii[i+1] - ii[i]; 1172 idx = ij + ii[i]; 1173 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1174 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1175 1176 for (j=0; j<n; j++) { 1177 xb = x + 15*(idx[j]); 1178 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1179 x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14]; 1180 1181 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 1182 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 1183 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 1184 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 1185 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 1186 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 1187 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 1188 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 1189 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 1190 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 1191 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 1192 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 1193 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 1194 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 1195 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 1196 v += 225; 1197 } 1198 if (usecprow) z = zarray + 15*ridx[i]; 1199 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1200 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1201 1202 if (!usecprow) z += 15; 1203 } 1204 1205 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1206 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1207 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 1208 PetscFunctionReturn(0); 1209 } 1210 1211 1212 /* 1213 This will not work with MatScalar == float because it calls the BLAS 1214 */ 1215 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz) 1216 { 1217 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1218 PetscScalar *z = 0,*work,*workt,*zarray; 1219 const PetscScalar *x,*xb; 1220 const MatScalar *v; 1221 PetscErrorCode ierr; 1222 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 1223 const PetscInt *idx,*ii,*ridx=NULL; 1224 PetscInt ncols,k; 1225 PetscBool usecprow=a->compressedrow.use; 1226 1227 PetscFunctionBegin; 1228 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1229 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1230 1231 idx = a->j; 1232 v = a->a; 1233 if (usecprow) { 1234 mbs = a->compressedrow.nrows; 1235 ii = a->compressedrow.i; 1236 ridx = a->compressedrow.rindex; 1237 ierr = PetscMemzero(zarray,bs*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1238 } else { 1239 mbs = a->mbs; 1240 ii = a->i; 1241 z = zarray; 1242 } 1243 1244 if (!a->mult_work) { 1245 k = PetscMax(A->rmap->n,A->cmap->n); 1246 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 1247 } 1248 work = a->mult_work; 1249 for (i=0; i<mbs; i++) { 1250 n = ii[1] - ii[0]; ii++; 1251 ncols = n*bs; 1252 workt = work; 1253 for (j=0; j<n; j++) { 1254 xb = x + bs*(*idx++); 1255 for (k=0; k<bs; k++) workt[k] = xb[k]; 1256 workt += bs; 1257 } 1258 if (usecprow) z = zarray + bs*ridx[i]; 1259 PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z); 1260 /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */ 1261 v += n*bs2; 1262 if (!usecprow) z += bs; 1263 } 1264 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1265 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1266 ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr); 1267 PetscFunctionReturn(0); 1268 } 1269 1270 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 1271 { 1272 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1273 const PetscScalar *x; 1274 PetscScalar *y,*z,sum; 1275 const MatScalar *v; 1276 PetscErrorCode ierr; 1277 PetscInt mbs=a->mbs,i,n,*ridx=NULL; 1278 const PetscInt *idx,*ii; 1279 PetscBool usecprow=a->compressedrow.use; 1280 1281 PetscFunctionBegin; 1282 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1283 ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr); 1284 1285 idx = a->j; 1286 v = a->a; 1287 if (usecprow) { 1288 if (zz != yy) { 1289 ierr = PetscMemcpy(z,y,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1290 } 1291 mbs = a->compressedrow.nrows; 1292 ii = a->compressedrow.i; 1293 ridx = a->compressedrow.rindex; 1294 } else { 1295 ii = a->i; 1296 } 1297 1298 for (i=0; i<mbs; i++) { 1299 n = ii[1] - ii[0]; 1300 ii++; 1301 if (!usecprow) { 1302 sum = y[i]; 1303 } else { 1304 sum = y[ridx[i]]; 1305 } 1306 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1307 PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1308 PetscSparseDensePlusDot(sum,x,v,idx,n); 1309 v += n; 1310 idx += n; 1311 if (usecprow) { 1312 z[ridx[i]] = sum; 1313 } else { 1314 z[i] = sum; 1315 } 1316 } 1317 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1318 ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr); 1319 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1320 PetscFunctionReturn(0); 1321 } 1322 1323 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 1324 { 1325 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1326 PetscScalar *y = 0,*z = 0,sum1,sum2; 1327 const PetscScalar *x,*xb; 1328 PetscScalar x1,x2,*yarray,*zarray; 1329 const MatScalar *v; 1330 PetscErrorCode ierr; 1331 PetscInt mbs = a->mbs,i,n,j; 1332 const PetscInt *idx,*ii,*ridx = NULL; 1333 PetscBool usecprow = a->compressedrow.use; 1334 1335 PetscFunctionBegin; 1336 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1337 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1338 1339 idx = a->j; 1340 v = a->a; 1341 if (usecprow) { 1342 if (zz != yy) { 1343 ierr = PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1344 } 1345 mbs = a->compressedrow.nrows; 1346 ii = a->compressedrow.i; 1347 ridx = a->compressedrow.rindex; 1348 } else { 1349 ii = a->i; 1350 y = yarray; 1351 z = zarray; 1352 } 1353 1354 for (i=0; i<mbs; i++) { 1355 n = ii[1] - ii[0]; ii++; 1356 if (usecprow) { 1357 z = zarray + 2*ridx[i]; 1358 y = yarray + 2*ridx[i]; 1359 } 1360 sum1 = y[0]; sum2 = y[1]; 1361 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1362 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1363 for (j=0; j<n; j++) { 1364 xb = x + 2*(*idx++); 1365 x1 = xb[0]; 1366 x2 = xb[1]; 1367 1368 sum1 += v[0]*x1 + v[2]*x2; 1369 sum2 += v[1]*x1 + v[3]*x2; 1370 v += 4; 1371 } 1372 z[0] = sum1; z[1] = sum2; 1373 if (!usecprow) { 1374 z += 2; y += 2; 1375 } 1376 } 1377 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1378 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1379 ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr); 1380 PetscFunctionReturn(0); 1381 } 1382 1383 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 1384 { 1385 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1386 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray; 1387 const PetscScalar *x,*xb; 1388 const MatScalar *v; 1389 PetscErrorCode ierr; 1390 PetscInt mbs = a->mbs,i,j,n; 1391 const PetscInt *idx,*ii,*ridx = NULL; 1392 PetscBool usecprow = a->compressedrow.use; 1393 1394 PetscFunctionBegin; 1395 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1396 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1397 1398 idx = a->j; 1399 v = a->a; 1400 if (usecprow) { 1401 if (zz != yy) { 1402 ierr = PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1403 } 1404 mbs = a->compressedrow.nrows; 1405 ii = a->compressedrow.i; 1406 ridx = a->compressedrow.rindex; 1407 } else { 1408 ii = a->i; 1409 y = yarray; 1410 z = zarray; 1411 } 1412 1413 for (i=0; i<mbs; i++) { 1414 n = ii[1] - ii[0]; ii++; 1415 if (usecprow) { 1416 z = zarray + 3*ridx[i]; 1417 y = yarray + 3*ridx[i]; 1418 } 1419 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; 1420 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1421 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1422 for (j=0; j<n; j++) { 1423 xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1424 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 1425 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 1426 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 1427 v += 9; 1428 } 1429 z[0] = sum1; z[1] = sum2; z[2] = sum3; 1430 if (!usecprow) { 1431 z += 3; y += 3; 1432 } 1433 } 1434 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1435 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1436 ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr); 1437 PetscFunctionReturn(0); 1438 } 1439 1440 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 1441 { 1442 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1443 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray; 1444 const PetscScalar *x,*xb; 1445 const MatScalar *v; 1446 PetscErrorCode ierr; 1447 PetscInt mbs = a->mbs,i,j,n; 1448 const PetscInt *idx,*ii,*ridx=NULL; 1449 PetscBool usecprow=a->compressedrow.use; 1450 1451 PetscFunctionBegin; 1452 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1453 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1454 1455 idx = a->j; 1456 v = a->a; 1457 if (usecprow) { 1458 if (zz != yy) { 1459 ierr = PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1460 } 1461 mbs = a->compressedrow.nrows; 1462 ii = a->compressedrow.i; 1463 ridx = a->compressedrow.rindex; 1464 } else { 1465 ii = a->i; 1466 y = yarray; 1467 z = zarray; 1468 } 1469 1470 for (i=0; i<mbs; i++) { 1471 n = ii[1] - ii[0]; ii++; 1472 if (usecprow) { 1473 z = zarray + 4*ridx[i]; 1474 y = yarray + 4*ridx[i]; 1475 } 1476 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; 1477 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1478 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1479 for (j=0; j<n; j++) { 1480 xb = x + 4*(*idx++); 1481 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1482 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1483 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1484 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1485 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1486 v += 16; 1487 } 1488 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 1489 if (!usecprow) { 1490 z += 4; y += 4; 1491 } 1492 } 1493 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1494 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1495 ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr); 1496 PetscFunctionReturn(0); 1497 } 1498 1499 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 1500 { 1501 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1502 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5; 1503 const PetscScalar *x,*xb; 1504 PetscScalar *yarray,*zarray; 1505 const MatScalar *v; 1506 PetscErrorCode ierr; 1507 PetscInt mbs = a->mbs,i,j,n; 1508 const PetscInt *idx,*ii,*ridx = NULL; 1509 PetscBool usecprow=a->compressedrow.use; 1510 1511 PetscFunctionBegin; 1512 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1513 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1514 1515 idx = a->j; 1516 v = a->a; 1517 if (usecprow) { 1518 if (zz != yy) { 1519 ierr = PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1520 } 1521 mbs = a->compressedrow.nrows; 1522 ii = a->compressedrow.i; 1523 ridx = a->compressedrow.rindex; 1524 } else { 1525 ii = a->i; 1526 y = yarray; 1527 z = zarray; 1528 } 1529 1530 for (i=0; i<mbs; i++) { 1531 n = ii[1] - ii[0]; ii++; 1532 if (usecprow) { 1533 z = zarray + 5*ridx[i]; 1534 y = yarray + 5*ridx[i]; 1535 } 1536 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; 1537 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1538 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1539 for (j=0; j<n; j++) { 1540 xb = x + 5*(*idx++); 1541 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 1542 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1543 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1544 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1545 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1546 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1547 v += 25; 1548 } 1549 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 1550 if (!usecprow) { 1551 z += 5; y += 5; 1552 } 1553 } 1554 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1555 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1556 ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr); 1557 PetscFunctionReturn(0); 1558 } 1559 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 1560 { 1561 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1562 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6; 1563 const PetscScalar *x,*xb; 1564 PetscScalar x1,x2,x3,x4,x5,x6,*yarray,*zarray; 1565 const MatScalar *v; 1566 PetscErrorCode ierr; 1567 PetscInt mbs = a->mbs,i,j,n; 1568 const PetscInt *idx,*ii,*ridx=NULL; 1569 PetscBool usecprow=a->compressedrow.use; 1570 1571 PetscFunctionBegin; 1572 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1573 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1574 1575 idx = a->j; 1576 v = a->a; 1577 if (usecprow) { 1578 if (zz != yy) { 1579 ierr = PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1580 } 1581 mbs = a->compressedrow.nrows; 1582 ii = a->compressedrow.i; 1583 ridx = a->compressedrow.rindex; 1584 } else { 1585 ii = a->i; 1586 y = yarray; 1587 z = zarray; 1588 } 1589 1590 for (i=0; i<mbs; i++) { 1591 n = ii[1] - ii[0]; ii++; 1592 if (usecprow) { 1593 z = zarray + 6*ridx[i]; 1594 y = yarray + 6*ridx[i]; 1595 } 1596 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; 1597 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1598 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1599 for (j=0; j<n; j++) { 1600 xb = x + 6*(*idx++); 1601 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 1602 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1603 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1604 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1605 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1606 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1607 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1608 v += 36; 1609 } 1610 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 1611 if (!usecprow) { 1612 z += 6; y += 6; 1613 } 1614 } 1615 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1616 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1617 ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr); 1618 PetscFunctionReturn(0); 1619 } 1620 1621 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 1622 { 1623 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1624 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 1625 const PetscScalar *x,*xb; 1626 PetscScalar x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray; 1627 const MatScalar *v; 1628 PetscErrorCode ierr; 1629 PetscInt mbs = a->mbs,i,j,n; 1630 const PetscInt *idx,*ii,*ridx = NULL; 1631 PetscBool usecprow=a->compressedrow.use; 1632 1633 PetscFunctionBegin; 1634 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1635 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1636 1637 idx = a->j; 1638 v = a->a; 1639 if (usecprow) { 1640 if (zz != yy) { 1641 ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1642 } 1643 mbs = a->compressedrow.nrows; 1644 ii = a->compressedrow.i; 1645 ridx = a->compressedrow.rindex; 1646 } else { 1647 ii = a->i; 1648 y = yarray; 1649 z = zarray; 1650 } 1651 1652 for (i=0; i<mbs; i++) { 1653 n = ii[1] - ii[0]; ii++; 1654 if (usecprow) { 1655 z = zarray + 7*ridx[i]; 1656 y = yarray + 7*ridx[i]; 1657 } 1658 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 1659 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1660 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1661 for (j=0; j<n; j++) { 1662 xb = x + 7*(*idx++); 1663 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1664 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1665 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1666 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1667 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1668 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1669 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1670 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1671 v += 49; 1672 } 1673 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1674 if (!usecprow) { 1675 z += 7; y += 7; 1676 } 1677 } 1678 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1679 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1680 ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr); 1681 PetscFunctionReturn(0); 1682 } 1683 1684 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) 1685 PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec yy,Vec zz) 1686 { 1687 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1688 PetscScalar *z = 0,*work,*workt,*zarray; 1689 const PetscScalar *x,*xb; 1690 const MatScalar *v; 1691 PetscErrorCode ierr; 1692 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 1693 PetscInt k; 1694 PetscBool usecprow=a->compressedrow.use; 1695 const PetscInt *idx,*ii,*ridx=NULL; 1696 1697 __m256d a0,a1,a2,a3,a4,a5; 1698 __m256d w0,w1,w2,w3; 1699 __m256d z0,z1,z2; 1700 __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63); 1701 1702 PetscFunctionBegin; 1703 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 1704 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1705 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1706 1707 idx = a->j; 1708 v = a->a; 1709 if (usecprow) { 1710 mbs = a->compressedrow.nrows; 1711 ii = a->compressedrow.i; 1712 ridx = a->compressedrow.rindex; 1713 } else { 1714 mbs = a->mbs; 1715 ii = a->i; 1716 z = zarray; 1717 } 1718 1719 if (!a->mult_work) { 1720 k = PetscMax(A->rmap->n,A->cmap->n); 1721 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 1722 } 1723 1724 work = a->mult_work; 1725 for (i=0; i<mbs; i++) { 1726 n = ii[1] - ii[0]; ii++; 1727 workt = work; 1728 for (j=0; j<n; j++) { 1729 xb = x + bs*(*idx++); 1730 for (k=0; k<bs; k++) workt[k] = xb[k]; 1731 workt += bs; 1732 } 1733 if (usecprow) z = zarray + bs*ridx[i]; 1734 1735 z0 = _mm256_loadu_pd(&z[ 0]); z1 = _mm256_loadu_pd(&z[ 4]); z2 = _mm256_set1_pd(z[ 8]); 1736 1737 for (j=0; j<n; j++) { 1738 // first column of a 1739 w0 = _mm256_set1_pd(work[j*9 ]); 1740 a0 = _mm256_loadu_pd(&v[j*81 ]); z0 = _mm256_fmadd_pd(a0,w0,z0); 1741 a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1); 1742 a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2); 1743 1744 // second column of a 1745 w1 = _mm256_set1_pd(work[j*9+ 1]); 1746 a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0); 1747 a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1); 1748 a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2); 1749 1750 // third column of a 1751 w2 = _mm256_set1_pd(work[j*9+ 2]); 1752 a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0); 1753 a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1); 1754 a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2); 1755 1756 // fourth column of a 1757 w3 = _mm256_set1_pd(work[j*9+ 3]); 1758 a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0); 1759 a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1); 1760 a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2); 1761 1762 // fifth column of a 1763 w0 = _mm256_set1_pd(work[j*9+ 4]); 1764 a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0); 1765 a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1); 1766 a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2); 1767 1768 // sixth column of a 1769 w1 = _mm256_set1_pd(work[j*9+ 5]); 1770 a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0); 1771 a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1); 1772 a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2); 1773 1774 // seventh column of a 1775 w2 = _mm256_set1_pd(work[j*9+ 6]); 1776 a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0); 1777 a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1); 1778 a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2); 1779 1780 // eigth column of a 1781 w3 = _mm256_set1_pd(work[j*9+ 7]); 1782 a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0); 1783 a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1); 1784 a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2); 1785 1786 // ninth column of a 1787 w0 = _mm256_set1_pd(work[j*9+ 8]); 1788 a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0); 1789 a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1); 1790 a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2); 1791 } 1792 1793 _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2); 1794 1795 v += n*bs2; 1796 if (!usecprow) z += bs; 1797 } 1798 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1799 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1800 ierr = PetscLogFlops(162.0*a->nz);CHKERRQ(ierr); 1801 PetscFunctionReturn(0); 1802 } 1803 #endif 1804 1805 PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A,Vec xx,Vec yy,Vec zz) 1806 { 1807 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1808 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11; 1809 const PetscScalar *x,*xb; 1810 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,*yarray,*zarray; 1811 const MatScalar *v; 1812 PetscErrorCode ierr; 1813 PetscInt mbs = a->mbs,i,j,n; 1814 const PetscInt *idx,*ii,*ridx = NULL; 1815 PetscBool usecprow=a->compressedrow.use; 1816 1817 PetscFunctionBegin; 1818 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1819 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1820 1821 idx = a->j; 1822 v = a->a; 1823 if (usecprow) { 1824 if (zz != yy) { 1825 ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1826 } 1827 mbs = a->compressedrow.nrows; 1828 ii = a->compressedrow.i; 1829 ridx = a->compressedrow.rindex; 1830 } else { 1831 ii = a->i; 1832 y = yarray; 1833 z = zarray; 1834 } 1835 1836 for (i=0; i<mbs; i++) { 1837 n = ii[1] - ii[0]; ii++; 1838 if (usecprow) { 1839 z = zarray + 11*ridx[i]; 1840 y = yarray + 11*ridx[i]; 1841 } 1842 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 1843 sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10]; 1844 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1845 PetscPrefetchBlock(v+121*n,121*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1846 for (j=0; j<n; j++) { 1847 xb = x + 11*(*idx++); 1848 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; 1849 sum1 += v[ 0]*x1 + v[ 11]*x2 + v[2*11]*x3 + v[3*11]*x4 + v[4*11]*x5 + v[5*11]*x6 + v[6*11]*x7+ v[7*11]*x8 + v[8*11]*x9 + v[9*11]*x10 + v[10*11]*x11; 1850 sum2 += v[1+0]*x1 + v[1+11]*x2 + v[1+2*11]*x3 + v[1+3*11]*x4 + v[1+4*11]*x5 + v[1+5*11]*x6 + v[1+6*11]*x7+ v[1+7*11]*x8 + v[1+8*11]*x9 + v[1+9*11]*x10 + v[1+10*11]*x11; 1851 sum3 += v[2+0]*x1 + v[2+11]*x2 + v[2+2*11]*x3 + v[2+3*11]*x4 + v[2+4*11]*x5 + v[2+5*11]*x6 + v[2+6*11]*x7+ v[2+7*11]*x8 + v[2+8*11]*x9 + v[2+9*11]*x10 + v[2+10*11]*x11; 1852 sum4 += v[3+0]*x1 + v[3+11]*x2 + v[3+2*11]*x3 + v[3+3*11]*x4 + v[3+4*11]*x5 + v[3+5*11]*x6 + v[3+6*11]*x7+ v[3+7*11]*x8 + v[3+8*11]*x9 + v[3+9*11]*x10 + v[3+10*11]*x11; 1853 sum5 += v[4+0]*x1 + v[4+11]*x2 + v[4+2*11]*x3 + v[4+3*11]*x4 + v[4+4*11]*x5 + v[4+5*11]*x6 + v[4+6*11]*x7+ v[4+7*11]*x8 + v[4+8*11]*x9 + v[4+9*11]*x10 + v[4+10*11]*x11; 1854 sum6 += v[5+0]*x1 + v[5+11]*x2 + v[5+2*11]*x3 + v[5+3*11]*x4 + v[5+4*11]*x5 + v[5+5*11]*x6 + v[5+6*11]*x7+ v[5+7*11]*x8 + v[5+8*11]*x9 + v[5+9*11]*x10 + v[5+10*11]*x11; 1855 sum7 += v[6+0]*x1 + v[6+11]*x2 + v[6+2*11]*x3 + v[6+3*11]*x4 + v[6+4*11]*x5 + v[6+5*11]*x6 + v[6+6*11]*x7+ v[6+7*11]*x8 + v[6+8*11]*x9 + v[6+9*11]*x10 + v[6+10*11]*x11; 1856 sum8 += v[7+0]*x1 + v[7+11]*x2 + v[7+2*11]*x3 + v[7+3*11]*x4 + v[7+4*11]*x5 + v[7+5*11]*x6 + v[7+6*11]*x7+ v[7+7*11]*x8 + v[7+8*11]*x9 + v[7+9*11]*x10 + v[7+10*11]*x11; 1857 sum9 += v[8+0]*x1 + v[8+11]*x2 + v[8+2*11]*x3 + v[8+3*11]*x4 + v[8+4*11]*x5 + v[8+5*11]*x6 + v[8+6*11]*x7+ v[8+7*11]*x8 + v[8+8*11]*x9 + v[8+9*11]*x10 + v[8+10*11]*x11; 1858 sum10 += v[9+0]*x1 + v[9+11]*x2 + v[9+2*11]*x3 + v[9+3*11]*x4 + v[9+4*11]*x5 + v[9+5*11]*x6 + v[9+6*11]*x7+ v[9+7*11]*x8 + v[9+8*11]*x9 + v[9+9*11]*x10 + v[9+10*11]*x11; 1859 sum11 += v[10+0]*x1 + v[10+11]*x2 + v[10+2*11]*x3 + v[10+3*11]*x4 + v[10+4*11]*x5 + v[10+5*11]*x6 + v[10+6*11]*x7+ v[10+7*11]*x8 + v[10+8*11]*x9 + v[10+9*11]*x10 + v[10+10*11]*x11; 1860 v += 121; 1861 } 1862 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1863 z[7] = sum6; z[8] = sum7; z[9] = sum8; z[10] = sum9; z[11] = sum10; 1864 if (!usecprow) { 1865 z += 11; y += 11; 1866 } 1867 } 1868 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1869 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1870 ierr = PetscLogFlops(242.0*a->nz);CHKERRQ(ierr); 1871 PetscFunctionReturn(0); 1872 } 1873 1874 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1875 { 1876 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1877 PetscScalar *z = 0,*work,*workt,*zarray; 1878 const PetscScalar *x,*xb; 1879 const MatScalar *v; 1880 PetscErrorCode ierr; 1881 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 1882 PetscInt ncols,k; 1883 const PetscInt *ridx = NULL,*idx,*ii; 1884 PetscBool usecprow = a->compressedrow.use; 1885 1886 PetscFunctionBegin; 1887 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 1888 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1889 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1890 1891 idx = a->j; 1892 v = a->a; 1893 if (usecprow) { 1894 mbs = a->compressedrow.nrows; 1895 ii = a->compressedrow.i; 1896 ridx = a->compressedrow.rindex; 1897 } else { 1898 mbs = a->mbs; 1899 ii = a->i; 1900 z = zarray; 1901 } 1902 1903 if (!a->mult_work) { 1904 k = PetscMax(A->rmap->n,A->cmap->n); 1905 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 1906 } 1907 work = a->mult_work; 1908 for (i=0; i<mbs; i++) { 1909 n = ii[1] - ii[0]; ii++; 1910 ncols = n*bs; 1911 workt = work; 1912 for (j=0; j<n; j++) { 1913 xb = x + bs*(*idx++); 1914 for (k=0; k<bs; k++) workt[k] = xb[k]; 1915 workt += bs; 1916 } 1917 if (usecprow) z = zarray + bs*ridx[i]; 1918 PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1919 /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */ 1920 v += n*bs2; 1921 if (!usecprow) z += bs; 1922 } 1923 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1924 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1925 ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr); 1926 PetscFunctionReturn(0); 1927 } 1928 1929 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 1930 { 1931 PetscScalar zero = 0.0; 1932 PetscErrorCode ierr; 1933 1934 PetscFunctionBegin; 1935 ierr = VecSet(zz,zero);CHKERRQ(ierr); 1936 ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 1937 PetscFunctionReturn(0); 1938 } 1939 1940 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 1941 { 1942 PetscScalar zero = 0.0; 1943 PetscErrorCode ierr; 1944 1945 PetscFunctionBegin; 1946 ierr = VecSet(zz,zero);CHKERRQ(ierr); 1947 ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 1948 PetscFunctionReturn(0); 1949 } 1950 1951 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 1952 { 1953 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1954 PetscScalar *z,x1,x2,x3,x4,x5; 1955 const PetscScalar *x,*xb = NULL; 1956 const MatScalar *v; 1957 PetscErrorCode ierr; 1958 PetscInt mbs,i,rval,bs=A->rmap->bs,j,n; 1959 const PetscInt *idx,*ii,*ib,*ridx = NULL; 1960 Mat_CompressedRow cprow = a->compressedrow; 1961 PetscBool usecprow = cprow.use; 1962 1963 PetscFunctionBegin; 1964 if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); } 1965 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1966 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1967 1968 idx = a->j; 1969 v = a->a; 1970 if (usecprow) { 1971 mbs = cprow.nrows; 1972 ii = cprow.i; 1973 ridx = cprow.rindex; 1974 } else { 1975 mbs=a->mbs; 1976 ii = a->i; 1977 xb = x; 1978 } 1979 1980 switch (bs) { 1981 case 1: 1982 for (i=0; i<mbs; i++) { 1983 if (usecprow) xb = x + ridx[i]; 1984 x1 = xb[0]; 1985 ib = idx + ii[0]; 1986 n = ii[1] - ii[0]; ii++; 1987 for (j=0; j<n; j++) { 1988 rval = ib[j]; 1989 z[rval] += PetscConj(*v) * x1; 1990 v++; 1991 } 1992 if (!usecprow) xb++; 1993 } 1994 break; 1995 case 2: 1996 for (i=0; i<mbs; i++) { 1997 if (usecprow) xb = x + 2*ridx[i]; 1998 x1 = xb[0]; x2 = xb[1]; 1999 ib = idx + ii[0]; 2000 n = ii[1] - ii[0]; ii++; 2001 for (j=0; j<n; j++) { 2002 rval = ib[j]*2; 2003 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2; 2004 z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2; 2005 v += 4; 2006 } 2007 if (!usecprow) xb += 2; 2008 } 2009 break; 2010 case 3: 2011 for (i=0; i<mbs; i++) { 2012 if (usecprow) xb = x + 3*ridx[i]; 2013 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2014 ib = idx + ii[0]; 2015 n = ii[1] - ii[0]; ii++; 2016 for (j=0; j<n; j++) { 2017 rval = ib[j]*3; 2018 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3; 2019 z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3; 2020 z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3; 2021 v += 9; 2022 } 2023 if (!usecprow) xb += 3; 2024 } 2025 break; 2026 case 4: 2027 for (i=0; i<mbs; i++) { 2028 if (usecprow) xb = x + 4*ridx[i]; 2029 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 2030 ib = idx + ii[0]; 2031 n = ii[1] - ii[0]; ii++; 2032 for (j=0; j<n; j++) { 2033 rval = ib[j]*4; 2034 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4; 2035 z[rval++] += PetscConj(v[4])*x1 + PetscConj(v[5])*x2 + PetscConj(v[6])*x3 + PetscConj(v[7])*x4; 2036 z[rval++] += PetscConj(v[8])*x1 + PetscConj(v[9])*x2 + PetscConj(v[10])*x3 + PetscConj(v[11])*x4; 2037 z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4; 2038 v += 16; 2039 } 2040 if (!usecprow) xb += 4; 2041 } 2042 break; 2043 case 5: 2044 for (i=0; i<mbs; i++) { 2045 if (usecprow) xb = x + 5*ridx[i]; 2046 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2047 x4 = xb[3]; x5 = xb[4]; 2048 ib = idx + ii[0]; 2049 n = ii[1] - ii[0]; ii++; 2050 for (j=0; j<n; j++) { 2051 rval = ib[j]*5; 2052 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4 + PetscConj(v[4])*x5; 2053 z[rval++] += PetscConj(v[5])*x1 + PetscConj(v[6])*x2 + PetscConj(v[7])*x3 + PetscConj(v[8])*x4 + PetscConj(v[9])*x5; 2054 z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5; 2055 z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5; 2056 z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5; 2057 v += 25; 2058 } 2059 if (!usecprow) xb += 5; 2060 } 2061 break; 2062 default: /* block sizes larger than 5 by 5 are handled by BLAS */ 2063 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet"); 2064 #if 0 2065 { 2066 PetscInt ncols,k,bs2=a->bs2; 2067 PetscScalar *work,*workt,zb; 2068 const PetscScalar *xtmp; 2069 if (!a->mult_work) { 2070 k = PetscMax(A->rmap->n,A->cmap->n); 2071 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 2072 } 2073 work = a->mult_work; 2074 xtmp = x; 2075 for (i=0; i<mbs; i++) { 2076 n = ii[1] - ii[0]; ii++; 2077 ncols = n*bs; 2078 ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 2079 if (usecprow) xtmp = x + bs*ridx[i]; 2080 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 2081 /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */ 2082 v += n*bs2; 2083 if (!usecprow) xtmp += bs; 2084 workt = work; 2085 for (j=0; j<n; j++) { 2086 zb = z + bs*(*idx++); 2087 for (k=0; k<bs; k++) zb[k] += workt[k] ; 2088 workt += bs; 2089 } 2090 } 2091 } 2092 #endif 2093 } 2094 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 2095 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 2096 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 2097 PetscFunctionReturn(0); 2098 } 2099 2100 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 2101 { 2102 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2103 PetscScalar *zb,*z,x1,x2,x3,x4,x5; 2104 const PetscScalar *x,*xb = 0; 2105 const MatScalar *v; 2106 PetscErrorCode ierr; 2107 PetscInt mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2; 2108 const PetscInt *idx,*ii,*ib,*ridx = NULL; 2109 Mat_CompressedRow cprow = a->compressedrow; 2110 PetscBool usecprow=cprow.use; 2111 2112 PetscFunctionBegin; 2113 if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); } 2114 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 2115 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 2116 2117 idx = a->j; 2118 v = a->a; 2119 if (usecprow) { 2120 mbs = cprow.nrows; 2121 ii = cprow.i; 2122 ridx = cprow.rindex; 2123 } else { 2124 mbs=a->mbs; 2125 ii = a->i; 2126 xb = x; 2127 } 2128 2129 switch (bs) { 2130 case 1: 2131 for (i=0; i<mbs; i++) { 2132 if (usecprow) xb = x + ridx[i]; 2133 x1 = xb[0]; 2134 ib = idx + ii[0]; 2135 n = ii[1] - ii[0]; ii++; 2136 for (j=0; j<n; j++) { 2137 rval = ib[j]; 2138 z[rval] += *v * x1; 2139 v++; 2140 } 2141 if (!usecprow) xb++; 2142 } 2143 break; 2144 case 2: 2145 for (i=0; i<mbs; i++) { 2146 if (usecprow) xb = x + 2*ridx[i]; 2147 x1 = xb[0]; x2 = xb[1]; 2148 ib = idx + ii[0]; 2149 n = ii[1] - ii[0]; ii++; 2150 for (j=0; j<n; j++) { 2151 rval = ib[j]*2; 2152 z[rval++] += v[0]*x1 + v[1]*x2; 2153 z[rval++] += v[2]*x1 + v[3]*x2; 2154 v += 4; 2155 } 2156 if (!usecprow) xb += 2; 2157 } 2158 break; 2159 case 3: 2160 for (i=0; i<mbs; i++) { 2161 if (usecprow) xb = x + 3*ridx[i]; 2162 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2163 ib = idx + ii[0]; 2164 n = ii[1] - ii[0]; ii++; 2165 for (j=0; j<n; j++) { 2166 rval = ib[j]*3; 2167 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3; 2168 z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3; 2169 z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3; 2170 v += 9; 2171 } 2172 if (!usecprow) xb += 3; 2173 } 2174 break; 2175 case 4: 2176 for (i=0; i<mbs; i++) { 2177 if (usecprow) xb = x + 4*ridx[i]; 2178 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 2179 ib = idx + ii[0]; 2180 n = ii[1] - ii[0]; ii++; 2181 for (j=0; j<n; j++) { 2182 rval = ib[j]*4; 2183 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 2184 z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 2185 z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 2186 z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 2187 v += 16; 2188 } 2189 if (!usecprow) xb += 4; 2190 } 2191 break; 2192 case 5: 2193 for (i=0; i<mbs; i++) { 2194 if (usecprow) xb = x + 5*ridx[i]; 2195 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2196 x4 = xb[3]; x5 = xb[4]; 2197 ib = idx + ii[0]; 2198 n = ii[1] - ii[0]; ii++; 2199 for (j=0; j<n; j++) { 2200 rval = ib[j]*5; 2201 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 2202 z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 2203 z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 2204 z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 2205 z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 2206 v += 25; 2207 } 2208 if (!usecprow) xb += 5; 2209 } 2210 break; 2211 default: { /* block sizes larger then 5 by 5 are handled by BLAS */ 2212 PetscInt ncols,k; 2213 PetscScalar *work,*workt; 2214 const PetscScalar *xtmp; 2215 if (!a->mult_work) { 2216 k = PetscMax(A->rmap->n,A->cmap->n); 2217 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 2218 } 2219 work = a->mult_work; 2220 xtmp = x; 2221 for (i=0; i<mbs; i++) { 2222 n = ii[1] - ii[0]; ii++; 2223 ncols = n*bs; 2224 ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 2225 if (usecprow) xtmp = x + bs*ridx[i]; 2226 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 2227 /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */ 2228 v += n*bs2; 2229 if (!usecprow) xtmp += bs; 2230 workt = work; 2231 for (j=0; j<n; j++) { 2232 zb = z + bs*(*idx++); 2233 for (k=0; k<bs; k++) zb[k] += workt[k]; 2234 workt += bs; 2235 } 2236 } 2237 } 2238 } 2239 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 2240 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 2241 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 2242 PetscFunctionReturn(0); 2243 } 2244 2245 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha) 2246 { 2247 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data; 2248 PetscInt totalnz = a->bs2*a->nz; 2249 PetscScalar oalpha = alpha; 2250 PetscErrorCode ierr; 2251 PetscBLASInt one = 1,tnz; 2252 2253 PetscFunctionBegin; 2254 ierr = PetscBLASIntCast(totalnz,&tnz);CHKERRQ(ierr); 2255 PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one)); 2256 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 2257 PetscFunctionReturn(0); 2258 } 2259 2260 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm) 2261 { 2262 PetscErrorCode ierr; 2263 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2264 MatScalar *v = a->a; 2265 PetscReal sum = 0.0; 2266 PetscInt i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1; 2267 2268 PetscFunctionBegin; 2269 if (type == NORM_FROBENIUS) { 2270 #if defined(PETSC_USE_REAL___FP16) 2271 PetscBLASInt one = 1,cnt = bs2*nz; 2272 *norm = BLASnrm2_(&cnt,v,&one); 2273 #else 2274 for (i=0; i<bs2*nz; i++) { 2275 sum += PetscRealPart(PetscConj(*v)*(*v)); v++; 2276 } 2277 #endif 2278 *norm = PetscSqrtReal(sum); 2279 ierr = PetscLogFlops(2*bs2*nz);CHKERRQ(ierr); 2280 } else if (type == NORM_1) { /* maximum column sum */ 2281 PetscReal *tmp; 2282 PetscInt *bcol = a->j; 2283 ierr = PetscCalloc1(A->cmap->n+1,&tmp);CHKERRQ(ierr); 2284 for (i=0; i<nz; i++) { 2285 for (j=0; j<bs; j++) { 2286 k1 = bs*(*bcol) + j; /* column index */ 2287 for (k=0; k<bs; k++) { 2288 tmp[k1] += PetscAbsScalar(*v); v++; 2289 } 2290 } 2291 bcol++; 2292 } 2293 *norm = 0.0; 2294 for (j=0; j<A->cmap->n; j++) { 2295 if (tmp[j] > *norm) *norm = tmp[j]; 2296 } 2297 ierr = PetscFree(tmp);CHKERRQ(ierr); 2298 ierr = PetscLogFlops(PetscMax(bs2*nz-1,0));CHKERRQ(ierr); 2299 } else if (type == NORM_INFINITY) { /* maximum row sum */ 2300 *norm = 0.0; 2301 for (k=0; k<bs; k++) { 2302 for (j=0; j<a->mbs; j++) { 2303 v = a->a + bs2*a->i[j] + k; 2304 sum = 0.0; 2305 for (i=0; i<a->i[j+1]-a->i[j]; i++) { 2306 for (k1=0; k1<bs; k1++) { 2307 sum += PetscAbsScalar(*v); 2308 v += bs; 2309 } 2310 } 2311 if (sum > *norm) *norm = sum; 2312 } 2313 } 2314 ierr = PetscLogFlops(PetscMax(bs2*nz-1,0));CHKERRQ(ierr); 2315 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet"); 2316 PetscFunctionReturn(0); 2317 } 2318 2319 2320 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg) 2321 { 2322 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data; 2323 PetscErrorCode ierr; 2324 2325 PetscFunctionBegin; 2326 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 2327 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 2328 *flg = PETSC_FALSE; 2329 PetscFunctionReturn(0); 2330 } 2331 2332 /* if the a->i are the same */ 2333 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr); 2334 if (!*flg) PetscFunctionReturn(0); 2335 2336 /* if a->j are the same */ 2337 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr); 2338 if (!*flg) PetscFunctionReturn(0); 2339 2340 /* if a->a are the same */ 2341 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 2342 PetscFunctionReturn(0); 2343 2344 } 2345 2346 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v) 2347 { 2348 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2349 PetscErrorCode ierr; 2350 PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 2351 PetscScalar *x,zero = 0.0; 2352 MatScalar *aa,*aa_j; 2353 2354 PetscFunctionBegin; 2355 if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 2356 bs = A->rmap->bs; 2357 aa = a->a; 2358 ai = a->i; 2359 aj = a->j; 2360 ambs = a->mbs; 2361 bs2 = a->bs2; 2362 2363 ierr = VecSet(v,zero);CHKERRQ(ierr); 2364 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 2365 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 2366 if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 2367 for (i=0; i<ambs; i++) { 2368 for (j=ai[i]; j<ai[i+1]; j++) { 2369 if (aj[j] == i) { 2370 row = i*bs; 2371 aa_j = aa+j*bs2; 2372 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 2373 break; 2374 } 2375 } 2376 } 2377 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 2378 PetscFunctionReturn(0); 2379 } 2380 2381 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr) 2382 { 2383 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2384 const PetscScalar *l,*r,*li,*ri; 2385 PetscScalar x; 2386 MatScalar *aa, *v; 2387 PetscErrorCode ierr; 2388 PetscInt i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai; 2389 const PetscInt *ai,*aj; 2390 2391 PetscFunctionBegin; 2392 ai = a->i; 2393 aj = a->j; 2394 aa = a->a; 2395 m = A->rmap->n; 2396 n = A->cmap->n; 2397 bs = A->rmap->bs; 2398 mbs = a->mbs; 2399 bs2 = a->bs2; 2400 if (ll) { 2401 ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr); 2402 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 2403 if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 2404 for (i=0; i<mbs; i++) { /* for each block row */ 2405 M = ai[i+1] - ai[i]; 2406 li = l + i*bs; 2407 v = aa + bs2*ai[i]; 2408 for (j=0; j<M; j++) { /* for each block */ 2409 for (k=0; k<bs2; k++) { 2410 (*v++) *= li[k%bs]; 2411 } 2412 } 2413 } 2414 ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr); 2415 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2416 } 2417 2418 if (rr) { 2419 ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr); 2420 ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr); 2421 if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length"); 2422 for (i=0; i<mbs; i++) { /* for each block row */ 2423 iai = ai[i]; 2424 M = ai[i+1] - iai; 2425 v = aa + bs2*iai; 2426 for (j=0; j<M; j++) { /* for each block */ 2427 ri = r + bs*aj[iai+j]; 2428 for (k=0; k<bs; k++) { 2429 x = ri[k]; 2430 for (tmp=0; tmp<bs; tmp++) v[tmp] *= x; 2431 v += bs; 2432 } 2433 } 2434 } 2435 ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr); 2436 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2437 } 2438 PetscFunctionReturn(0); 2439 } 2440 2441 2442 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info) 2443 { 2444 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2445 2446 PetscFunctionBegin; 2447 info->block_size = a->bs2; 2448 info->nz_allocated = a->bs2*a->maxnz; 2449 info->nz_used = a->bs2*a->nz; 2450 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 2451 info->assemblies = A->num_ass; 2452 info->mallocs = A->info.mallocs; 2453 info->memory = ((PetscObject)A)->mem; 2454 if (A->factortype) { 2455 info->fill_ratio_given = A->info.fill_ratio_given; 2456 info->fill_ratio_needed = A->info.fill_ratio_needed; 2457 info->factor_mallocs = A->info.factor_mallocs; 2458 } else { 2459 info->fill_ratio_given = 0; 2460 info->fill_ratio_needed = 0; 2461 info->factor_mallocs = 0; 2462 } 2463 PetscFunctionReturn(0); 2464 } 2465 2466 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) 2467 { 2468 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2469 PetscErrorCode ierr; 2470 2471 PetscFunctionBegin; 2472 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 2473 PetscFunctionReturn(0); 2474 } 2475