1 #include <../src/mat/impls/baij/seq/baij.h> 2 #include <../src/mat/impls/dense/seq/dense.h> 3 #include <petsc/private/kernels/blockinvert.h> 4 #include <petscbt.h> 5 #include <petscblaslapack.h> 6 7 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 8 #include <immintrin.h> 9 #endif 10 11 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 12 { 13 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 14 PetscInt row,i,j,k,l,m,n,*nidx,isz,val,ival; 15 const PetscInt *idx; 16 PetscInt start,end,*ai,*aj,bs,*nidx2; 17 PetscBT table; 18 19 PetscFunctionBegin; 20 m = a->mbs; 21 ai = a->i; 22 aj = a->j; 23 bs = A->rmap->bs; 24 25 PetscCheck(ov >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 26 27 PetscCall(PetscBTCreate(m,&table)); 28 PetscCall(PetscMalloc1(m+1,&nidx)); 29 PetscCall(PetscMalloc1(A->rmap->N+1,&nidx2)); 30 31 for (i=0; i<is_max; i++) { 32 /* Initialise the two local arrays */ 33 isz = 0; 34 PetscCall(PetscBTMemzero(m,table)); 35 36 /* Extract the indices, assume there can be duplicate entries */ 37 PetscCall(ISGetIndices(is[i],&idx)); 38 PetscCall(ISGetLocalSize(is[i],&n)); 39 40 /* Enter these into the temp arrays i.e mark table[row], enter row into new index */ 41 for (j=0; j<n; ++j) { 42 ival = idx[j]/bs; /* convert the indices into block indices */ 43 PetscCheck(ival<m,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 44 if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival; 45 } 46 PetscCall(ISRestoreIndices(is[i],&idx)); 47 PetscCall(ISDestroy(&is[i])); 48 49 k = 0; 50 for (j=0; j<ov; j++) { /* for each overlap*/ 51 n = isz; 52 for (; k<n; k++) { /* do only those rows in nidx[k], which are not done yet */ 53 row = nidx[k]; 54 start = ai[row]; 55 end = ai[row+1]; 56 for (l = start; l<end; l++) { 57 val = aj[l]; 58 if (!PetscBTLookupSet(table,val)) nidx[isz++] = val; 59 } 60 } 61 } 62 /* expand the Index Set */ 63 for (j=0; j<isz; j++) { 64 for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k; 65 } 66 PetscCall(ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i)); 67 } 68 PetscCall(PetscBTDestroy(&table)); 69 PetscCall(PetscFree(nidx)); 70 PetscCall(PetscFree(nidx2)); 71 PetscFunctionReturn(0); 72 } 73 74 PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 75 { 76 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c; 77 PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens; 78 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 79 const PetscInt *irow,*icol; 80 PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 81 PetscInt *aj = a->j,*ai = a->i; 82 MatScalar *mat_a; 83 Mat C; 84 PetscBool flag; 85 86 PetscFunctionBegin; 87 PetscCall(ISGetIndices(isrow,&irow)); 88 PetscCall(ISGetIndices(iscol,&icol)); 89 PetscCall(ISGetLocalSize(isrow,&nrows)); 90 PetscCall(ISGetLocalSize(iscol,&ncols)); 91 92 PetscCall(PetscCalloc1(1+oldcols,&smap)); 93 ssmap = smap; 94 PetscCall(PetscMalloc1(1+nrows,&lens)); 95 for (i=0; i<ncols; i++) smap[icol[i]] = i+1; 96 /* determine lens of each row */ 97 for (i=0; i<nrows; i++) { 98 kstart = ai[irow[i]]; 99 kend = kstart + a->ilen[irow[i]]; 100 lens[i] = 0; 101 for (k=kstart; k<kend; k++) { 102 if (ssmap[aj[k]]) lens[i]++; 103 } 104 } 105 /* Create and fill new matrix */ 106 if (scall == MAT_REUSE_MATRIX) { 107 c = (Mat_SeqBAIJ*)((*B)->data); 108 109 PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 110 PetscCall(PetscArraycmp(c->ilen,lens,c->mbs,&flag)); 111 PetscCheck(flag,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 112 PetscCall(PetscArrayzero(c->ilen,c->mbs)); 113 C = *B; 114 } else { 115 PetscCall(MatCreate(PetscObjectComm((PetscObject)A),&C)); 116 PetscCall(MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE)); 117 PetscCall(MatSetType(C,((PetscObject)A)->type_name)); 118 PetscCall(MatSeqBAIJSetPreallocation(C,bs,0,lens)); 119 } 120 c = (Mat_SeqBAIJ*)(C->data); 121 for (i=0; i<nrows; i++) { 122 row = irow[i]; 123 kstart = ai[row]; 124 kend = kstart + a->ilen[row]; 125 mat_i = c->i[i]; 126 mat_j = c->j ? c->j + mat_i : NULL; /* mustn't add to NULL, that is UB */ 127 mat_a = c->a ? c->a + mat_i*bs2 : NULL; /* mustn't add to NULL, that is UB */ 128 mat_ilen = c->ilen + i; 129 for (k=kstart; k<kend; k++) { 130 if ((tcol=ssmap[a->j[k]])) { 131 *mat_j++ = tcol - 1; 132 PetscCall(PetscArraycpy(mat_a,a->a+k*bs2,bs2)); 133 mat_a += bs2; 134 (*mat_ilen)++; 135 } 136 } 137 } 138 /* sort */ 139 if (c->j && c->a) { 140 MatScalar *work; 141 PetscCall(PetscMalloc1(bs2,&work)); 142 for (i=0; i<nrows; i++) { 143 PetscInt ilen; 144 mat_i = c->i[i]; 145 mat_j = c->j + mat_i; 146 mat_a = c->a + mat_i*bs2; 147 ilen = c->ilen[i]; 148 PetscCall(PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work)); 149 } 150 PetscCall(PetscFree(work)); 151 } 152 153 /* Free work space */ 154 PetscCall(ISRestoreIndices(iscol,&icol)); 155 PetscCall(PetscFree(smap)); 156 PetscCall(PetscFree(lens)); 157 PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY)); 158 PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY)); 159 160 PetscCall(ISRestoreIndices(isrow,&irow)); 161 *B = C; 162 PetscFunctionReturn(0); 163 } 164 165 PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 166 { 167 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 168 IS is1,is2; 169 PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs,j; 170 const PetscInt *irow,*icol; 171 172 PetscFunctionBegin; 173 PetscCall(ISGetIndices(isrow,&irow)); 174 PetscCall(ISGetIndices(iscol,&icol)); 175 PetscCall(ISGetLocalSize(isrow,&nrows)); 176 PetscCall(ISGetLocalSize(iscol,&ncols)); 177 178 /* Verify if the indices corespond to each element in a block 179 and form the IS with compressed IS */ 180 maxmnbs = PetscMax(a->mbs,a->nbs); 181 PetscCall(PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary)); 182 PetscCall(PetscArrayzero(vary,a->mbs)); 183 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 184 for (i=0; i<a->mbs; i++) { 185 PetscCheck(vary[i] == 0 || vary[i] == bs,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks"); 186 } 187 count = 0; 188 for (i=0; i<nrows; i++) { 189 j = irow[i] / bs; 190 if ((vary[j]--)==bs) iary[count++] = j; 191 } 192 PetscCall(ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1)); 193 194 PetscCall(PetscArrayzero(vary,a->nbs)); 195 for (i=0; i<ncols; i++) vary[icol[i]/bs]++; 196 for (i=0; i<a->nbs; i++) { 197 PetscCheck(vary[i] == 0 || vary[i] == bs,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc"); 198 } 199 count = 0; 200 for (i=0; i<ncols; i++) { 201 j = icol[i] / bs; 202 if ((vary[j]--)==bs) iary[count++] = j; 203 } 204 PetscCall(ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2)); 205 PetscCall(ISRestoreIndices(isrow,&irow)); 206 PetscCall(ISRestoreIndices(iscol,&icol)); 207 PetscCall(PetscFree2(vary,iary)); 208 209 PetscCall(MatCreateSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B)); 210 PetscCall(ISDestroy(&is1)); 211 PetscCall(ISDestroy(&is2)); 212 PetscFunctionReturn(0); 213 } 214 215 PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C) 216 { 217 Mat_SeqBAIJ *c = (Mat_SeqBAIJ*)C->data; 218 Mat_SubSppt *submatj = c->submatis1; 219 220 PetscFunctionBegin; 221 PetscCall((*submatj->destroy)(C)); 222 PetscCall(MatDestroySubMatrix_Private(submatj)); 223 PetscFunctionReturn(0); 224 } 225 226 PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n,Mat *mat[]) 227 { 228 PetscInt i; 229 Mat C; 230 Mat_SeqBAIJ *c; 231 Mat_SubSppt *submatj; 232 233 PetscFunctionBegin; 234 for (i=0; i<n; i++) { 235 C = (*mat)[i]; 236 c = (Mat_SeqBAIJ*)C->data; 237 submatj = c->submatis1; 238 if (submatj) { 239 if (--((PetscObject)C)->refct <= 0) { 240 PetscCall((*submatj->destroy)(C)); 241 PetscCall(MatDestroySubMatrix_Private(submatj)); 242 PetscCall(PetscFree(C->defaultvectype)); 243 PetscCall(PetscLayoutDestroy(&C->rmap)); 244 PetscCall(PetscLayoutDestroy(&C->cmap)); 245 PetscCall(PetscHeaderDestroy(&C)); 246 } 247 } else { 248 PetscCall(MatDestroy(&C)); 249 } 250 } 251 252 /* Destroy Dummy submatrices created for reuse */ 253 PetscCall(MatDestroySubMatrices_Dummy(n,mat)); 254 255 PetscCall(PetscFree(*mat)); 256 PetscFunctionReturn(0); 257 } 258 259 PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 260 { 261 PetscInt i; 262 263 PetscFunctionBegin; 264 if (scall == MAT_INITIAL_MATRIX) { 265 PetscCall(PetscCalloc1(n+1,B)); 266 } 267 268 for (i=0; i<n; i++) { 269 PetscCall(MatCreateSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i])); 270 } 271 PetscFunctionReturn(0); 272 } 273 274 /* -------------------------------------------------------*/ 275 /* Should check that shapes of vectors and matrices match */ 276 /* -------------------------------------------------------*/ 277 278 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz) 279 { 280 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 281 PetscScalar *z,sum; 282 const PetscScalar *x; 283 const MatScalar *v; 284 PetscInt mbs,i,n; 285 const PetscInt *idx,*ii,*ridx=NULL; 286 PetscBool usecprow=a->compressedrow.use; 287 288 PetscFunctionBegin; 289 PetscCall(VecGetArrayRead(xx,&x)); 290 PetscCall(VecGetArrayWrite(zz,&z)); 291 292 if (usecprow) { 293 mbs = a->compressedrow.nrows; 294 ii = a->compressedrow.i; 295 ridx = a->compressedrow.rindex; 296 PetscCall(PetscArrayzero(z,a->mbs)); 297 } else { 298 mbs = a->mbs; 299 ii = a->i; 300 } 301 302 for (i=0; i<mbs; i++) { 303 n = ii[1] - ii[0]; 304 v = a->a + ii[0]; 305 idx = a->j + ii[0]; 306 ii++; 307 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 308 PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 309 sum = 0.0; 310 PetscSparseDensePlusDot(sum,x,v,idx,n); 311 if (usecprow) { 312 z[ridx[i]] = sum; 313 } else { 314 z[i] = sum; 315 } 316 } 317 PetscCall(VecRestoreArrayRead(xx,&x)); 318 PetscCall(VecRestoreArrayWrite(zz,&z)); 319 PetscCall(PetscLogFlops(2.0*a->nz - a->nonzerorowcnt)); 320 PetscFunctionReturn(0); 321 } 322 323 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz) 324 { 325 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 326 PetscScalar *z = NULL,sum1,sum2,*zarray; 327 const PetscScalar *x,*xb; 328 PetscScalar x1,x2; 329 const MatScalar *v; 330 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 331 PetscBool usecprow=a->compressedrow.use; 332 333 PetscFunctionBegin; 334 PetscCall(VecGetArrayRead(xx,&x)); 335 PetscCall(VecGetArrayWrite(zz,&zarray)); 336 337 idx = a->j; 338 v = a->a; 339 if (usecprow) { 340 mbs = a->compressedrow.nrows; 341 ii = a->compressedrow.i; 342 ridx = a->compressedrow.rindex; 343 PetscCall(PetscArrayzero(zarray,2*a->mbs)); 344 } else { 345 mbs = a->mbs; 346 ii = a->i; 347 z = zarray; 348 } 349 350 for (i=0; i<mbs; i++) { 351 n = ii[1] - ii[0]; ii++; 352 sum1 = 0.0; sum2 = 0.0; 353 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 354 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 355 for (j=0; j<n; j++) { 356 xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1]; 357 sum1 += v[0]*x1 + v[2]*x2; 358 sum2 += v[1]*x1 + v[3]*x2; 359 v += 4; 360 } 361 if (usecprow) z = zarray + 2*ridx[i]; 362 z[0] = sum1; z[1] = sum2; 363 if (!usecprow) z += 2; 364 } 365 PetscCall(VecRestoreArrayRead(xx,&x)); 366 PetscCall(VecRestoreArrayWrite(zz,&zarray)); 367 PetscCall(PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt)); 368 PetscFunctionReturn(0); 369 } 370 371 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz) 372 { 373 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 374 PetscScalar *z = NULL,sum1,sum2,sum3,x1,x2,x3,*zarray; 375 const PetscScalar *x,*xb; 376 const MatScalar *v; 377 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 378 PetscBool usecprow=a->compressedrow.use; 379 380 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 381 #pragma disjoint(*v,*z,*xb) 382 #endif 383 384 PetscFunctionBegin; 385 PetscCall(VecGetArrayRead(xx,&x)); 386 PetscCall(VecGetArrayWrite(zz,&zarray)); 387 388 idx = a->j; 389 v = a->a; 390 if (usecprow) { 391 mbs = a->compressedrow.nrows; 392 ii = a->compressedrow.i; 393 ridx = a->compressedrow.rindex; 394 PetscCall(PetscArrayzero(zarray,3*a->mbs)); 395 } else { 396 mbs = a->mbs; 397 ii = a->i; 398 z = zarray; 399 } 400 401 for (i=0; i<mbs; i++) { 402 n = ii[1] - ii[0]; ii++; 403 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 404 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 405 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 406 for (j=0; j<n; j++) { 407 xb = x + 3*(*idx++); 408 x1 = xb[0]; 409 x2 = xb[1]; 410 x3 = xb[2]; 411 412 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 413 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 414 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 415 v += 9; 416 } 417 if (usecprow) z = zarray + 3*ridx[i]; 418 z[0] = sum1; z[1] = sum2; z[2] = sum3; 419 if (!usecprow) z += 3; 420 } 421 PetscCall(VecRestoreArrayRead(xx,&x)); 422 PetscCall(VecRestoreArrayWrite(zz,&zarray)); 423 PetscCall(PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt)); 424 PetscFunctionReturn(0); 425 } 426 427 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz) 428 { 429 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 430 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray; 431 const PetscScalar *x,*xb; 432 const MatScalar *v; 433 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 434 PetscBool usecprow=a->compressedrow.use; 435 436 PetscFunctionBegin; 437 PetscCall(VecGetArrayRead(xx,&x)); 438 PetscCall(VecGetArrayWrite(zz,&zarray)); 439 440 idx = a->j; 441 v = a->a; 442 if (usecprow) { 443 mbs = a->compressedrow.nrows; 444 ii = a->compressedrow.i; 445 ridx = a->compressedrow.rindex; 446 PetscCall(PetscArrayzero(zarray,4*a->mbs)); 447 } else { 448 mbs = a->mbs; 449 ii = a->i; 450 z = zarray; 451 } 452 453 for (i=0; i<mbs; i++) { 454 n = ii[1] - ii[0]; 455 ii++; 456 sum1 = 0.0; 457 sum2 = 0.0; 458 sum3 = 0.0; 459 sum4 = 0.0; 460 461 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 462 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 463 for (j=0; j<n; j++) { 464 xb = x + 4*(*idx++); 465 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 466 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 467 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 468 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 469 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 470 v += 16; 471 } 472 if (usecprow) z = zarray + 4*ridx[i]; 473 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 474 if (!usecprow) z += 4; 475 } 476 PetscCall(VecRestoreArrayRead(xx,&x)); 477 PetscCall(VecRestoreArrayWrite(zz,&zarray)); 478 PetscCall(PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt)); 479 PetscFunctionReturn(0); 480 } 481 482 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz) 483 { 484 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 485 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*zarray; 486 const PetscScalar *xb,*x; 487 const MatScalar *v; 488 const PetscInt *idx,*ii,*ridx=NULL; 489 PetscInt mbs,i,j,n; 490 PetscBool usecprow=a->compressedrow.use; 491 492 PetscFunctionBegin; 493 PetscCall(VecGetArrayRead(xx,&x)); 494 PetscCall(VecGetArrayWrite(zz,&zarray)); 495 496 idx = a->j; 497 v = a->a; 498 if (usecprow) { 499 mbs = a->compressedrow.nrows; 500 ii = a->compressedrow.i; 501 ridx = a->compressedrow.rindex; 502 PetscCall(PetscArrayzero(zarray,5*a->mbs)); 503 } else { 504 mbs = a->mbs; 505 ii = a->i; 506 z = zarray; 507 } 508 509 for (i=0; i<mbs; i++) { 510 n = ii[1] - ii[0]; ii++; 511 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 512 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 513 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 514 for (j=0; j<n; j++) { 515 xb = x + 5*(*idx++); 516 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 517 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 518 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 519 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 520 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 521 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 522 v += 25; 523 } 524 if (usecprow) z = zarray + 5*ridx[i]; 525 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 526 if (!usecprow) z += 5; 527 } 528 PetscCall(VecRestoreArrayRead(xx,&x)); 529 PetscCall(VecRestoreArrayWrite(zz,&zarray)); 530 PetscCall(PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt)); 531 PetscFunctionReturn(0); 532 } 533 534 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz) 535 { 536 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 537 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6; 538 const PetscScalar *x,*xb; 539 PetscScalar x1,x2,x3,x4,x5,x6,*zarray; 540 const MatScalar *v; 541 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 542 PetscBool usecprow=a->compressedrow.use; 543 544 PetscFunctionBegin; 545 PetscCall(VecGetArrayRead(xx,&x)); 546 PetscCall(VecGetArrayWrite(zz,&zarray)); 547 548 idx = a->j; 549 v = a->a; 550 if (usecprow) { 551 mbs = a->compressedrow.nrows; 552 ii = a->compressedrow.i; 553 ridx = a->compressedrow.rindex; 554 PetscCall(PetscArrayzero(zarray,6*a->mbs)); 555 } else { 556 mbs = a->mbs; 557 ii = a->i; 558 z = zarray; 559 } 560 561 for (i=0; i<mbs; i++) { 562 n = ii[1] - ii[0]; 563 ii++; 564 sum1 = 0.0; 565 sum2 = 0.0; 566 sum3 = 0.0; 567 sum4 = 0.0; 568 sum5 = 0.0; 569 sum6 = 0.0; 570 571 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 572 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 573 for (j=0; j<n; j++) { 574 xb = x + 6*(*idx++); 575 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 576 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 577 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 578 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 579 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 580 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 581 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 582 v += 36; 583 } 584 if (usecprow) z = zarray + 6*ridx[i]; 585 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 586 if (!usecprow) z += 6; 587 } 588 589 PetscCall(VecRestoreArrayRead(xx,&x)); 590 PetscCall(VecRestoreArrayWrite(zz,&zarray)); 591 PetscCall(PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt)); 592 PetscFunctionReturn(0); 593 } 594 595 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz) 596 { 597 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 598 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 599 const PetscScalar *x,*xb; 600 PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray; 601 const MatScalar *v; 602 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 603 PetscBool usecprow=a->compressedrow.use; 604 605 PetscFunctionBegin; 606 PetscCall(VecGetArrayRead(xx,&x)); 607 PetscCall(VecGetArrayWrite(zz,&zarray)); 608 609 idx = a->j; 610 v = a->a; 611 if (usecprow) { 612 mbs = a->compressedrow.nrows; 613 ii = a->compressedrow.i; 614 ridx = a->compressedrow.rindex; 615 PetscCall(PetscArrayzero(zarray,7*a->mbs)); 616 } else { 617 mbs = a->mbs; 618 ii = a->i; 619 z = zarray; 620 } 621 622 for (i=0; i<mbs; i++) { 623 n = ii[1] - ii[0]; 624 ii++; 625 sum1 = 0.0; 626 sum2 = 0.0; 627 sum3 = 0.0; 628 sum4 = 0.0; 629 sum5 = 0.0; 630 sum6 = 0.0; 631 sum7 = 0.0; 632 633 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 634 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 635 for (j=0; j<n; j++) { 636 xb = x + 7*(*idx++); 637 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 638 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 639 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 640 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 641 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 642 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 643 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 644 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 645 v += 49; 646 } 647 if (usecprow) z = zarray + 7*ridx[i]; 648 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 649 if (!usecprow) z += 7; 650 } 651 652 PetscCall(VecRestoreArrayRead(xx,&x)); 653 PetscCall(VecRestoreArrayWrite(zz,&zarray)); 654 PetscCall(PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt)); 655 PetscFunctionReturn(0); 656 } 657 658 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 659 PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec zz) 660 { 661 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 662 PetscScalar *z = NULL,*work,*workt,*zarray; 663 const PetscScalar *x,*xb; 664 const MatScalar *v; 665 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 666 const PetscInt *idx,*ii,*ridx=NULL; 667 PetscInt k; 668 PetscBool usecprow=a->compressedrow.use; 669 670 __m256d a0,a1,a2,a3,a4,a5; 671 __m256d w0,w1,w2,w3; 672 __m256d z0,z1,z2; 673 __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63); 674 675 PetscFunctionBegin; 676 PetscCall(VecGetArrayRead(xx,&x)); 677 PetscCall(VecGetArrayWrite(zz,&zarray)); 678 679 idx = a->j; 680 v = a->a; 681 if (usecprow) { 682 mbs = a->compressedrow.nrows; 683 ii = a->compressedrow.i; 684 ridx = a->compressedrow.rindex; 685 PetscCall(PetscArrayzero(zarray,bs*a->mbs)); 686 } else { 687 mbs = a->mbs; 688 ii = a->i; 689 z = zarray; 690 } 691 692 if (!a->mult_work) { 693 k = PetscMax(A->rmap->n,A->cmap->n); 694 PetscCall(PetscMalloc1(k+1,&a->mult_work)); 695 } 696 697 work = a->mult_work; 698 for (i=0; i<mbs; i++) { 699 n = ii[1] - ii[0]; ii++; 700 workt = work; 701 for (j=0; j<n; j++) { 702 xb = x + bs*(*idx++); 703 for (k=0; k<bs; k++) workt[k] = xb[k]; 704 workt += bs; 705 } 706 if (usecprow) z = zarray + bs*ridx[i]; 707 708 z0 = _mm256_setzero_pd(); z1 = _mm256_setzero_pd(); z2 = _mm256_setzero_pd(); 709 710 for (j=0; j<n; j++) { 711 /* first column of a */ 712 w0 = _mm256_set1_pd(work[j*9 ]); 713 a0 = _mm256_loadu_pd(&v[j*81 ]); z0 = _mm256_fmadd_pd(a0,w0,z0); 714 a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1); 715 a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2); 716 717 /* second column of a */ 718 w1 = _mm256_set1_pd(work[j*9+ 1]); 719 a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0); 720 a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1); 721 a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2); 722 723 /* third column of a */ 724 w2 = _mm256_set1_pd(work[j*9 +2]); 725 a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0); 726 a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1); 727 a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2); 728 729 /* fourth column of a */ 730 w3 = _mm256_set1_pd(work[j*9+ 3]); 731 a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0); 732 a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1); 733 a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2); 734 735 /* fifth column of a */ 736 w0 = _mm256_set1_pd(work[j*9+ 4]); 737 a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0); 738 a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1); 739 a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2); 740 741 /* sixth column of a */ 742 w1 = _mm256_set1_pd(work[j*9+ 5]); 743 a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0); 744 a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1); 745 a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2); 746 747 /* seventh column of a */ 748 w2 = _mm256_set1_pd(work[j*9+ 6]); 749 a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0); 750 a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1); 751 a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2); 752 753 /* eighth column of a */ 754 w3 = _mm256_set1_pd(work[j*9+ 7]); 755 a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0); 756 a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1); 757 a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2); 758 759 /* ninth column of a */ 760 w0 = _mm256_set1_pd(work[j*9+ 8]); 761 a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0); 762 a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1); 763 a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2); 764 } 765 766 _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2); 767 768 v += n*bs2; 769 if (!usecprow) z += bs; 770 } 771 PetscCall(VecRestoreArrayRead(xx,&x)); 772 PetscCall(VecRestoreArrayWrite(zz,&zarray)); 773 PetscCall(PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt)); 774 PetscFunctionReturn(0); 775 } 776 #endif 777 778 PetscErrorCode MatMult_SeqBAIJ_11(Mat A,Vec xx,Vec zz) 779 { 780 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 781 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11; 782 const PetscScalar *x,*xb; 783 PetscScalar *zarray,xv; 784 const MatScalar *v; 785 const PetscInt *ii,*ij=a->j,*idx; 786 PetscInt mbs,i,j,k,n,*ridx=NULL; 787 PetscBool usecprow=a->compressedrow.use; 788 789 PetscFunctionBegin; 790 PetscCall(VecGetArrayRead(xx,&x)); 791 PetscCall(VecGetArrayWrite(zz,&zarray)); 792 793 v = a->a; 794 if (usecprow) { 795 mbs = a->compressedrow.nrows; 796 ii = a->compressedrow.i; 797 ridx = a->compressedrow.rindex; 798 PetscCall(PetscArrayzero(zarray,11*a->mbs)); 799 } else { 800 mbs = a->mbs; 801 ii = a->i; 802 z = zarray; 803 } 804 805 for (i=0; i<mbs; i++) { 806 n = ii[i+1] - ii[i]; 807 idx = ij + ii[i]; 808 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 809 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; 810 811 for (j=0; j<n; j++) { 812 xb = x + 11*(idx[j]); 813 814 for (k=0; k<11; k++) { 815 xv = xb[k]; 816 sum1 += v[0]*xv; 817 sum2 += v[1]*xv; 818 sum3 += v[2]*xv; 819 sum4 += v[3]*xv; 820 sum5 += v[4]*xv; 821 sum6 += v[5]*xv; 822 sum7 += v[6]*xv; 823 sum8 += v[7]*xv; 824 sum9 += v[8]*xv; 825 sum10 += v[9]*xv; 826 sum11 += v[10]*xv; 827 v += 11; 828 } 829 } 830 if (usecprow) z = zarray + 11*ridx[i]; 831 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 832 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; 833 834 if (!usecprow) z += 11; 835 } 836 837 PetscCall(VecRestoreArrayRead(xx,&x)); 838 PetscCall(VecRestoreArrayWrite(zz,&zarray)); 839 PetscCall(PetscLogFlops(242.0*a->nz - 11.0*a->nonzerorowcnt)); 840 PetscFunctionReturn(0); 841 } 842 843 /* MatMult_SeqBAIJ_12 version 1: Columns in the block are accessed one at a time */ 844 PetscErrorCode MatMult_SeqBAIJ_12_ver1(Mat A,Vec xx,Vec zz) 845 { 846 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 847 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12; 848 const PetscScalar *x,*xb; 849 PetscScalar *zarray,xv; 850 const MatScalar *v; 851 const PetscInt *ii,*ij=a->j,*idx; 852 PetscInt mbs,i,j,k,n,*ridx=NULL; 853 PetscBool usecprow=a->compressedrow.use; 854 855 PetscFunctionBegin; 856 PetscCall(VecGetArrayRead(xx,&x)); 857 PetscCall(VecGetArrayWrite(zz,&zarray)); 858 859 v = a->a; 860 if (usecprow) { 861 mbs = a->compressedrow.nrows; 862 ii = a->compressedrow.i; 863 ridx = a->compressedrow.rindex; 864 PetscCall(PetscArrayzero(zarray,12*a->mbs)); 865 } else { 866 mbs = a->mbs; 867 ii = a->i; 868 z = zarray; 869 } 870 871 for (i=0; i<mbs; i++) { 872 n = ii[i+1] - ii[i]; 873 idx = ij + ii[i]; 874 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 875 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; 876 877 for (j=0; j<n; j++) { 878 xb = x + 12*(idx[j]); 879 880 for (k=0; k<12; k++) { 881 xv = xb[k]; 882 sum1 += v[0]*xv; 883 sum2 += v[1]*xv; 884 sum3 += v[2]*xv; 885 sum4 += v[3]*xv; 886 sum5 += v[4]*xv; 887 sum6 += v[5]*xv; 888 sum7 += v[6]*xv; 889 sum8 += v[7]*xv; 890 sum9 += v[8]*xv; 891 sum10 += v[9]*xv; 892 sum11 += v[10]*xv; 893 sum12 += v[11]*xv; 894 v += 12; 895 } 896 } 897 if (usecprow) z = zarray + 12*ridx[i]; 898 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 899 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; 900 if (!usecprow) z += 12; 901 } 902 PetscCall(VecRestoreArrayRead(xx,&x)); 903 PetscCall(VecRestoreArrayWrite(zz,&zarray)); 904 PetscCall(PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt)); 905 PetscFunctionReturn(0); 906 } 907 908 PetscErrorCode MatMultAdd_SeqBAIJ_12_ver1(Mat A,Vec xx,Vec yy,Vec zz) 909 { 910 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 911 PetscScalar *z = NULL,*y = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12; 912 const PetscScalar *x,*xb; 913 PetscScalar *zarray,*yarray,xv; 914 const MatScalar *v; 915 const PetscInt *ii,*ij=a->j,*idx; 916 PetscInt mbs = a->mbs,i,j,k,n,*ridx=NULL; 917 PetscBool usecprow=a->compressedrow.use; 918 919 PetscFunctionBegin; 920 PetscCall(VecGetArrayRead(xx,&x)); 921 PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray)); 922 923 v = a->a; 924 if (usecprow) { 925 if (zz != yy) { 926 PetscCall(PetscArraycpy(zarray,yarray,12*mbs)); 927 } 928 mbs = a->compressedrow.nrows; 929 ii = a->compressedrow.i; 930 ridx = a->compressedrow.rindex; 931 } else { 932 ii = a->i; 933 y = yarray; 934 z = zarray; 935 } 936 937 for (i=0; i<mbs; i++) { 938 n = ii[i+1] - ii[i]; 939 idx = ij + ii[i]; 940 941 if (usecprow) { 942 y = yarray + 12*ridx[i]; 943 z = zarray + 12*ridx[i]; 944 } 945 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 946 sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10]; sum12 = y[11]; 947 948 for (j=0; j<n; j++) { 949 xb = x + 12*(idx[j]); 950 951 for (k=0; k<12; k++) { 952 xv = xb[k]; 953 sum1 += v[0]*xv; 954 sum2 += v[1]*xv; 955 sum3 += v[2]*xv; 956 sum4 += v[3]*xv; 957 sum5 += v[4]*xv; 958 sum6 += v[5]*xv; 959 sum7 += v[6]*xv; 960 sum8 += v[7]*xv; 961 sum9 += v[8]*xv; 962 sum10 += v[9]*xv; 963 sum11 += v[10]*xv; 964 sum12 += v[11]*xv; 965 v += 12; 966 } 967 } 968 969 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 970 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; 971 if (!usecprow) { 972 y += 12; 973 z += 12; 974 } 975 } 976 PetscCall(VecRestoreArrayRead(xx,&x)); 977 PetscCall(VecRestoreArrayPair(yy,zz,&yarray,&zarray)); 978 PetscCall(PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt)); 979 PetscFunctionReturn(0); 980 } 981 982 /* MatMult_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */ 983 PetscErrorCode MatMult_SeqBAIJ_12_ver2(Mat A,Vec xx,Vec zz) 984 { 985 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 986 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12; 987 const PetscScalar *x,*xb; 988 PetscScalar x1,x2,x3,x4,*zarray; 989 const MatScalar *v; 990 const PetscInt *ii,*ij=a->j,*idx,*ridx=NULL; 991 PetscInt mbs,i,j,n; 992 PetscBool usecprow=a->compressedrow.use; 993 994 PetscFunctionBegin; 995 PetscCall(VecGetArrayRead(xx,&x)); 996 PetscCall(VecGetArrayWrite(zz,&zarray)); 997 998 v = a->a; 999 if (usecprow) { 1000 mbs = a->compressedrow.nrows; 1001 ii = a->compressedrow.i; 1002 ridx = a->compressedrow.rindex; 1003 PetscCall(PetscArrayzero(zarray,12*a->mbs)); 1004 } else { 1005 mbs = a->mbs; 1006 ii = a->i; 1007 z = zarray; 1008 } 1009 1010 for (i=0; i<mbs; i++) { 1011 n = ii[i+1] - ii[i]; 1012 idx = ij + ii[i]; 1013 1014 sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = sum8 = sum9 = sum10 = sum11 = sum12 = 0; 1015 for (j=0; j<n; j++) { 1016 xb = x + 12*(idx[j]); 1017 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1018 1019 sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4; 1020 sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4; 1021 sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4; 1022 sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4; 1023 sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4; 1024 sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4; 1025 sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4; 1026 sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4; 1027 sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4; 1028 sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4; 1029 sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4; 1030 sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4; 1031 v += 48; 1032 1033 x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7]; 1034 1035 sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4; 1036 sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4; 1037 sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4; 1038 sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4; 1039 sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4; 1040 sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4; 1041 sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4; 1042 sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4; 1043 sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4; 1044 sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4; 1045 sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4; 1046 sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4; 1047 v += 48; 1048 1049 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; 1050 sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4; 1051 sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4; 1052 sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4; 1053 sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4; 1054 sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4; 1055 sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4; 1056 sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4; 1057 sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4; 1058 sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4; 1059 sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4; 1060 sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4; 1061 sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4; 1062 v += 48; 1063 1064 } 1065 if (usecprow) z = zarray + 12*ridx[i]; 1066 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1067 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; 1068 if (!usecprow) z += 12; 1069 } 1070 PetscCall(VecRestoreArrayRead(xx,&x)); 1071 PetscCall(VecRestoreArrayWrite(zz,&zarray)); 1072 PetscCall(PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt)); 1073 PetscFunctionReturn(0); 1074 } 1075 1076 /* MatMultAdd_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */ 1077 PetscErrorCode MatMultAdd_SeqBAIJ_12_ver2(Mat A,Vec xx,Vec yy,Vec zz) 1078 { 1079 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1080 PetscScalar *z = NULL,*y = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12; 1081 const PetscScalar *x,*xb; 1082 PetscScalar x1,x2,x3,x4,*zarray,*yarray; 1083 const MatScalar *v; 1084 const PetscInt *ii,*ij=a->j,*idx,*ridx=NULL; 1085 PetscInt mbs = a->mbs,i,j,n; 1086 PetscBool usecprow=a->compressedrow.use; 1087 1088 PetscFunctionBegin; 1089 PetscCall(VecGetArrayRead(xx,&x)); 1090 PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray)); 1091 1092 v = a->a; 1093 if (usecprow) { 1094 if (zz != yy) { 1095 PetscCall(PetscArraycpy(zarray,yarray,12*mbs)); 1096 } 1097 mbs = a->compressedrow.nrows; 1098 ii = a->compressedrow.i; 1099 ridx = a->compressedrow.rindex; 1100 } else { 1101 ii = a->i; 1102 y = yarray; 1103 z = zarray; 1104 } 1105 1106 for (i=0; i<mbs; i++) { 1107 n = ii[i+1] - ii[i]; 1108 idx = ij + ii[i]; 1109 1110 if (usecprow) { 1111 y = yarray + 12*ridx[i]; 1112 z = zarray + 12*ridx[i]; 1113 } 1114 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 1115 sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10]; sum12 = y[11]; 1116 1117 for (j=0; j<n; j++) { 1118 xb = x + 12*(idx[j]); 1119 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1120 1121 sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4; 1122 sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4; 1123 sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4; 1124 sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4; 1125 sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4; 1126 sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4; 1127 sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4; 1128 sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4; 1129 sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4; 1130 sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4; 1131 sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4; 1132 sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4; 1133 v += 48; 1134 1135 x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7]; 1136 1137 sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4; 1138 sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4; 1139 sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4; 1140 sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4; 1141 sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4; 1142 sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4; 1143 sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4; 1144 sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4; 1145 sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4; 1146 sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4; 1147 sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4; 1148 sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4; 1149 v += 48; 1150 1151 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; 1152 sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4; 1153 sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4; 1154 sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4; 1155 sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4; 1156 sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4; 1157 sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4; 1158 sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4; 1159 sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4; 1160 sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4; 1161 sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4; 1162 sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4; 1163 sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4; 1164 v += 48; 1165 1166 } 1167 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1168 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; 1169 if (!usecprow) { 1170 y += 12; 1171 z += 12; 1172 } 1173 } 1174 PetscCall(VecRestoreArrayRead(xx,&x)); 1175 PetscCall(VecRestoreArrayPair(yy,zz,&yarray,&zarray)); 1176 PetscCall(PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt)); 1177 PetscFunctionReturn(0); 1178 } 1179 1180 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 1181 PetscErrorCode MatMult_SeqBAIJ_12_AVX2(Mat A,Vec xx,Vec zz) 1182 { 1183 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1184 PetscScalar *z = NULL,*zarray; 1185 const PetscScalar *x,*work; 1186 const MatScalar *v = a->a; 1187 PetscInt mbs,i,j,n; 1188 const PetscInt *idx = a->j,*ii,*ridx=NULL; 1189 PetscBool usecprow=a->compressedrow.use; 1190 const PetscInt bs = 12, bs2 = 144; 1191 1192 __m256d a0,a1,a2,a3,a4,a5; 1193 __m256d w0,w1,w2,w3; 1194 __m256d z0,z1,z2; 1195 1196 PetscFunctionBegin; 1197 PetscCall(VecGetArrayRead(xx,&x)); 1198 PetscCall(VecGetArrayWrite(zz,&zarray)); 1199 1200 if (usecprow) { 1201 mbs = a->compressedrow.nrows; 1202 ii = a->compressedrow.i; 1203 ridx = a->compressedrow.rindex; 1204 PetscCall(PetscArrayzero(zarray,bs*a->mbs)); 1205 } else { 1206 mbs = a->mbs; 1207 ii = a->i; 1208 z = zarray; 1209 } 1210 1211 for (i=0; i<mbs; i++) { 1212 z0 = _mm256_setzero_pd(); z1 = _mm256_setzero_pd(); z2 = _mm256_setzero_pd(); 1213 1214 n = ii[1] - ii[0]; ii++; 1215 for (j=0; j<n; j++) { 1216 work = x + bs*(*idx++); 1217 1218 /* first column of a */ 1219 w0 = _mm256_set1_pd(work[0]); 1220 a0 = _mm256_loadu_pd(v+0); z0 = _mm256_fmadd_pd(a0,w0,z0); 1221 a1 = _mm256_loadu_pd(v+4); z1 = _mm256_fmadd_pd(a1,w0,z1); 1222 a2 = _mm256_loadu_pd(v+8); z2 = _mm256_fmadd_pd(a2,w0,z2); 1223 1224 /* second column of a */ 1225 w1 = _mm256_set1_pd(work[1]); 1226 a3 = _mm256_loadu_pd(v+12); z0 = _mm256_fmadd_pd(a3,w1,z0); 1227 a4 = _mm256_loadu_pd(v+16); z1 = _mm256_fmadd_pd(a4,w1,z1); 1228 a5 = _mm256_loadu_pd(v+20); z2 = _mm256_fmadd_pd(a5,w1,z2); 1229 1230 /* third column of a */ 1231 w2 = _mm256_set1_pd(work[2]); 1232 a0 = _mm256_loadu_pd(v+24); z0 = _mm256_fmadd_pd(a0,w2,z0); 1233 a1 = _mm256_loadu_pd(v+28); z1 = _mm256_fmadd_pd(a1,w2,z1); 1234 a2 = _mm256_loadu_pd(v+32); z2 = _mm256_fmadd_pd(a2,w2,z2); 1235 1236 /* fourth column of a */ 1237 w3 = _mm256_set1_pd(work[3]); 1238 a3 = _mm256_loadu_pd(v+36); z0 = _mm256_fmadd_pd(a3,w3,z0); 1239 a4 = _mm256_loadu_pd(v+40); z1 = _mm256_fmadd_pd(a4,w3,z1); 1240 a5 = _mm256_loadu_pd(v+44); z2 = _mm256_fmadd_pd(a5,w3,z2); 1241 1242 /* fifth column of a */ 1243 w0 = _mm256_set1_pd(work[4]); 1244 a0 = _mm256_loadu_pd(v+48); z0 = _mm256_fmadd_pd(a0,w0,z0); 1245 a1 = _mm256_loadu_pd(v+52); z1 = _mm256_fmadd_pd(a1,w0,z1); 1246 a2 = _mm256_loadu_pd(v+56); z2 = _mm256_fmadd_pd(a2,w0,z2); 1247 1248 /* sixth column of a */ 1249 w1 = _mm256_set1_pd(work[5]); 1250 a3 = _mm256_loadu_pd(v+60); z0 = _mm256_fmadd_pd(a3,w1,z0); 1251 a4 = _mm256_loadu_pd(v+64); z1 = _mm256_fmadd_pd(a4,w1,z1); 1252 a5 = _mm256_loadu_pd(v+68); z2 = _mm256_fmadd_pd(a5,w1,z2); 1253 1254 /* seventh column of a */ 1255 w2 = _mm256_set1_pd(work[6]); 1256 a0 = _mm256_loadu_pd(v+72); z0 = _mm256_fmadd_pd(a0,w2,z0); 1257 a1 = _mm256_loadu_pd(v+76); z1 = _mm256_fmadd_pd(a1,w2,z1); 1258 a2 = _mm256_loadu_pd(v+80); z2 = _mm256_fmadd_pd(a2,w2,z2); 1259 1260 /* eighth column of a */ 1261 w3 = _mm256_set1_pd(work[7]); 1262 a3 = _mm256_loadu_pd(v+84); z0 = _mm256_fmadd_pd(a3,w3,z0); 1263 a4 = _mm256_loadu_pd(v+88); z1 = _mm256_fmadd_pd(a4,w3,z1); 1264 a5 = _mm256_loadu_pd(v+92); z2 = _mm256_fmadd_pd(a5,w3,z2); 1265 1266 /* ninth column of a */ 1267 w0 = _mm256_set1_pd(work[8]); 1268 a0 = _mm256_loadu_pd(v+96); z0 = _mm256_fmadd_pd(a0,w0,z0); 1269 a1 = _mm256_loadu_pd(v+100); z1 = _mm256_fmadd_pd(a1,w0,z1); 1270 a2 = _mm256_loadu_pd(v+104); z2 = _mm256_fmadd_pd(a2,w0,z2); 1271 1272 /* tenth column of a */ 1273 w1 = _mm256_set1_pd(work[9]); 1274 a3 = _mm256_loadu_pd(v+108); z0 = _mm256_fmadd_pd(a3,w1,z0); 1275 a4 = _mm256_loadu_pd(v+112); z1 = _mm256_fmadd_pd(a4,w1,z1); 1276 a5 = _mm256_loadu_pd(v+116); z2 = _mm256_fmadd_pd(a5,w1,z2); 1277 1278 /* eleventh column of a */ 1279 w2 = _mm256_set1_pd(work[10]); 1280 a0 = _mm256_loadu_pd(v+120); z0 = _mm256_fmadd_pd(a0,w2,z0); 1281 a1 = _mm256_loadu_pd(v+124); z1 = _mm256_fmadd_pd(a1,w2,z1); 1282 a2 = _mm256_loadu_pd(v+128); z2 = _mm256_fmadd_pd(a2,w2,z2); 1283 1284 /* twelveth column of a */ 1285 w3 = _mm256_set1_pd(work[11]); 1286 a3 = _mm256_loadu_pd(v+132); z0 = _mm256_fmadd_pd(a3,w3,z0); 1287 a4 = _mm256_loadu_pd(v+136); z1 = _mm256_fmadd_pd(a4,w3,z1); 1288 a5 = _mm256_loadu_pd(v+140); z2 = _mm256_fmadd_pd(a5,w3,z2); 1289 1290 v += bs2; 1291 } 1292 if (usecprow) z = zarray + bs*ridx[i]; 1293 _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_storeu_pd(&z[ 8], z2); 1294 if (!usecprow) z += bs; 1295 } 1296 PetscCall(VecRestoreArrayRead(xx,&x)); 1297 PetscCall(VecRestoreArrayWrite(zz,&zarray)); 1298 PetscCall(PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt)); 1299 PetscFunctionReturn(0); 1300 } 1301 #endif 1302 1303 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */ 1304 /* Default MatMult for block size 15 */ 1305 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz) 1306 { 1307 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1308 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1309 const PetscScalar *x,*xb; 1310 PetscScalar *zarray,xv; 1311 const MatScalar *v; 1312 const PetscInt *ii,*ij=a->j,*idx; 1313 PetscInt mbs,i,j,k,n,*ridx=NULL; 1314 PetscBool usecprow=a->compressedrow.use; 1315 1316 PetscFunctionBegin; 1317 PetscCall(VecGetArrayRead(xx,&x)); 1318 PetscCall(VecGetArrayWrite(zz,&zarray)); 1319 1320 v = a->a; 1321 if (usecprow) { 1322 mbs = a->compressedrow.nrows; 1323 ii = a->compressedrow.i; 1324 ridx = a->compressedrow.rindex; 1325 PetscCall(PetscArrayzero(zarray,15*a->mbs)); 1326 } else { 1327 mbs = a->mbs; 1328 ii = a->i; 1329 z = zarray; 1330 } 1331 1332 for (i=0; i<mbs; i++) { 1333 n = ii[i+1] - ii[i]; 1334 idx = ij + ii[i]; 1335 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1336 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1337 1338 for (j=0; j<n; j++) { 1339 xb = x + 15*(idx[j]); 1340 1341 for (k=0; k<15; k++) { 1342 xv = xb[k]; 1343 sum1 += v[0]*xv; 1344 sum2 += v[1]*xv; 1345 sum3 += v[2]*xv; 1346 sum4 += v[3]*xv; 1347 sum5 += v[4]*xv; 1348 sum6 += v[5]*xv; 1349 sum7 += v[6]*xv; 1350 sum8 += v[7]*xv; 1351 sum9 += v[8]*xv; 1352 sum10 += v[9]*xv; 1353 sum11 += v[10]*xv; 1354 sum12 += v[11]*xv; 1355 sum13 += v[12]*xv; 1356 sum14 += v[13]*xv; 1357 sum15 += v[14]*xv; 1358 v += 15; 1359 } 1360 } 1361 if (usecprow) z = zarray + 15*ridx[i]; 1362 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1363 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1364 1365 if (!usecprow) z += 15; 1366 } 1367 1368 PetscCall(VecRestoreArrayRead(xx,&x)); 1369 PetscCall(VecRestoreArrayWrite(zz,&zarray)); 1370 PetscCall(PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt)); 1371 PetscFunctionReturn(0); 1372 } 1373 1374 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */ 1375 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz) 1376 { 1377 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1378 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1379 const PetscScalar *x,*xb; 1380 PetscScalar x1,x2,x3,x4,*zarray; 1381 const MatScalar *v; 1382 const PetscInt *ii,*ij=a->j,*idx; 1383 PetscInt mbs,i,j,n,*ridx=NULL; 1384 PetscBool usecprow=a->compressedrow.use; 1385 1386 PetscFunctionBegin; 1387 PetscCall(VecGetArrayRead(xx,&x)); 1388 PetscCall(VecGetArrayWrite(zz,&zarray)); 1389 1390 v = a->a; 1391 if (usecprow) { 1392 mbs = a->compressedrow.nrows; 1393 ii = a->compressedrow.i; 1394 ridx = a->compressedrow.rindex; 1395 PetscCall(PetscArrayzero(zarray,15*a->mbs)); 1396 } else { 1397 mbs = a->mbs; 1398 ii = a->i; 1399 z = zarray; 1400 } 1401 1402 for (i=0; i<mbs; i++) { 1403 n = ii[i+1] - ii[i]; 1404 idx = ij + ii[i]; 1405 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1406 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1407 1408 for (j=0; j<n; j++) { 1409 xb = x + 15*(idx[j]); 1410 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1411 1412 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 1413 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 1414 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 1415 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 1416 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 1417 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 1418 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 1419 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 1420 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 1421 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 1422 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 1423 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 1424 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 1425 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 1426 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 1427 1428 v += 60; 1429 1430 x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7]; 1431 1432 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 1433 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 1434 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 1435 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 1436 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 1437 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 1438 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 1439 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 1440 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 1441 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 1442 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 1443 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 1444 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 1445 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 1446 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 1447 v += 60; 1448 1449 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; 1450 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 1451 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 1452 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 1453 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 1454 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 1455 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 1456 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 1457 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 1458 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 1459 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 1460 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 1461 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 1462 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 1463 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 1464 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 1465 v += 60; 1466 1467 x1 = xb[12]; x2 = xb[13]; x3 = xb[14]; 1468 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3; 1469 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3; 1470 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3; 1471 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3; 1472 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3; 1473 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3; 1474 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3; 1475 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3; 1476 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3; 1477 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3; 1478 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3; 1479 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3; 1480 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3; 1481 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3; 1482 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3; 1483 v += 45; 1484 } 1485 if (usecprow) z = zarray + 15*ridx[i]; 1486 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1487 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1488 1489 if (!usecprow) z += 15; 1490 } 1491 1492 PetscCall(VecRestoreArrayRead(xx,&x)); 1493 PetscCall(VecRestoreArrayWrite(zz,&zarray)); 1494 PetscCall(PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt)); 1495 PetscFunctionReturn(0); 1496 } 1497 1498 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */ 1499 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz) 1500 { 1501 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1502 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1503 const PetscScalar *x,*xb; 1504 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,*zarray; 1505 const MatScalar *v; 1506 const PetscInt *ii,*ij=a->j,*idx; 1507 PetscInt mbs,i,j,n,*ridx=NULL; 1508 PetscBool usecprow=a->compressedrow.use; 1509 1510 PetscFunctionBegin; 1511 PetscCall(VecGetArrayRead(xx,&x)); 1512 PetscCall(VecGetArrayWrite(zz,&zarray)); 1513 1514 v = a->a; 1515 if (usecprow) { 1516 mbs = a->compressedrow.nrows; 1517 ii = a->compressedrow.i; 1518 ridx = a->compressedrow.rindex; 1519 PetscCall(PetscArrayzero(zarray,15*a->mbs)); 1520 } else { 1521 mbs = a->mbs; 1522 ii = a->i; 1523 z = zarray; 1524 } 1525 1526 for (i=0; i<mbs; i++) { 1527 n = ii[i+1] - ii[i]; 1528 idx = ij + ii[i]; 1529 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1530 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1531 1532 for (j=0; j<n; j++) { 1533 xb = x + 15*(idx[j]); 1534 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1535 x8 = xb[7]; 1536 1537 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8; 1538 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8; 1539 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8; 1540 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8; 1541 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8; 1542 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8; 1543 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8; 1544 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8; 1545 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8; 1546 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8; 1547 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8; 1548 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8; 1549 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8; 1550 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8; 1551 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8; 1552 v += 120; 1553 1554 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14]; 1555 1556 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7; 1557 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7; 1558 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7; 1559 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7; 1560 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7; 1561 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7; 1562 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7; 1563 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7; 1564 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7; 1565 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7; 1566 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7; 1567 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7; 1568 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7; 1569 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7; 1570 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7; 1571 v += 105; 1572 } 1573 if (usecprow) z = zarray + 15*ridx[i]; 1574 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1575 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1576 1577 if (!usecprow) z += 15; 1578 } 1579 1580 PetscCall(VecRestoreArrayRead(xx,&x)); 1581 PetscCall(VecRestoreArrayWrite(zz,&zarray)); 1582 PetscCall(PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt)); 1583 PetscFunctionReturn(0); 1584 } 1585 1586 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */ 1587 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz) 1588 { 1589 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1590 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1591 const PetscScalar *x,*xb; 1592 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray; 1593 const MatScalar *v; 1594 const PetscInt *ii,*ij=a->j,*idx; 1595 PetscInt mbs,i,j,n,*ridx=NULL; 1596 PetscBool usecprow=a->compressedrow.use; 1597 1598 PetscFunctionBegin; 1599 PetscCall(VecGetArrayRead(xx,&x)); 1600 PetscCall(VecGetArrayWrite(zz,&zarray)); 1601 1602 v = a->a; 1603 if (usecprow) { 1604 mbs = a->compressedrow.nrows; 1605 ii = a->compressedrow.i; 1606 ridx = a->compressedrow.rindex; 1607 PetscCall(PetscArrayzero(zarray,15*a->mbs)); 1608 } else { 1609 mbs = a->mbs; 1610 ii = a->i; 1611 z = zarray; 1612 } 1613 1614 for (i=0; i<mbs; i++) { 1615 n = ii[i+1] - ii[i]; 1616 idx = ij + ii[i]; 1617 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1618 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1619 1620 for (j=0; j<n; j++) { 1621 xb = x + 15*(idx[j]); 1622 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1623 x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14]; 1624 1625 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 1626 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 1627 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 1628 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 1629 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 1630 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 1631 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 1632 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 1633 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 1634 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 1635 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 1636 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 1637 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 1638 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 1639 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 1640 v += 225; 1641 } 1642 if (usecprow) z = zarray + 15*ridx[i]; 1643 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1644 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1645 1646 if (!usecprow) z += 15; 1647 } 1648 1649 PetscCall(VecRestoreArrayRead(xx,&x)); 1650 PetscCall(VecRestoreArrayWrite(zz,&zarray)); 1651 PetscCall(PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt)); 1652 PetscFunctionReturn(0); 1653 } 1654 1655 /* 1656 This will not work with MatScalar == float because it calls the BLAS 1657 */ 1658 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz) 1659 { 1660 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1661 PetscScalar *z = NULL,*work,*workt,*zarray; 1662 const PetscScalar *x,*xb; 1663 const MatScalar *v; 1664 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 1665 const PetscInt *idx,*ii,*ridx=NULL; 1666 PetscInt ncols,k; 1667 PetscBool usecprow=a->compressedrow.use; 1668 1669 PetscFunctionBegin; 1670 PetscCall(VecGetArrayRead(xx,&x)); 1671 PetscCall(VecGetArrayWrite(zz,&zarray)); 1672 1673 idx = a->j; 1674 v = a->a; 1675 if (usecprow) { 1676 mbs = a->compressedrow.nrows; 1677 ii = a->compressedrow.i; 1678 ridx = a->compressedrow.rindex; 1679 PetscCall(PetscArrayzero(zarray,bs*a->mbs)); 1680 } else { 1681 mbs = a->mbs; 1682 ii = a->i; 1683 z = zarray; 1684 } 1685 1686 if (!a->mult_work) { 1687 k = PetscMax(A->rmap->n,A->cmap->n); 1688 PetscCall(PetscMalloc1(k+1,&a->mult_work)); 1689 } 1690 work = a->mult_work; 1691 for (i=0; i<mbs; i++) { 1692 n = ii[1] - ii[0]; ii++; 1693 ncols = n*bs; 1694 workt = work; 1695 for (j=0; j<n; j++) { 1696 xb = x + bs*(*idx++); 1697 for (k=0; k<bs; k++) workt[k] = xb[k]; 1698 workt += bs; 1699 } 1700 if (usecprow) z = zarray + bs*ridx[i]; 1701 PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z); 1702 v += n*bs2; 1703 if (!usecprow) z += bs; 1704 } 1705 PetscCall(VecRestoreArrayRead(xx,&x)); 1706 PetscCall(VecRestoreArrayWrite(zz,&zarray)); 1707 PetscCall(PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt)); 1708 PetscFunctionReturn(0); 1709 } 1710 1711 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 1712 { 1713 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1714 const PetscScalar *x; 1715 PetscScalar *y,*z,sum; 1716 const MatScalar *v; 1717 PetscInt mbs=a->mbs,i,n,*ridx=NULL; 1718 const PetscInt *idx,*ii; 1719 PetscBool usecprow=a->compressedrow.use; 1720 1721 PetscFunctionBegin; 1722 PetscCall(VecGetArrayRead(xx,&x)); 1723 PetscCall(VecGetArrayPair(yy,zz,&y,&z)); 1724 1725 idx = a->j; 1726 v = a->a; 1727 if (usecprow) { 1728 if (zz != yy) { 1729 PetscCall(PetscArraycpy(z,y,mbs)); 1730 } 1731 mbs = a->compressedrow.nrows; 1732 ii = a->compressedrow.i; 1733 ridx = a->compressedrow.rindex; 1734 } else { 1735 ii = a->i; 1736 } 1737 1738 for (i=0; i<mbs; i++) { 1739 n = ii[1] - ii[0]; 1740 ii++; 1741 if (!usecprow) { 1742 sum = y[i]; 1743 } else { 1744 sum = y[ridx[i]]; 1745 } 1746 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1747 PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1748 PetscSparseDensePlusDot(sum,x,v,idx,n); 1749 v += n; 1750 idx += n; 1751 if (usecprow) { 1752 z[ridx[i]] = sum; 1753 } else { 1754 z[i] = sum; 1755 } 1756 } 1757 PetscCall(VecRestoreArrayRead(xx,&x)); 1758 PetscCall(VecRestoreArrayPair(yy,zz,&y,&z)); 1759 PetscCall(PetscLogFlops(2.0*a->nz)); 1760 PetscFunctionReturn(0); 1761 } 1762 1763 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 1764 { 1765 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1766 PetscScalar *y = NULL,*z = NULL,sum1,sum2; 1767 const PetscScalar *x,*xb; 1768 PetscScalar x1,x2,*yarray,*zarray; 1769 const MatScalar *v; 1770 PetscInt mbs = a->mbs,i,n,j; 1771 const PetscInt *idx,*ii,*ridx = NULL; 1772 PetscBool usecprow = a->compressedrow.use; 1773 1774 PetscFunctionBegin; 1775 PetscCall(VecGetArrayRead(xx,&x)); 1776 PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray)); 1777 1778 idx = a->j; 1779 v = a->a; 1780 if (usecprow) { 1781 if (zz != yy) { 1782 PetscCall(PetscArraycpy(zarray,yarray,2*mbs)); 1783 } 1784 mbs = a->compressedrow.nrows; 1785 ii = a->compressedrow.i; 1786 ridx = a->compressedrow.rindex; 1787 } else { 1788 ii = a->i; 1789 y = yarray; 1790 z = zarray; 1791 } 1792 1793 for (i=0; i<mbs; i++) { 1794 n = ii[1] - ii[0]; ii++; 1795 if (usecprow) { 1796 z = zarray + 2*ridx[i]; 1797 y = yarray + 2*ridx[i]; 1798 } 1799 sum1 = y[0]; sum2 = y[1]; 1800 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1801 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1802 for (j=0; j<n; j++) { 1803 xb = x + 2*(*idx++); 1804 x1 = xb[0]; 1805 x2 = xb[1]; 1806 1807 sum1 += v[0]*x1 + v[2]*x2; 1808 sum2 += v[1]*x1 + v[3]*x2; 1809 v += 4; 1810 } 1811 z[0] = sum1; z[1] = sum2; 1812 if (!usecprow) { 1813 z += 2; y += 2; 1814 } 1815 } 1816 PetscCall(VecRestoreArrayRead(xx,&x)); 1817 PetscCall(VecRestoreArrayPair(yy,zz,&yarray,&zarray)); 1818 PetscCall(PetscLogFlops(4.0*a->nz)); 1819 PetscFunctionReturn(0); 1820 } 1821 1822 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 1823 { 1824 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1825 PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray; 1826 const PetscScalar *x,*xb; 1827 const MatScalar *v; 1828 PetscInt mbs = a->mbs,i,j,n; 1829 const PetscInt *idx,*ii,*ridx = NULL; 1830 PetscBool usecprow = a->compressedrow.use; 1831 1832 PetscFunctionBegin; 1833 PetscCall(VecGetArrayRead(xx,&x)); 1834 PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray)); 1835 1836 idx = a->j; 1837 v = a->a; 1838 if (usecprow) { 1839 if (zz != yy) { 1840 PetscCall(PetscArraycpy(zarray,yarray,3*mbs)); 1841 } 1842 mbs = a->compressedrow.nrows; 1843 ii = a->compressedrow.i; 1844 ridx = a->compressedrow.rindex; 1845 } else { 1846 ii = a->i; 1847 y = yarray; 1848 z = zarray; 1849 } 1850 1851 for (i=0; i<mbs; i++) { 1852 n = ii[1] - ii[0]; ii++; 1853 if (usecprow) { 1854 z = zarray + 3*ridx[i]; 1855 y = yarray + 3*ridx[i]; 1856 } 1857 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; 1858 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1859 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1860 for (j=0; j<n; j++) { 1861 xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1862 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 1863 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 1864 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 1865 v += 9; 1866 } 1867 z[0] = sum1; z[1] = sum2; z[2] = sum3; 1868 if (!usecprow) { 1869 z += 3; y += 3; 1870 } 1871 } 1872 PetscCall(VecRestoreArrayRead(xx,&x)); 1873 PetscCall(VecRestoreArrayPair(yy,zz,&yarray,&zarray)); 1874 PetscCall(PetscLogFlops(18.0*a->nz)); 1875 PetscFunctionReturn(0); 1876 } 1877 1878 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 1879 { 1880 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1881 PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray; 1882 const PetscScalar *x,*xb; 1883 const MatScalar *v; 1884 PetscInt mbs = a->mbs,i,j,n; 1885 const PetscInt *idx,*ii,*ridx=NULL; 1886 PetscBool usecprow=a->compressedrow.use; 1887 1888 PetscFunctionBegin; 1889 PetscCall(VecGetArrayRead(xx,&x)); 1890 PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray)); 1891 1892 idx = a->j; 1893 v = a->a; 1894 if (usecprow) { 1895 if (zz != yy) { 1896 PetscCall(PetscArraycpy(zarray,yarray,4*mbs)); 1897 } 1898 mbs = a->compressedrow.nrows; 1899 ii = a->compressedrow.i; 1900 ridx = a->compressedrow.rindex; 1901 } else { 1902 ii = a->i; 1903 y = yarray; 1904 z = zarray; 1905 } 1906 1907 for (i=0; i<mbs; i++) { 1908 n = ii[1] - ii[0]; ii++; 1909 if (usecprow) { 1910 z = zarray + 4*ridx[i]; 1911 y = yarray + 4*ridx[i]; 1912 } 1913 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; 1914 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1915 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1916 for (j=0; j<n; j++) { 1917 xb = x + 4*(*idx++); 1918 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1919 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1920 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1921 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1922 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1923 v += 16; 1924 } 1925 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 1926 if (!usecprow) { 1927 z += 4; y += 4; 1928 } 1929 } 1930 PetscCall(VecRestoreArrayRead(xx,&x)); 1931 PetscCall(VecRestoreArrayPair(yy,zz,&yarray,&zarray)); 1932 PetscCall(PetscLogFlops(32.0*a->nz)); 1933 PetscFunctionReturn(0); 1934 } 1935 1936 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 1937 { 1938 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1939 PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5; 1940 const PetscScalar *x,*xb; 1941 PetscScalar *yarray,*zarray; 1942 const MatScalar *v; 1943 PetscInt mbs = a->mbs,i,j,n; 1944 const PetscInt *idx,*ii,*ridx = NULL; 1945 PetscBool usecprow=a->compressedrow.use; 1946 1947 PetscFunctionBegin; 1948 PetscCall(VecGetArrayRead(xx,&x)); 1949 PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray)); 1950 1951 idx = a->j; 1952 v = a->a; 1953 if (usecprow) { 1954 if (zz != yy) { 1955 PetscCall(PetscArraycpy(zarray,yarray,5*mbs)); 1956 } 1957 mbs = a->compressedrow.nrows; 1958 ii = a->compressedrow.i; 1959 ridx = a->compressedrow.rindex; 1960 } else { 1961 ii = a->i; 1962 y = yarray; 1963 z = zarray; 1964 } 1965 1966 for (i=0; i<mbs; i++) { 1967 n = ii[1] - ii[0]; ii++; 1968 if (usecprow) { 1969 z = zarray + 5*ridx[i]; 1970 y = yarray + 5*ridx[i]; 1971 } 1972 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; 1973 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1974 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1975 for (j=0; j<n; j++) { 1976 xb = x + 5*(*idx++); 1977 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 1978 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1979 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1980 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1981 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1982 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1983 v += 25; 1984 } 1985 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 1986 if (!usecprow) { 1987 z += 5; y += 5; 1988 } 1989 } 1990 PetscCall(VecRestoreArrayRead(xx,&x)); 1991 PetscCall(VecRestoreArrayPair(yy,zz,&yarray,&zarray)); 1992 PetscCall(PetscLogFlops(50.0*a->nz)); 1993 PetscFunctionReturn(0); 1994 } 1995 1996 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 1997 { 1998 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1999 PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6; 2000 const PetscScalar *x,*xb; 2001 PetscScalar x1,x2,x3,x4,x5,x6,*yarray,*zarray; 2002 const MatScalar *v; 2003 PetscInt mbs = a->mbs,i,j,n; 2004 const PetscInt *idx,*ii,*ridx=NULL; 2005 PetscBool usecprow=a->compressedrow.use; 2006 2007 PetscFunctionBegin; 2008 PetscCall(VecGetArrayRead(xx,&x)); 2009 PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray)); 2010 2011 idx = a->j; 2012 v = a->a; 2013 if (usecprow) { 2014 if (zz != yy) { 2015 PetscCall(PetscArraycpy(zarray,yarray,6*mbs)); 2016 } 2017 mbs = a->compressedrow.nrows; 2018 ii = a->compressedrow.i; 2019 ridx = a->compressedrow.rindex; 2020 } else { 2021 ii = a->i; 2022 y = yarray; 2023 z = zarray; 2024 } 2025 2026 for (i=0; i<mbs; i++) { 2027 n = ii[1] - ii[0]; ii++; 2028 if (usecprow) { 2029 z = zarray + 6*ridx[i]; 2030 y = yarray + 6*ridx[i]; 2031 } 2032 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; 2033 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2034 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2035 for (j=0; j<n; j++) { 2036 xb = x + 6*(*idx++); 2037 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 2038 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2039 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2040 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2041 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2042 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2043 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2044 v += 36; 2045 } 2046 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 2047 if (!usecprow) { 2048 z += 6; y += 6; 2049 } 2050 } 2051 PetscCall(VecRestoreArrayRead(xx,&x)); 2052 PetscCall(VecRestoreArrayPair(yy,zz,&yarray,&zarray)); 2053 PetscCall(PetscLogFlops(72.0*a->nz)); 2054 PetscFunctionReturn(0); 2055 } 2056 2057 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 2058 { 2059 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2060 PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 2061 const PetscScalar *x,*xb; 2062 PetscScalar x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray; 2063 const MatScalar *v; 2064 PetscInt mbs = a->mbs,i,j,n; 2065 const PetscInt *idx,*ii,*ridx = NULL; 2066 PetscBool usecprow=a->compressedrow.use; 2067 2068 PetscFunctionBegin; 2069 PetscCall(VecGetArrayRead(xx,&x)); 2070 PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray)); 2071 2072 idx = a->j; 2073 v = a->a; 2074 if (usecprow) { 2075 if (zz != yy) { 2076 PetscCall(PetscArraycpy(zarray,yarray,7*mbs)); 2077 } 2078 mbs = a->compressedrow.nrows; 2079 ii = a->compressedrow.i; 2080 ridx = a->compressedrow.rindex; 2081 } else { 2082 ii = a->i; 2083 y = yarray; 2084 z = zarray; 2085 } 2086 2087 for (i=0; i<mbs; i++) { 2088 n = ii[1] - ii[0]; ii++; 2089 if (usecprow) { 2090 z = zarray + 7*ridx[i]; 2091 y = yarray + 7*ridx[i]; 2092 } 2093 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 2094 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2095 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2096 for (j=0; j<n; j++) { 2097 xb = x + 7*(*idx++); 2098 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 2099 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2100 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2101 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2102 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2103 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2104 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2105 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2106 v += 49; 2107 } 2108 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 2109 if (!usecprow) { 2110 z += 7; y += 7; 2111 } 2112 } 2113 PetscCall(VecRestoreArrayRead(xx,&x)); 2114 PetscCall(VecRestoreArrayPair(yy,zz,&yarray,&zarray)); 2115 PetscCall(PetscLogFlops(98.0*a->nz)); 2116 PetscFunctionReturn(0); 2117 } 2118 2119 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 2120 PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec yy,Vec zz) 2121 { 2122 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2123 PetscScalar *z = NULL,*work,*workt,*zarray; 2124 const PetscScalar *x,*xb; 2125 const MatScalar *v; 2126 PetscInt mbs,i,j,n; 2127 PetscInt k; 2128 PetscBool usecprow=a->compressedrow.use; 2129 const PetscInt *idx,*ii,*ridx=NULL,bs = 9, bs2 = 81; 2130 2131 __m256d a0,a1,a2,a3,a4,a5; 2132 __m256d w0,w1,w2,w3; 2133 __m256d z0,z1,z2; 2134 __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63); 2135 2136 PetscFunctionBegin; 2137 PetscCall(VecCopy(yy,zz)); 2138 PetscCall(VecGetArrayRead(xx,&x)); 2139 PetscCall(VecGetArray(zz,&zarray)); 2140 2141 idx = a->j; 2142 v = a->a; 2143 if (usecprow) { 2144 mbs = a->compressedrow.nrows; 2145 ii = a->compressedrow.i; 2146 ridx = a->compressedrow.rindex; 2147 } else { 2148 mbs = a->mbs; 2149 ii = a->i; 2150 z = zarray; 2151 } 2152 2153 if (!a->mult_work) { 2154 k = PetscMax(A->rmap->n,A->cmap->n); 2155 PetscCall(PetscMalloc1(k+1,&a->mult_work)); 2156 } 2157 2158 work = a->mult_work; 2159 for (i=0; i<mbs; i++) { 2160 n = ii[1] - ii[0]; ii++; 2161 workt = work; 2162 for (j=0; j<n; j++) { 2163 xb = x + bs*(*idx++); 2164 for (k=0; k<bs; k++) workt[k] = xb[k]; 2165 workt += bs; 2166 } 2167 if (usecprow) z = zarray + bs*ridx[i]; 2168 2169 z0 = _mm256_loadu_pd(&z[ 0]); z1 = _mm256_loadu_pd(&z[ 4]); z2 = _mm256_set1_pd(z[ 8]); 2170 2171 for (j=0; j<n; j++) { 2172 /* first column of a */ 2173 w0 = _mm256_set1_pd(work[j*9 ]); 2174 a0 = _mm256_loadu_pd(&v[j*81 ]); z0 = _mm256_fmadd_pd(a0,w0,z0); 2175 a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1); 2176 a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2); 2177 2178 /* second column of a */ 2179 w1 = _mm256_set1_pd(work[j*9+ 1]); 2180 a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0); 2181 a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1); 2182 a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2); 2183 2184 /* third column of a */ 2185 w2 = _mm256_set1_pd(work[j*9+ 2]); 2186 a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0); 2187 a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1); 2188 a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2); 2189 2190 /* fourth column of a */ 2191 w3 = _mm256_set1_pd(work[j*9+ 3]); 2192 a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0); 2193 a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1); 2194 a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2); 2195 2196 /* fifth column of a */ 2197 w0 = _mm256_set1_pd(work[j*9+ 4]); 2198 a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0); 2199 a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1); 2200 a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2); 2201 2202 /* sixth column of a */ 2203 w1 = _mm256_set1_pd(work[j*9+ 5]); 2204 a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0); 2205 a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1); 2206 a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2); 2207 2208 /* seventh column of a */ 2209 w2 = _mm256_set1_pd(work[j*9+ 6]); 2210 a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0); 2211 a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1); 2212 a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2); 2213 2214 /* eighth column of a */ 2215 w3 = _mm256_set1_pd(work[j*9+ 7]); 2216 a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0); 2217 a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1); 2218 a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2); 2219 2220 /* ninth column of a */ 2221 w0 = _mm256_set1_pd(work[j*9+ 8]); 2222 a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0); 2223 a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1); 2224 a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2); 2225 } 2226 2227 _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2); 2228 2229 v += n*bs2; 2230 if (!usecprow) z += bs; 2231 } 2232 PetscCall(VecRestoreArrayRead(xx,&x)); 2233 PetscCall(VecRestoreArray(zz,&zarray)); 2234 PetscCall(PetscLogFlops(162.0*a->nz)); 2235 PetscFunctionReturn(0); 2236 } 2237 #endif 2238 2239 PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A,Vec xx,Vec yy,Vec zz) 2240 { 2241 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2242 PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11; 2243 const PetscScalar *x,*xb; 2244 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,*yarray,*zarray; 2245 const MatScalar *v; 2246 PetscInt mbs = a->mbs,i,j,n; 2247 const PetscInt *idx,*ii,*ridx = NULL; 2248 PetscBool usecprow=a->compressedrow.use; 2249 2250 PetscFunctionBegin; 2251 PetscCall(VecGetArrayRead(xx,&x)); 2252 PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray)); 2253 2254 idx = a->j; 2255 v = a->a; 2256 if (usecprow) { 2257 if (zz != yy) { 2258 PetscCall(PetscArraycpy(zarray,yarray,7*mbs)); 2259 } 2260 mbs = a->compressedrow.nrows; 2261 ii = a->compressedrow.i; 2262 ridx = a->compressedrow.rindex; 2263 } else { 2264 ii = a->i; 2265 y = yarray; 2266 z = zarray; 2267 } 2268 2269 for (i=0; i<mbs; i++) { 2270 n = ii[1] - ii[0]; ii++; 2271 if (usecprow) { 2272 z = zarray + 11*ridx[i]; 2273 y = yarray + 11*ridx[i]; 2274 } 2275 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 2276 sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10]; 2277 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2278 PetscPrefetchBlock(v+121*n,121*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2279 for (j=0; j<n; j++) { 2280 xb = x + 11*(*idx++); 2281 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; 2282 sum1 += v[ 0]*x1 + v[ 11]*x2 + v[2*11]*x3 + v[3*11]*x4 + v[4*11]*x5 + v[5*11]*x6 + v[6*11]*x7+ v[7*11]*x8 + v[8*11]*x9 + v[9*11]*x10 + v[10*11]*x11; 2283 sum2 += v[1+0]*x1 + v[1+11]*x2 + v[1+2*11]*x3 + v[1+3*11]*x4 + v[1+4*11]*x5 + v[1+5*11]*x6 + v[1+6*11]*x7+ v[1+7*11]*x8 + v[1+8*11]*x9 + v[1+9*11]*x10 + v[1+10*11]*x11; 2284 sum3 += v[2+0]*x1 + v[2+11]*x2 + v[2+2*11]*x3 + v[2+3*11]*x4 + v[2+4*11]*x5 + v[2+5*11]*x6 + v[2+6*11]*x7+ v[2+7*11]*x8 + v[2+8*11]*x9 + v[2+9*11]*x10 + v[2+10*11]*x11; 2285 sum4 += v[3+0]*x1 + v[3+11]*x2 + v[3+2*11]*x3 + v[3+3*11]*x4 + v[3+4*11]*x5 + v[3+5*11]*x6 + v[3+6*11]*x7+ v[3+7*11]*x8 + v[3+8*11]*x9 + v[3+9*11]*x10 + v[3+10*11]*x11; 2286 sum5 += v[4+0]*x1 + v[4+11]*x2 + v[4+2*11]*x3 + v[4+3*11]*x4 + v[4+4*11]*x5 + v[4+5*11]*x6 + v[4+6*11]*x7+ v[4+7*11]*x8 + v[4+8*11]*x9 + v[4+9*11]*x10 + v[4+10*11]*x11; 2287 sum6 += v[5+0]*x1 + v[5+11]*x2 + v[5+2*11]*x3 + v[5+3*11]*x4 + v[5+4*11]*x5 + v[5+5*11]*x6 + v[5+6*11]*x7+ v[5+7*11]*x8 + v[5+8*11]*x9 + v[5+9*11]*x10 + v[5+10*11]*x11; 2288 sum7 += v[6+0]*x1 + v[6+11]*x2 + v[6+2*11]*x3 + v[6+3*11]*x4 + v[6+4*11]*x5 + v[6+5*11]*x6 + v[6+6*11]*x7+ v[6+7*11]*x8 + v[6+8*11]*x9 + v[6+9*11]*x10 + v[6+10*11]*x11; 2289 sum8 += v[7+0]*x1 + v[7+11]*x2 + v[7+2*11]*x3 + v[7+3*11]*x4 + v[7+4*11]*x5 + v[7+5*11]*x6 + v[7+6*11]*x7+ v[7+7*11]*x8 + v[7+8*11]*x9 + v[7+9*11]*x10 + v[7+10*11]*x11; 2290 sum9 += v[8+0]*x1 + v[8+11]*x2 + v[8+2*11]*x3 + v[8+3*11]*x4 + v[8+4*11]*x5 + v[8+5*11]*x6 + v[8+6*11]*x7+ v[8+7*11]*x8 + v[8+8*11]*x9 + v[8+9*11]*x10 + v[8+10*11]*x11; 2291 sum10 += v[9+0]*x1 + v[9+11]*x2 + v[9+2*11]*x3 + v[9+3*11]*x4 + v[9+4*11]*x5 + v[9+5*11]*x6 + v[9+6*11]*x7+ v[9+7*11]*x8 + v[9+8*11]*x9 + v[9+9*11]*x10 + v[9+10*11]*x11; 2292 sum11 += v[10+0]*x1 + v[10+11]*x2 + v[10+2*11]*x3 + v[10+3*11]*x4 + v[10+4*11]*x5 + v[10+5*11]*x6 + v[10+6*11]*x7+ v[10+7*11]*x8 + v[10+8*11]*x9 + v[10+9*11]*x10 + v[10+10*11]*x11; 2293 v += 121; 2294 } 2295 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 2296 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; 2297 if (!usecprow) { 2298 z += 11; y += 11; 2299 } 2300 } 2301 PetscCall(VecRestoreArrayRead(xx,&x)); 2302 PetscCall(VecRestoreArrayPair(yy,zz,&yarray,&zarray)); 2303 PetscCall(PetscLogFlops(242.0*a->nz)); 2304 PetscFunctionReturn(0); 2305 } 2306 2307 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 2308 { 2309 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2310 PetscScalar *z = NULL,*work,*workt,*zarray; 2311 const PetscScalar *x,*xb; 2312 const MatScalar *v; 2313 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 2314 PetscInt ncols,k; 2315 const PetscInt *ridx = NULL,*idx,*ii; 2316 PetscBool usecprow = a->compressedrow.use; 2317 2318 PetscFunctionBegin; 2319 PetscCall(VecCopy(yy,zz)); 2320 PetscCall(VecGetArrayRead(xx,&x)); 2321 PetscCall(VecGetArray(zz,&zarray)); 2322 2323 idx = a->j; 2324 v = a->a; 2325 if (usecprow) { 2326 mbs = a->compressedrow.nrows; 2327 ii = a->compressedrow.i; 2328 ridx = a->compressedrow.rindex; 2329 } else { 2330 mbs = a->mbs; 2331 ii = a->i; 2332 z = zarray; 2333 } 2334 2335 if (!a->mult_work) { 2336 k = PetscMax(A->rmap->n,A->cmap->n); 2337 PetscCall(PetscMalloc1(k+1,&a->mult_work)); 2338 } 2339 work = a->mult_work; 2340 for (i=0; i<mbs; i++) { 2341 n = ii[1] - ii[0]; ii++; 2342 ncols = n*bs; 2343 workt = work; 2344 for (j=0; j<n; j++) { 2345 xb = x + bs*(*idx++); 2346 for (k=0; k<bs; k++) workt[k] = xb[k]; 2347 workt += bs; 2348 } 2349 if (usecprow) z = zarray + bs*ridx[i]; 2350 PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 2351 v += n*bs2; 2352 if (!usecprow) z += bs; 2353 } 2354 PetscCall(VecRestoreArrayRead(xx,&x)); 2355 PetscCall(VecRestoreArray(zz,&zarray)); 2356 PetscCall(PetscLogFlops(2.0*a->nz*bs2)); 2357 PetscFunctionReturn(0); 2358 } 2359 2360 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 2361 { 2362 PetscScalar zero = 0.0; 2363 2364 PetscFunctionBegin; 2365 PetscCall(VecSet(zz,zero)); 2366 PetscCall(MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz)); 2367 PetscFunctionReturn(0); 2368 } 2369 2370 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 2371 { 2372 PetscScalar zero = 0.0; 2373 2374 PetscFunctionBegin; 2375 PetscCall(VecSet(zz,zero)); 2376 PetscCall(MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz)); 2377 PetscFunctionReturn(0); 2378 } 2379 2380 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 2381 { 2382 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2383 PetscScalar *z,x1,x2,x3,x4,x5; 2384 const PetscScalar *x,*xb = NULL; 2385 const MatScalar *v; 2386 PetscInt mbs,i,rval,bs=A->rmap->bs,j,n; 2387 const PetscInt *idx,*ii,*ib,*ridx = NULL; 2388 Mat_CompressedRow cprow = a->compressedrow; 2389 PetscBool usecprow = cprow.use; 2390 2391 PetscFunctionBegin; 2392 if (yy != zz) PetscCall(VecCopy(yy,zz)); 2393 PetscCall(VecGetArrayRead(xx,&x)); 2394 PetscCall(VecGetArray(zz,&z)); 2395 2396 idx = a->j; 2397 v = a->a; 2398 if (usecprow) { 2399 mbs = cprow.nrows; 2400 ii = cprow.i; 2401 ridx = cprow.rindex; 2402 } else { 2403 mbs=a->mbs; 2404 ii = a->i; 2405 xb = x; 2406 } 2407 2408 switch (bs) { 2409 case 1: 2410 for (i=0; i<mbs; i++) { 2411 if (usecprow) xb = x + ridx[i]; 2412 x1 = xb[0]; 2413 ib = idx + ii[0]; 2414 n = ii[1] - ii[0]; ii++; 2415 for (j=0; j<n; j++) { 2416 rval = ib[j]; 2417 z[rval] += PetscConj(*v) * x1; 2418 v++; 2419 } 2420 if (!usecprow) xb++; 2421 } 2422 break; 2423 case 2: 2424 for (i=0; i<mbs; i++) { 2425 if (usecprow) xb = x + 2*ridx[i]; 2426 x1 = xb[0]; x2 = xb[1]; 2427 ib = idx + ii[0]; 2428 n = ii[1] - ii[0]; ii++; 2429 for (j=0; j<n; j++) { 2430 rval = ib[j]*2; 2431 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2; 2432 z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2; 2433 v += 4; 2434 } 2435 if (!usecprow) xb += 2; 2436 } 2437 break; 2438 case 3: 2439 for (i=0; i<mbs; i++) { 2440 if (usecprow) xb = x + 3*ridx[i]; 2441 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2442 ib = idx + ii[0]; 2443 n = ii[1] - ii[0]; ii++; 2444 for (j=0; j<n; j++) { 2445 rval = ib[j]*3; 2446 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3; 2447 z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3; 2448 z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3; 2449 v += 9; 2450 } 2451 if (!usecprow) xb += 3; 2452 } 2453 break; 2454 case 4: 2455 for (i=0; i<mbs; i++) { 2456 if (usecprow) xb = x + 4*ridx[i]; 2457 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 2458 ib = idx + ii[0]; 2459 n = ii[1] - ii[0]; ii++; 2460 for (j=0; j<n; j++) { 2461 rval = ib[j]*4; 2462 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4; 2463 z[rval++] += PetscConj(v[4])*x1 + PetscConj(v[5])*x2 + PetscConj(v[6])*x3 + PetscConj(v[7])*x4; 2464 z[rval++] += PetscConj(v[8])*x1 + PetscConj(v[9])*x2 + PetscConj(v[10])*x3 + PetscConj(v[11])*x4; 2465 z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4; 2466 v += 16; 2467 } 2468 if (!usecprow) xb += 4; 2469 } 2470 break; 2471 case 5: 2472 for (i=0; i<mbs; i++) { 2473 if (usecprow) xb = x + 5*ridx[i]; 2474 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2475 x4 = xb[3]; x5 = xb[4]; 2476 ib = idx + ii[0]; 2477 n = ii[1] - ii[0]; ii++; 2478 for (j=0; j<n; j++) { 2479 rval = ib[j]*5; 2480 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4 + PetscConj(v[4])*x5; 2481 z[rval++] += PetscConj(v[5])*x1 + PetscConj(v[6])*x2 + PetscConj(v[7])*x3 + PetscConj(v[8])*x4 + PetscConj(v[9])*x5; 2482 z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5; 2483 z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5; 2484 z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5; 2485 v += 25; 2486 } 2487 if (!usecprow) xb += 5; 2488 } 2489 break; 2490 default: /* block sizes larger than 5 by 5 are handled by BLAS */ 2491 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet"); 2492 #if 0 2493 { 2494 PetscInt ncols,k,bs2=a->bs2; 2495 PetscScalar *work,*workt,zb; 2496 const PetscScalar *xtmp; 2497 if (!a->mult_work) { 2498 k = PetscMax(A->rmap->n,A->cmap->n); 2499 PetscCall(PetscMalloc1(k+1,&a->mult_work)); 2500 } 2501 work = a->mult_work; 2502 xtmp = x; 2503 for (i=0; i<mbs; i++) { 2504 n = ii[1] - ii[0]; ii++; 2505 ncols = n*bs; 2506 PetscCall(PetscArrayzero(work,ncols)); 2507 if (usecprow) xtmp = x + bs*ridx[i]; 2508 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 2509 v += n*bs2; 2510 if (!usecprow) xtmp += bs; 2511 workt = work; 2512 for (j=0; j<n; j++) { 2513 zb = z + bs*(*idx++); 2514 for (k=0; k<bs; k++) zb[k] += workt[k] ; 2515 workt += bs; 2516 } 2517 } 2518 } 2519 #endif 2520 } 2521 PetscCall(VecRestoreArrayRead(xx,&x)); 2522 PetscCall(VecRestoreArray(zz,&z)); 2523 PetscCall(PetscLogFlops(2.0*a->nz*a->bs2)); 2524 PetscFunctionReturn(0); 2525 } 2526 2527 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 2528 { 2529 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2530 PetscScalar *zb,*z,x1,x2,x3,x4,x5; 2531 const PetscScalar *x,*xb = NULL; 2532 const MatScalar *v; 2533 PetscInt mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2; 2534 const PetscInt *idx,*ii,*ib,*ridx = NULL; 2535 Mat_CompressedRow cprow = a->compressedrow; 2536 PetscBool usecprow=cprow.use; 2537 2538 PetscFunctionBegin; 2539 if (yy != zz) PetscCall(VecCopy(yy,zz)); 2540 PetscCall(VecGetArrayRead(xx,&x)); 2541 PetscCall(VecGetArray(zz,&z)); 2542 2543 idx = a->j; 2544 v = a->a; 2545 if (usecprow) { 2546 mbs = cprow.nrows; 2547 ii = cprow.i; 2548 ridx = cprow.rindex; 2549 } else { 2550 mbs=a->mbs; 2551 ii = a->i; 2552 xb = x; 2553 } 2554 2555 switch (bs) { 2556 case 1: 2557 for (i=0; i<mbs; i++) { 2558 if (usecprow) xb = x + ridx[i]; 2559 x1 = xb[0]; 2560 ib = idx + ii[0]; 2561 n = ii[1] - ii[0]; ii++; 2562 for (j=0; j<n; j++) { 2563 rval = ib[j]; 2564 z[rval] += *v * x1; 2565 v++; 2566 } 2567 if (!usecprow) xb++; 2568 } 2569 break; 2570 case 2: 2571 for (i=0; i<mbs; i++) { 2572 if (usecprow) xb = x + 2*ridx[i]; 2573 x1 = xb[0]; x2 = xb[1]; 2574 ib = idx + ii[0]; 2575 n = ii[1] - ii[0]; ii++; 2576 for (j=0; j<n; j++) { 2577 rval = ib[j]*2; 2578 z[rval++] += v[0]*x1 + v[1]*x2; 2579 z[rval++] += v[2]*x1 + v[3]*x2; 2580 v += 4; 2581 } 2582 if (!usecprow) xb += 2; 2583 } 2584 break; 2585 case 3: 2586 for (i=0; i<mbs; i++) { 2587 if (usecprow) xb = x + 3*ridx[i]; 2588 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2589 ib = idx + ii[0]; 2590 n = ii[1] - ii[0]; ii++; 2591 for (j=0; j<n; j++) { 2592 rval = ib[j]*3; 2593 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3; 2594 z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3; 2595 z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3; 2596 v += 9; 2597 } 2598 if (!usecprow) xb += 3; 2599 } 2600 break; 2601 case 4: 2602 for (i=0; i<mbs; i++) { 2603 if (usecprow) xb = x + 4*ridx[i]; 2604 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 2605 ib = idx + ii[0]; 2606 n = ii[1] - ii[0]; ii++; 2607 for (j=0; j<n; j++) { 2608 rval = ib[j]*4; 2609 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 2610 z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 2611 z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 2612 z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 2613 v += 16; 2614 } 2615 if (!usecprow) xb += 4; 2616 } 2617 break; 2618 case 5: 2619 for (i=0; i<mbs; i++) { 2620 if (usecprow) xb = x + 5*ridx[i]; 2621 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2622 x4 = xb[3]; x5 = xb[4]; 2623 ib = idx + ii[0]; 2624 n = ii[1] - ii[0]; ii++; 2625 for (j=0; j<n; j++) { 2626 rval = ib[j]*5; 2627 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 2628 z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 2629 z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 2630 z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 2631 z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 2632 v += 25; 2633 } 2634 if (!usecprow) xb += 5; 2635 } 2636 break; 2637 default: { /* block sizes larger then 5 by 5 are handled by BLAS */ 2638 PetscInt ncols,k; 2639 PetscScalar *work,*workt; 2640 const PetscScalar *xtmp; 2641 if (!a->mult_work) { 2642 k = PetscMax(A->rmap->n,A->cmap->n); 2643 PetscCall(PetscMalloc1(k+1,&a->mult_work)); 2644 } 2645 work = a->mult_work; 2646 xtmp = x; 2647 for (i=0; i<mbs; i++) { 2648 n = ii[1] - ii[0]; ii++; 2649 ncols = n*bs; 2650 PetscCall(PetscArrayzero(work,ncols)); 2651 if (usecprow) xtmp = x + bs*ridx[i]; 2652 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 2653 v += n*bs2; 2654 if (!usecprow) xtmp += bs; 2655 workt = work; 2656 for (j=0; j<n; j++) { 2657 zb = z + bs*(*idx++); 2658 for (k=0; k<bs; k++) zb[k] += workt[k]; 2659 workt += bs; 2660 } 2661 } 2662 } 2663 } 2664 PetscCall(VecRestoreArrayRead(xx,&x)); 2665 PetscCall(VecRestoreArray(zz,&z)); 2666 PetscCall(PetscLogFlops(2.0*a->nz*a->bs2)); 2667 PetscFunctionReturn(0); 2668 } 2669 2670 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha) 2671 { 2672 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data; 2673 PetscInt totalnz = a->bs2*a->nz; 2674 PetscScalar oalpha = alpha; 2675 PetscBLASInt one = 1,tnz; 2676 2677 PetscFunctionBegin; 2678 PetscCall(PetscBLASIntCast(totalnz,&tnz)); 2679 PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one)); 2680 PetscCall(PetscLogFlops(totalnz)); 2681 PetscFunctionReturn(0); 2682 } 2683 2684 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm) 2685 { 2686 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2687 MatScalar *v = a->a; 2688 PetscReal sum = 0.0; 2689 PetscInt i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1; 2690 2691 PetscFunctionBegin; 2692 if (type == NORM_FROBENIUS) { 2693 #if defined(PETSC_USE_REAL___FP16) 2694 PetscBLASInt one = 1,cnt = bs2*nz; 2695 PetscStackCallBLAS("BLASnrm2",*norm = BLASnrm2_(&cnt,v,&one)); 2696 #else 2697 for (i=0; i<bs2*nz; i++) { 2698 sum += PetscRealPart(PetscConj(*v)*(*v)); v++; 2699 } 2700 #endif 2701 *norm = PetscSqrtReal(sum); 2702 PetscCall(PetscLogFlops(2.0*bs2*nz)); 2703 } else if (type == NORM_1) { /* maximum column sum */ 2704 PetscReal *tmp; 2705 PetscInt *bcol = a->j; 2706 PetscCall(PetscCalloc1(A->cmap->n+1,&tmp)); 2707 for (i=0; i<nz; i++) { 2708 for (j=0; j<bs; j++) { 2709 k1 = bs*(*bcol) + j; /* column index */ 2710 for (k=0; k<bs; k++) { 2711 tmp[k1] += PetscAbsScalar(*v); v++; 2712 } 2713 } 2714 bcol++; 2715 } 2716 *norm = 0.0; 2717 for (j=0; j<A->cmap->n; j++) { 2718 if (tmp[j] > *norm) *norm = tmp[j]; 2719 } 2720 PetscCall(PetscFree(tmp)); 2721 PetscCall(PetscLogFlops(PetscMax(bs2*nz-1,0))); 2722 } else if (type == NORM_INFINITY) { /* maximum row sum */ 2723 *norm = 0.0; 2724 for (k=0; k<bs; k++) { 2725 for (j=0; j<a->mbs; j++) { 2726 v = a->a + bs2*a->i[j] + k; 2727 sum = 0.0; 2728 for (i=0; i<a->i[j+1]-a->i[j]; i++) { 2729 for (k1=0; k1<bs; k1++) { 2730 sum += PetscAbsScalar(*v); 2731 v += bs; 2732 } 2733 } 2734 if (sum > *norm) *norm = sum; 2735 } 2736 } 2737 PetscCall(PetscLogFlops(PetscMax(bs2*nz-1,0))); 2738 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet"); 2739 PetscFunctionReturn(0); 2740 } 2741 2742 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg) 2743 { 2744 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data; 2745 2746 PetscFunctionBegin; 2747 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 2748 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 2749 *flg = PETSC_FALSE; 2750 PetscFunctionReturn(0); 2751 } 2752 2753 /* if the a->i are the same */ 2754 PetscCall(PetscArraycmp(a->i,b->i,a->mbs+1,flg)); 2755 if (!*flg) PetscFunctionReturn(0); 2756 2757 /* if a->j are the same */ 2758 PetscCall(PetscArraycmp(a->j,b->j,a->nz,flg)); 2759 if (!*flg) PetscFunctionReturn(0); 2760 2761 /* if a->a are the same */ 2762 PetscCall(PetscArraycmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs),flg)); 2763 PetscFunctionReturn(0); 2764 2765 } 2766 2767 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v) 2768 { 2769 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2770 PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 2771 PetscScalar *x,zero = 0.0; 2772 MatScalar *aa,*aa_j; 2773 2774 PetscFunctionBegin; 2775 PetscCheck(!A->factortype,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 2776 bs = A->rmap->bs; 2777 aa = a->a; 2778 ai = a->i; 2779 aj = a->j; 2780 ambs = a->mbs; 2781 bs2 = a->bs2; 2782 2783 PetscCall(VecSet(v,zero)); 2784 PetscCall(VecGetArray(v,&x)); 2785 PetscCall(VecGetLocalSize(v,&n)); 2786 PetscCheck(n == A->rmap->N,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 2787 for (i=0; i<ambs; i++) { 2788 for (j=ai[i]; j<ai[i+1]; j++) { 2789 if (aj[j] == i) { 2790 row = i*bs; 2791 aa_j = aa+j*bs2; 2792 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 2793 break; 2794 } 2795 } 2796 } 2797 PetscCall(VecRestoreArray(v,&x)); 2798 PetscFunctionReturn(0); 2799 } 2800 2801 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr) 2802 { 2803 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2804 const PetscScalar *l,*r,*li,*ri; 2805 PetscScalar x; 2806 MatScalar *aa, *v; 2807 PetscInt i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai; 2808 const PetscInt *ai,*aj; 2809 2810 PetscFunctionBegin; 2811 ai = a->i; 2812 aj = a->j; 2813 aa = a->a; 2814 m = A->rmap->n; 2815 n = A->cmap->n; 2816 bs = A->rmap->bs; 2817 mbs = a->mbs; 2818 bs2 = a->bs2; 2819 if (ll) { 2820 PetscCall(VecGetArrayRead(ll,&l)); 2821 PetscCall(VecGetLocalSize(ll,&lm)); 2822 PetscCheck(lm == m,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 2823 for (i=0; i<mbs; i++) { /* for each block row */ 2824 M = ai[i+1] - ai[i]; 2825 li = l + i*bs; 2826 v = aa + bs2*ai[i]; 2827 for (j=0; j<M; j++) { /* for each block */ 2828 for (k=0; k<bs2; k++) { 2829 (*v++) *= li[k%bs]; 2830 } 2831 } 2832 } 2833 PetscCall(VecRestoreArrayRead(ll,&l)); 2834 PetscCall(PetscLogFlops(a->nz)); 2835 } 2836 2837 if (rr) { 2838 PetscCall(VecGetArrayRead(rr,&r)); 2839 PetscCall(VecGetLocalSize(rr,&rn)); 2840 PetscCheck(rn == n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length"); 2841 for (i=0; i<mbs; i++) { /* for each block row */ 2842 iai = ai[i]; 2843 M = ai[i+1] - iai; 2844 v = aa + bs2*iai; 2845 for (j=0; j<M; j++) { /* for each block */ 2846 ri = r + bs*aj[iai+j]; 2847 for (k=0; k<bs; k++) { 2848 x = ri[k]; 2849 for (tmp=0; tmp<bs; tmp++) v[tmp] *= x; 2850 v += bs; 2851 } 2852 } 2853 } 2854 PetscCall(VecRestoreArrayRead(rr,&r)); 2855 PetscCall(PetscLogFlops(a->nz)); 2856 } 2857 PetscFunctionReturn(0); 2858 } 2859 2860 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info) 2861 { 2862 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2863 2864 PetscFunctionBegin; 2865 info->block_size = a->bs2; 2866 info->nz_allocated = a->bs2*a->maxnz; 2867 info->nz_used = a->bs2*a->nz; 2868 info->nz_unneeded = info->nz_allocated - info->nz_used; 2869 info->assemblies = A->num_ass; 2870 info->mallocs = A->info.mallocs; 2871 info->memory = ((PetscObject)A)->mem; 2872 if (A->factortype) { 2873 info->fill_ratio_given = A->info.fill_ratio_given; 2874 info->fill_ratio_needed = A->info.fill_ratio_needed; 2875 info->factor_mallocs = A->info.factor_mallocs; 2876 } else { 2877 info->fill_ratio_given = 0; 2878 info->fill_ratio_needed = 0; 2879 info->factor_mallocs = 0; 2880 } 2881 PetscFunctionReturn(0); 2882 } 2883 2884 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) 2885 { 2886 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2887 2888 PetscFunctionBegin; 2889 PetscCall(PetscArrayzero(a->a,a->bs2*a->i[a->mbs])); 2890 PetscFunctionReturn(0); 2891 } 2892 2893 PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C) 2894 { 2895 PetscFunctionBegin; 2896 PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C)); 2897 C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense; 2898 PetscFunctionReturn(0); 2899 } 2900 2901 PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 2902 { 2903 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2904 PetscScalar *z = NULL,sum1; 2905 const PetscScalar *xb; 2906 PetscScalar x1; 2907 const MatScalar *v,*vv; 2908 PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL; 2909 PetscBool usecprow=a->compressedrow.use; 2910 2911 PetscFunctionBegin; 2912 idx = a->j; 2913 v = a->a; 2914 if (usecprow) { 2915 mbs = a->compressedrow.nrows; 2916 ii = a->compressedrow.i; 2917 ridx = a->compressedrow.rindex; 2918 } else { 2919 mbs = a->mbs; 2920 ii = a->i; 2921 z = c; 2922 } 2923 2924 for (i=0; i<mbs; i++) { 2925 n = ii[1] - ii[0]; ii++; 2926 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2927 PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2928 if (usecprow) z = c + ridx[i]; 2929 jj = idx; 2930 vv = v; 2931 for (k=0; k<cn; k++) { 2932 idx = jj; 2933 v = vv; 2934 sum1 = 0.0; 2935 for (j=0; j<n; j++) { 2936 xb = b + (*idx++); x1 = xb[0+k*bm]; 2937 sum1 += v[0]*x1; 2938 v += 1; 2939 } 2940 z[0+k*cm] = sum1; 2941 } 2942 if (!usecprow) z += 1; 2943 } 2944 PetscFunctionReturn(0); 2945 } 2946 2947 PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 2948 { 2949 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2950 PetscScalar *z = NULL,sum1,sum2; 2951 const PetscScalar *xb; 2952 PetscScalar x1,x2; 2953 const MatScalar *v,*vv; 2954 PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL; 2955 PetscBool usecprow=a->compressedrow.use; 2956 2957 PetscFunctionBegin; 2958 idx = a->j; 2959 v = a->a; 2960 if (usecprow) { 2961 mbs = a->compressedrow.nrows; 2962 ii = a->compressedrow.i; 2963 ridx = a->compressedrow.rindex; 2964 } else { 2965 mbs = a->mbs; 2966 ii = a->i; 2967 z = c; 2968 } 2969 2970 for (i=0; i<mbs; i++) { 2971 n = ii[1] - ii[0]; ii++; 2972 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2973 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2974 if (usecprow) z = c + 2*ridx[i]; 2975 jj = idx; 2976 vv = v; 2977 for (k=0; k<cn; k++) { 2978 idx = jj; 2979 v = vv; 2980 sum1 = 0.0; sum2 = 0.0; 2981 for (j=0; j<n; j++) { 2982 xb = b + 2*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; 2983 sum1 += v[0]*x1 + v[2]*x2; 2984 sum2 += v[1]*x1 + v[3]*x2; 2985 v += 4; 2986 } 2987 z[0+k*cm] = sum1; z[1+k*cm] = sum2; 2988 } 2989 if (!usecprow) z += 2; 2990 } 2991 PetscFunctionReturn(0); 2992 } 2993 2994 PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 2995 { 2996 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2997 PetscScalar *z = NULL,sum1,sum2,sum3; 2998 const PetscScalar *xb; 2999 PetscScalar x1,x2,x3; 3000 const MatScalar *v,*vv; 3001 PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL; 3002 PetscBool usecprow=a->compressedrow.use; 3003 3004 PetscFunctionBegin; 3005 idx = a->j; 3006 v = a->a; 3007 if (usecprow) { 3008 mbs = a->compressedrow.nrows; 3009 ii = a->compressedrow.i; 3010 ridx = a->compressedrow.rindex; 3011 } else { 3012 mbs = a->mbs; 3013 ii = a->i; 3014 z = c; 3015 } 3016 3017 for (i=0; i<mbs; i++) { 3018 n = ii[1] - ii[0]; ii++; 3019 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 3020 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3021 if (usecprow) z = c + 3*ridx[i]; 3022 jj = idx; 3023 vv = v; 3024 for (k=0; k<cn; k++) { 3025 idx = jj; 3026 v = vv; 3027 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 3028 for (j=0; j<n; j++) { 3029 xb = b + 3*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; 3030 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 3031 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 3032 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 3033 v += 9; 3034 } 3035 z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; 3036 } 3037 if (!usecprow) z += 3; 3038 } 3039 PetscFunctionReturn(0); 3040 } 3041 3042 PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 3043 { 3044 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 3045 PetscScalar *z = NULL,sum1,sum2,sum3,sum4; 3046 const PetscScalar *xb; 3047 PetscScalar x1,x2,x3,x4; 3048 const MatScalar *v,*vv; 3049 PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL; 3050 PetscBool usecprow=a->compressedrow.use; 3051 3052 PetscFunctionBegin; 3053 idx = a->j; 3054 v = a->a; 3055 if (usecprow) { 3056 mbs = a->compressedrow.nrows; 3057 ii = a->compressedrow.i; 3058 ridx = a->compressedrow.rindex; 3059 } else { 3060 mbs = a->mbs; 3061 ii = a->i; 3062 z = c; 3063 } 3064 3065 for (i=0; i<mbs; i++) { 3066 n = ii[1] - ii[0]; ii++; 3067 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 3068 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3069 if (usecprow) z = c + 4*ridx[i]; 3070 jj = idx; 3071 vv = v; 3072 for (k=0; k<cn; k++) { 3073 idx = jj; 3074 v = vv; 3075 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; 3076 for (j=0; j<n; j++) { 3077 xb = b + 4*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm]; 3078 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3079 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3080 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3081 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3082 v += 16; 3083 } 3084 z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; z[3+k*cm] = sum4; 3085 } 3086 if (!usecprow) z += 4; 3087 } 3088 PetscFunctionReturn(0); 3089 } 3090 3091 PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 3092 { 3093 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 3094 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5; 3095 const PetscScalar *xb; 3096 PetscScalar x1,x2,x3,x4,x5; 3097 const MatScalar *v,*vv; 3098 PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL; 3099 PetscBool usecprow=a->compressedrow.use; 3100 3101 PetscFunctionBegin; 3102 idx = a->j; 3103 v = a->a; 3104 if (usecprow) { 3105 mbs = a->compressedrow.nrows; 3106 ii = a->compressedrow.i; 3107 ridx = a->compressedrow.rindex; 3108 } else { 3109 mbs = a->mbs; 3110 ii = a->i; 3111 z = c; 3112 } 3113 3114 for (i=0; i<mbs; i++) { 3115 n = ii[1] - ii[0]; ii++; 3116 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 3117 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3118 if (usecprow) z = c + 5*ridx[i]; 3119 jj = idx; 3120 vv = v; 3121 for (k=0; k<cn; k++) { 3122 idx = jj; 3123 v = vv; 3124 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 3125 for (j=0; j<n; j++) { 3126 xb = b + 5*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm]; x5 = xb[4+k*bm]; 3127 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3128 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3129 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3130 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3131 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3132 v += 25; 3133 } 3134 z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; z[3+k*cm] = sum4; z[4+k*cm] = sum5; 3135 } 3136 if (!usecprow) z += 5; 3137 } 3138 PetscFunctionReturn(0); 3139 } 3140 3141 PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A,Mat B,Mat C) 3142 { 3143 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 3144 Mat_SeqDense *bd = (Mat_SeqDense*)B->data; 3145 Mat_SeqDense *cd = (Mat_SeqDense*)C->data; 3146 PetscInt cm=cd->lda,cn=B->cmap->n,bm=bd->lda; 3147 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 3148 PetscBLASInt bbs,bcn,bbm,bcm; 3149 PetscScalar *z = NULL; 3150 PetscScalar *c,*b; 3151 const MatScalar *v; 3152 const PetscInt *idx,*ii,*ridx=NULL; 3153 PetscScalar _DZero=0.0,_DOne=1.0; 3154 PetscBool usecprow=a->compressedrow.use; 3155 3156 PetscFunctionBegin; 3157 if (!cm || !cn) PetscFunctionReturn(0); 3158 PetscCheck(B->rmap->n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT,A->cmap->n,B->rmap->n); 3159 PetscCheck(A->rmap->n == C->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT,C->rmap->n,A->rmap->n); 3160 PetscCheck(B->cmap->n == C->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT,B->cmap->n,C->cmap->n); 3161 b = bd->v; 3162 if (a->nonzerorowcnt != A->rmap->n) { 3163 PetscCall(MatZeroEntries(C)); 3164 } 3165 PetscCall(MatDenseGetArray(C,&c)); 3166 switch (bs) { 3167 case 1: 3168 PetscCall(MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn)); 3169 break; 3170 case 2: 3171 PetscCall(MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn)); 3172 break; 3173 case 3: 3174 PetscCall(MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn)); 3175 break; 3176 case 4: 3177 PetscCall(MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn)); 3178 break; 3179 case 5: 3180 PetscCall(MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn)); 3181 break; 3182 default: /* block sizes larger than 5 by 5 are handled by BLAS */ 3183 PetscCall(PetscBLASIntCast(bs,&bbs)); 3184 PetscCall(PetscBLASIntCast(cn,&bcn)); 3185 PetscCall(PetscBLASIntCast(bm,&bbm)); 3186 PetscCall(PetscBLASIntCast(cm,&bcm)); 3187 idx = a->j; 3188 v = a->a; 3189 if (usecprow) { 3190 mbs = a->compressedrow.nrows; 3191 ii = a->compressedrow.i; 3192 ridx = a->compressedrow.rindex; 3193 } else { 3194 mbs = a->mbs; 3195 ii = a->i; 3196 z = c; 3197 } 3198 for (i=0; i<mbs; i++) { 3199 n = ii[1] - ii[0]; ii++; 3200 if (usecprow) z = c + bs*ridx[i]; 3201 if (n) { 3202 PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DZero,z,&bcm)); 3203 v += bs2; 3204 } 3205 for (j=1; j<n; j++) { 3206 PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DOne,z,&bcm)); 3207 v += bs2; 3208 } 3209 if (!usecprow) z += bs; 3210 } 3211 } 3212 PetscCall(MatDenseRestoreArray(C,&c)); 3213 PetscCall(PetscLogFlops((2.0*a->nz*bs2 - bs*a->nonzerorowcnt)*cn)); 3214 PetscFunctionReturn(0); 3215 } 3216