1 2 #include <../src/mat/impls/baij/seq/baij.h> 3 #include <../src/mat/impls/dense/seq/dense.h> 4 #include <petsc/private/kernels/blockinvert.h> 5 #include <petscbt.h> 6 #include <petscblaslapack.h> 7 8 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 9 #include <immintrin.h> 10 #endif 11 12 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 13 { 14 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 15 PetscErrorCode ierr; 16 PetscInt row,i,j,k,l,m,n,*nidx,isz,val,ival; 17 const PetscInt *idx; 18 PetscInt start,end,*ai,*aj,bs,*nidx2; 19 PetscBT table; 20 21 PetscFunctionBegin; 22 m = a->mbs; 23 ai = a->i; 24 aj = a->j; 25 bs = A->rmap->bs; 26 27 if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 28 29 ierr = PetscBTCreate(m,&table);CHKERRQ(ierr); 30 ierr = PetscMalloc1(m+1,&nidx);CHKERRQ(ierr); 31 ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr); 32 33 for (i=0; i<is_max; i++) { 34 /* Initialise the two local arrays */ 35 isz = 0; 36 ierr = PetscBTMemzero(m,table);CHKERRQ(ierr); 37 38 /* Extract the indices, assume there can be duplicate entries */ 39 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 40 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 41 42 /* Enter these into the temp arrays i.e mark table[row], enter row into new index */ 43 for (j=0; j<n; ++j) { 44 ival = idx[j]/bs; /* convert the indices into block indices */ 45 if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 46 if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival; 47 } 48 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 49 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 50 51 k = 0; 52 for (j=0; j<ov; j++) { /* for each overlap*/ 53 n = isz; 54 for (; k<n; k++) { /* do only those rows in nidx[k], which are not done yet */ 55 row = nidx[k]; 56 start = ai[row]; 57 end = ai[row+1]; 58 for (l = start; l<end; l++) { 59 val = aj[l]; 60 if (!PetscBTLookupSet(table,val)) nidx[isz++] = val; 61 } 62 } 63 } 64 /* expand the Index Set */ 65 for (j=0; j<isz; j++) { 66 for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k; 67 } 68 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 69 } 70 ierr = PetscBTDestroy(&table);CHKERRQ(ierr); 71 ierr = PetscFree(nidx);CHKERRQ(ierr); 72 ierr = PetscFree(nidx2);CHKERRQ(ierr); 73 PetscFunctionReturn(0); 74 } 75 76 PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 77 { 78 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c; 79 PetscErrorCode ierr; 80 PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens; 81 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 82 const PetscInt *irow,*icol; 83 PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 84 PetscInt *aj = a->j,*ai = a->i; 85 MatScalar *mat_a; 86 Mat C; 87 PetscBool flag; 88 89 PetscFunctionBegin; 90 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 91 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 92 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 93 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 94 95 ierr = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr); 96 ssmap = smap; 97 ierr = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr); 98 for (i=0; i<ncols; i++) smap[icol[i]] = i+1; 99 /* determine lens of each row */ 100 for (i=0; i<nrows; i++) { 101 kstart = ai[irow[i]]; 102 kend = kstart + a->ilen[irow[i]]; 103 lens[i] = 0; 104 for (k=kstart; k<kend; k++) { 105 if (ssmap[aj[k]]) lens[i]++; 106 } 107 } 108 /* Create and fill new matrix */ 109 if (scall == MAT_REUSE_MATRIX) { 110 c = (Mat_SeqBAIJ*)((*B)->data); 111 112 if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 113 ierr = PetscArraycmp(c->ilen,lens,c->mbs,&flag);CHKERRQ(ierr); 114 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 115 ierr = PetscArrayzero(c->ilen,c->mbs);CHKERRQ(ierr); 116 C = *B; 117 } else { 118 ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr); 119 ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 120 ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 121 ierr = MatSeqBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr); 122 } 123 c = (Mat_SeqBAIJ*)(C->data); 124 for (i=0; i<nrows; i++) { 125 row = irow[i]; 126 kstart = ai[row]; 127 kend = kstart + a->ilen[row]; 128 mat_i = c->i[i]; 129 mat_j = c->j + mat_i; 130 mat_a = c->a + mat_i*bs2; 131 mat_ilen = c->ilen + i; 132 for (k=kstart; k<kend; k++) { 133 if ((tcol=ssmap[a->j[k]])) { 134 *mat_j++ = tcol - 1; 135 ierr = PetscArraycpy(mat_a,a->a+k*bs2,bs2);CHKERRQ(ierr); 136 mat_a += bs2; 137 (*mat_ilen)++; 138 } 139 } 140 } 141 /* sort */ 142 { 143 MatScalar *work; 144 ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr); 145 for (i=0; i<nrows; i++) { 146 PetscInt ilen; 147 mat_i = c->i[i]; 148 mat_j = c->j + mat_i; 149 mat_a = c->a + mat_i*bs2; 150 ilen = c->ilen[i]; 151 ierr = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr); 152 } 153 ierr = PetscFree(work);CHKERRQ(ierr); 154 } 155 156 /* Free work space */ 157 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 158 ierr = PetscFree(smap);CHKERRQ(ierr); 159 ierr = PetscFree(lens);CHKERRQ(ierr); 160 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 161 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 162 163 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 164 *B = C; 165 PetscFunctionReturn(0); 166 } 167 168 PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 169 { 170 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 171 IS is1,is2; 172 PetscErrorCode ierr; 173 PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs,j; 174 const PetscInt *irow,*icol; 175 176 PetscFunctionBegin; 177 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 178 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 179 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 180 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 181 182 /* Verify if the indices corespond to each element in a block 183 and form the IS with compressed IS */ 184 maxmnbs = PetscMax(a->mbs,a->nbs); 185 ierr = PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);CHKERRQ(ierr); 186 ierr = PetscArrayzero(vary,a->mbs);CHKERRQ(ierr); 187 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 188 for (i=0; i<a->mbs; i++) { 189 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks"); 190 } 191 count = 0; 192 for (i=0; i<nrows; i++) { 193 j = irow[i] / bs; 194 if ((vary[j]--)==bs) iary[count++] = j; 195 } 196 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr); 197 198 ierr = PetscArrayzero(vary,a->nbs);CHKERRQ(ierr); 199 for (i=0; i<ncols; i++) vary[icol[i]/bs]++; 200 for (i=0; i<a->nbs; i++) { 201 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc"); 202 } 203 count = 0; 204 for (i=0; i<ncols; i++) { 205 j = icol[i] / bs; 206 if ((vary[j]--)==bs) iary[count++] = j; 207 } 208 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr); 209 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 210 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 211 ierr = PetscFree2(vary,iary);CHKERRQ(ierr); 212 213 ierr = MatCreateSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr); 214 ierr = ISDestroy(&is1);CHKERRQ(ierr); 215 ierr = ISDestroy(&is2);CHKERRQ(ierr); 216 PetscFunctionReturn(0); 217 } 218 219 PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C) 220 { 221 PetscErrorCode ierr; 222 Mat_SeqBAIJ *c = (Mat_SeqBAIJ*)C->data; 223 Mat_SubSppt *submatj = c->submatis1; 224 225 PetscFunctionBegin; 226 ierr = (*submatj->destroy)(C);CHKERRQ(ierr); 227 ierr = MatDestroySubMatrix_Private(submatj);CHKERRQ(ierr); 228 PetscFunctionReturn(0); 229 } 230 231 PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n,Mat *mat[]) 232 { 233 PetscErrorCode ierr; 234 PetscInt i; 235 Mat C; 236 Mat_SeqBAIJ *c; 237 Mat_SubSppt *submatj; 238 239 PetscFunctionBegin; 240 for (i=0; i<n; i++) { 241 C = (*mat)[i]; 242 c = (Mat_SeqBAIJ*)C->data; 243 submatj = c->submatis1; 244 if (submatj) { 245 if (--((PetscObject)C)->refct <= 0) { 246 ierr = (*submatj->destroy)(C);CHKERRQ(ierr); 247 ierr = MatDestroySubMatrix_Private(submatj);CHKERRQ(ierr); 248 ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr); 249 ierr = PetscLayoutDestroy(&C->rmap);CHKERRQ(ierr); 250 ierr = PetscLayoutDestroy(&C->cmap);CHKERRQ(ierr); 251 ierr = PetscHeaderDestroy(&C);CHKERRQ(ierr); 252 } 253 } else { 254 ierr = MatDestroy(&C);CHKERRQ(ierr); 255 } 256 } 257 258 /* Destroy Dummy submatrices created for reuse */ 259 ierr = MatDestroySubMatrices_Dummy(n,mat);CHKERRQ(ierr); 260 261 ierr = PetscFree(*mat);CHKERRQ(ierr); 262 PetscFunctionReturn(0); 263 } 264 265 PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 266 { 267 PetscErrorCode ierr; 268 PetscInt i; 269 270 PetscFunctionBegin; 271 if (scall == MAT_INITIAL_MATRIX) { 272 ierr = PetscCalloc1(n+1,B);CHKERRQ(ierr); 273 } 274 275 for (i=0; i<n; i++) { 276 ierr = MatCreateSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr); 277 } 278 PetscFunctionReturn(0); 279 } 280 281 /* -------------------------------------------------------*/ 282 /* Should check that shapes of vectors and matrices match */ 283 /* -------------------------------------------------------*/ 284 285 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz) 286 { 287 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 288 PetscScalar *z,sum; 289 const PetscScalar *x; 290 const MatScalar *v; 291 PetscErrorCode ierr; 292 PetscInt mbs,i,n; 293 const PetscInt *idx,*ii,*ridx=NULL; 294 PetscBool usecprow=a->compressedrow.use; 295 296 PetscFunctionBegin; 297 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 298 ierr = VecGetArrayWrite(zz,&z);CHKERRQ(ierr); 299 300 if (usecprow) { 301 mbs = a->compressedrow.nrows; 302 ii = a->compressedrow.i; 303 ridx = a->compressedrow.rindex; 304 ierr = PetscArrayzero(z,a->mbs);CHKERRQ(ierr); 305 } else { 306 mbs = a->mbs; 307 ii = a->i; 308 } 309 310 for (i=0; i<mbs; i++) { 311 n = ii[1] - ii[0]; 312 v = a->a + ii[0]; 313 idx = a->j + ii[0]; 314 ii++; 315 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 316 PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 317 sum = 0.0; 318 PetscSparseDensePlusDot(sum,x,v,idx,n); 319 if (usecprow) { 320 z[ridx[i]] = sum; 321 } else { 322 z[i] = sum; 323 } 324 } 325 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 326 ierr = VecRestoreArrayWrite(zz,&z);CHKERRQ(ierr); 327 ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr); 328 PetscFunctionReturn(0); 329 } 330 331 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz) 332 { 333 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 334 PetscScalar *z = NULL,sum1,sum2,*zarray; 335 const PetscScalar *x,*xb; 336 PetscScalar x1,x2; 337 const MatScalar *v; 338 PetscErrorCode ierr; 339 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 340 PetscBool usecprow=a->compressedrow.use; 341 342 PetscFunctionBegin; 343 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 344 ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 345 346 idx = a->j; 347 v = a->a; 348 if (usecprow) { 349 mbs = a->compressedrow.nrows; 350 ii = a->compressedrow.i; 351 ridx = a->compressedrow.rindex; 352 ierr = PetscArrayzero(zarray,2*a->mbs);CHKERRQ(ierr); 353 } else { 354 mbs = a->mbs; 355 ii = a->i; 356 z = zarray; 357 } 358 359 for (i=0; i<mbs; i++) { 360 n = ii[1] - ii[0]; ii++; 361 sum1 = 0.0; sum2 = 0.0; 362 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 363 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 364 for (j=0; j<n; j++) { 365 xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1]; 366 sum1 += v[0]*x1 + v[2]*x2; 367 sum2 += v[1]*x1 + v[3]*x2; 368 v += 4; 369 } 370 if (usecprow) z = zarray + 2*ridx[i]; 371 z[0] = sum1; z[1] = sum2; 372 if (!usecprow) z += 2; 373 } 374 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 375 ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 376 ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr); 377 PetscFunctionReturn(0); 378 } 379 380 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz) 381 { 382 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 383 PetscScalar *z = NULL,sum1,sum2,sum3,x1,x2,x3,*zarray; 384 const PetscScalar *x,*xb; 385 const MatScalar *v; 386 PetscErrorCode ierr; 387 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 388 PetscBool usecprow=a->compressedrow.use; 389 390 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 391 #pragma disjoint(*v,*z,*xb) 392 #endif 393 394 PetscFunctionBegin; 395 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 396 ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 397 398 idx = a->j; 399 v = a->a; 400 if (usecprow) { 401 mbs = a->compressedrow.nrows; 402 ii = a->compressedrow.i; 403 ridx = a->compressedrow.rindex; 404 ierr = PetscArrayzero(zarray,3*a->mbs);CHKERRQ(ierr); 405 } else { 406 mbs = a->mbs; 407 ii = a->i; 408 z = zarray; 409 } 410 411 for (i=0; i<mbs; i++) { 412 n = ii[1] - ii[0]; ii++; 413 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 414 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 415 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 416 for (j=0; j<n; j++) { 417 xb = x + 3*(*idx++); 418 x1 = xb[0]; 419 x2 = xb[1]; 420 x3 = xb[2]; 421 422 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 423 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 424 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 425 v += 9; 426 } 427 if (usecprow) z = zarray + 3*ridx[i]; 428 z[0] = sum1; z[1] = sum2; z[2] = sum3; 429 if (!usecprow) z += 3; 430 } 431 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 432 ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 433 ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr); 434 PetscFunctionReturn(0); 435 } 436 437 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz) 438 { 439 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 440 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray; 441 const PetscScalar *x,*xb; 442 const MatScalar *v; 443 PetscErrorCode ierr; 444 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 445 PetscBool usecprow=a->compressedrow.use; 446 447 PetscFunctionBegin; 448 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 449 ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 450 451 idx = a->j; 452 v = a->a; 453 if (usecprow) { 454 mbs = a->compressedrow.nrows; 455 ii = a->compressedrow.i; 456 ridx = a->compressedrow.rindex; 457 ierr = PetscArrayzero(zarray,4*a->mbs);CHKERRQ(ierr); 458 } else { 459 mbs = a->mbs; 460 ii = a->i; 461 z = zarray; 462 } 463 464 for (i=0; i<mbs; i++) { 465 n = ii[1] - ii[0]; 466 ii++; 467 sum1 = 0.0; 468 sum2 = 0.0; 469 sum3 = 0.0; 470 sum4 = 0.0; 471 472 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 473 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 474 for (j=0; j<n; j++) { 475 xb = x + 4*(*idx++); 476 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 477 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 478 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 479 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 480 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 481 v += 16; 482 } 483 if (usecprow) z = zarray + 4*ridx[i]; 484 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 485 if (!usecprow) z += 4; 486 } 487 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 488 ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 489 ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr); 490 PetscFunctionReturn(0); 491 } 492 493 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz) 494 { 495 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 496 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*zarray; 497 const PetscScalar *xb,*x; 498 const MatScalar *v; 499 PetscErrorCode ierr; 500 const PetscInt *idx,*ii,*ridx=NULL; 501 PetscInt mbs,i,j,n; 502 PetscBool usecprow=a->compressedrow.use; 503 504 PetscFunctionBegin; 505 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 506 ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 507 508 idx = a->j; 509 v = a->a; 510 if (usecprow) { 511 mbs = a->compressedrow.nrows; 512 ii = a->compressedrow.i; 513 ridx = a->compressedrow.rindex; 514 ierr = PetscArrayzero(zarray,5*a->mbs);CHKERRQ(ierr); 515 } else { 516 mbs = a->mbs; 517 ii = a->i; 518 z = zarray; 519 } 520 521 for (i=0; i<mbs; i++) { 522 n = ii[1] - ii[0]; ii++; 523 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 524 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 525 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 526 for (j=0; j<n; j++) { 527 xb = x + 5*(*idx++); 528 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 529 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 530 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 531 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 532 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 533 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 534 v += 25; 535 } 536 if (usecprow) z = zarray + 5*ridx[i]; 537 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 538 if (!usecprow) z += 5; 539 } 540 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 541 ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 542 ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr); 543 PetscFunctionReturn(0); 544 } 545 546 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz) 547 { 548 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 549 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6; 550 const PetscScalar *x,*xb; 551 PetscScalar x1,x2,x3,x4,x5,x6,*zarray; 552 const MatScalar *v; 553 PetscErrorCode ierr; 554 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 555 PetscBool usecprow=a->compressedrow.use; 556 557 PetscFunctionBegin; 558 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 559 ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 560 561 idx = a->j; 562 v = a->a; 563 if (usecprow) { 564 mbs = a->compressedrow.nrows; 565 ii = a->compressedrow.i; 566 ridx = a->compressedrow.rindex; 567 ierr = PetscArrayzero(zarray,6*a->mbs);CHKERRQ(ierr); 568 } else { 569 mbs = a->mbs; 570 ii = a->i; 571 z = zarray; 572 } 573 574 for (i=0; i<mbs; i++) { 575 n = ii[1] - ii[0]; 576 ii++; 577 sum1 = 0.0; 578 sum2 = 0.0; 579 sum3 = 0.0; 580 sum4 = 0.0; 581 sum5 = 0.0; 582 sum6 = 0.0; 583 584 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 585 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 586 for (j=0; j<n; j++) { 587 xb = x + 6*(*idx++); 588 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 589 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 590 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 591 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 592 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 593 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 594 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 595 v += 36; 596 } 597 if (usecprow) z = zarray + 6*ridx[i]; 598 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 599 if (!usecprow) z += 6; 600 } 601 602 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 603 ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 604 ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr); 605 PetscFunctionReturn(0); 606 } 607 608 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz) 609 { 610 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 611 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 612 const PetscScalar *x,*xb; 613 PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray; 614 const MatScalar *v; 615 PetscErrorCode ierr; 616 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 617 PetscBool usecprow=a->compressedrow.use; 618 619 PetscFunctionBegin; 620 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 621 ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 622 623 idx = a->j; 624 v = a->a; 625 if (usecprow) { 626 mbs = a->compressedrow.nrows; 627 ii = a->compressedrow.i; 628 ridx = a->compressedrow.rindex; 629 ierr = PetscArrayzero(zarray,7*a->mbs);CHKERRQ(ierr); 630 } else { 631 mbs = a->mbs; 632 ii = a->i; 633 z = zarray; 634 } 635 636 for (i=0; i<mbs; i++) { 637 n = ii[1] - ii[0]; 638 ii++; 639 sum1 = 0.0; 640 sum2 = 0.0; 641 sum3 = 0.0; 642 sum4 = 0.0; 643 sum5 = 0.0; 644 sum6 = 0.0; 645 sum7 = 0.0; 646 647 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 648 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 649 for (j=0; j<n; j++) { 650 xb = x + 7*(*idx++); 651 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 652 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 653 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 654 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 655 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 656 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 657 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 658 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 659 v += 49; 660 } 661 if (usecprow) z = zarray + 7*ridx[i]; 662 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 663 if (!usecprow) z += 7; 664 } 665 666 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 667 ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 668 ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr); 669 PetscFunctionReturn(0); 670 } 671 672 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 673 PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec zz) 674 { 675 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 676 PetscScalar *z = NULL,*work,*workt,*zarray; 677 const PetscScalar *x,*xb; 678 const MatScalar *v; 679 PetscErrorCode ierr; 680 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 681 const PetscInt *idx,*ii,*ridx=NULL; 682 PetscInt k; 683 PetscBool usecprow=a->compressedrow.use; 684 685 __m256d a0,a1,a2,a3,a4,a5; 686 __m256d w0,w1,w2,w3; 687 __m256d z0,z1,z2; 688 __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63); 689 690 PetscFunctionBegin; 691 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 692 ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 693 694 idx = a->j; 695 v = a->a; 696 if (usecprow) { 697 mbs = a->compressedrow.nrows; 698 ii = a->compressedrow.i; 699 ridx = a->compressedrow.rindex; 700 ierr = PetscArrayzero(zarray,bs*a->mbs);CHKERRQ(ierr); 701 } else { 702 mbs = a->mbs; 703 ii = a->i; 704 z = zarray; 705 } 706 707 if (!a->mult_work) { 708 k = PetscMax(A->rmap->n,A->cmap->n); 709 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 710 } 711 712 work = a->mult_work; 713 for (i=0; i<mbs; i++) { 714 n = ii[1] - ii[0]; ii++; 715 workt = work; 716 for (j=0; j<n; j++) { 717 xb = x + bs*(*idx++); 718 for (k=0; k<bs; k++) workt[k] = xb[k]; 719 workt += bs; 720 } 721 if (usecprow) z = zarray + bs*ridx[i]; 722 723 z0 = _mm256_setzero_pd(); z1 = _mm256_setzero_pd(); z2 = _mm256_setzero_pd(); 724 725 for (j=0; j<n; j++) { 726 /* first column of a */ 727 w0 = _mm256_set1_pd(work[j*9 ]); 728 a0 = _mm256_loadu_pd(&v[j*81 ]); z0 = _mm256_fmadd_pd(a0,w0,z0); 729 a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1); 730 a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2); 731 732 /* second column of a */ 733 w1 = _mm256_set1_pd(work[j*9+ 1]); 734 a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0); 735 a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1); 736 a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2); 737 738 /* third column of a */ 739 w2 = _mm256_set1_pd(work[j*9 +2]); 740 a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0); 741 a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1); 742 a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2); 743 744 /* fourth column of a */ 745 w3 = _mm256_set1_pd(work[j*9+ 3]); 746 a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0); 747 a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1); 748 a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2); 749 750 /* fifth column of a */ 751 w0 = _mm256_set1_pd(work[j*9+ 4]); 752 a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0); 753 a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1); 754 a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2); 755 756 /* sixth column of a */ 757 w1 = _mm256_set1_pd(work[j*9+ 5]); 758 a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0); 759 a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1); 760 a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2); 761 762 /* seventh column of a */ 763 w2 = _mm256_set1_pd(work[j*9+ 6]); 764 a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0); 765 a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1); 766 a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2); 767 768 /* eigth column of a */ 769 w3 = _mm256_set1_pd(work[j*9+ 7]); 770 a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0); 771 a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1); 772 a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2); 773 774 /* ninth column of a */ 775 w0 = _mm256_set1_pd(work[j*9+ 8]); 776 a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0); 777 a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1); 778 a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2); 779 } 780 781 _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2); 782 783 v += n*bs2; 784 if (!usecprow) z += bs; 785 } 786 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 787 ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 788 ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr); 789 PetscFunctionReturn(0); 790 } 791 #endif 792 793 PetscErrorCode MatMult_SeqBAIJ_11(Mat A,Vec xx,Vec zz) 794 { 795 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 796 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11; 797 const PetscScalar *x,*xb; 798 PetscScalar *zarray,xv; 799 const MatScalar *v; 800 PetscErrorCode ierr; 801 const PetscInt *ii,*ij=a->j,*idx; 802 PetscInt mbs,i,j,k,n,*ridx=NULL; 803 PetscBool usecprow=a->compressedrow.use; 804 805 PetscFunctionBegin; 806 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 807 ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 808 809 v = a->a; 810 if (usecprow) { 811 mbs = a->compressedrow.nrows; 812 ii = a->compressedrow.i; 813 ridx = a->compressedrow.rindex; 814 ierr = PetscArrayzero(zarray,11*a->mbs);CHKERRQ(ierr); 815 } else { 816 mbs = a->mbs; 817 ii = a->i; 818 z = zarray; 819 } 820 821 for (i=0; i<mbs; i++) { 822 n = ii[i+1] - ii[i]; 823 idx = ij + ii[i]; 824 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 825 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; 826 827 for (j=0; j<n; j++) { 828 xb = x + 11*(idx[j]); 829 830 for (k=0; k<11; k++) { 831 xv = xb[k]; 832 sum1 += v[0]*xv; 833 sum2 += v[1]*xv; 834 sum3 += v[2]*xv; 835 sum4 += v[3]*xv; 836 sum5 += v[4]*xv; 837 sum6 += v[5]*xv; 838 sum7 += v[6]*xv; 839 sum8 += v[7]*xv; 840 sum9 += v[8]*xv; 841 sum10 += v[9]*xv; 842 sum11 += v[10]*xv; 843 v += 11; 844 } 845 } 846 if (usecprow) z = zarray + 11*ridx[i]; 847 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 848 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; 849 850 if (!usecprow) z += 11; 851 } 852 853 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 854 ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 855 ierr = PetscLogFlops(242.0*a->nz - 11.0*a->nonzerorowcnt);CHKERRQ(ierr); 856 PetscFunctionReturn(0); 857 } 858 859 /* MatMult_SeqBAIJ_12 version 1: Columns in the block are accessed one at a time */ 860 PetscErrorCode MatMult_SeqBAIJ_12_ver1(Mat A,Vec xx,Vec zz) 861 { 862 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 863 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12; 864 const PetscScalar *x,*xb; 865 PetscScalar *zarray,xv; 866 const MatScalar *v; 867 PetscErrorCode ierr; 868 const PetscInt *ii,*ij=a->j,*idx; 869 PetscInt mbs,i,j,k,n,*ridx=NULL; 870 PetscBool usecprow=a->compressedrow.use; 871 872 PetscFunctionBegin; 873 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 874 ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 875 876 v = a->a; 877 if (usecprow) { 878 mbs = a->compressedrow.nrows; 879 ii = a->compressedrow.i; 880 ridx = a->compressedrow.rindex; 881 ierr = PetscArrayzero(zarray,12*a->mbs);CHKERRQ(ierr); 882 } else { 883 mbs = a->mbs; 884 ii = a->i; 885 z = zarray; 886 } 887 888 for (i=0; i<mbs; i++) { 889 n = ii[i+1] - ii[i]; 890 idx = ij + ii[i]; 891 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 892 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; 893 894 for (j=0; j<n; j++) { 895 xb = x + 12*(idx[j]); 896 897 for (k=0; k<12; k++) { 898 xv = xb[k]; 899 sum1 += v[0]*xv; 900 sum2 += v[1]*xv; 901 sum3 += v[2]*xv; 902 sum4 += v[3]*xv; 903 sum5 += v[4]*xv; 904 sum6 += v[5]*xv; 905 sum7 += v[6]*xv; 906 sum8 += v[7]*xv; 907 sum9 += v[8]*xv; 908 sum10 += v[9]*xv; 909 sum11 += v[10]*xv; 910 sum12 += v[11]*xv; 911 v += 12; 912 } 913 } 914 if (usecprow) z = zarray + 12*ridx[i]; 915 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 916 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; 917 if (!usecprow) z += 12; 918 } 919 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 920 ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 921 ierr = PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt);CHKERRQ(ierr); 922 PetscFunctionReturn(0); 923 } 924 925 PetscErrorCode MatMultAdd_SeqBAIJ_12_ver1(Mat A,Vec xx,Vec yy,Vec zz) 926 { 927 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 928 PetscScalar *z = NULL,*y = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12; 929 const PetscScalar *x,*xb; 930 PetscScalar *zarray,*yarray,xv; 931 const MatScalar *v; 932 PetscErrorCode ierr; 933 const PetscInt *ii,*ij=a->j,*idx; 934 PetscInt mbs = a->mbs,i,j,k,n,*ridx=NULL; 935 PetscBool usecprow=a->compressedrow.use; 936 937 PetscFunctionBegin; 938 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 939 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 940 941 v = a->a; 942 if (usecprow) { 943 if (zz != yy) { 944 ierr = PetscArraycpy(zarray,yarray,12*mbs);CHKERRQ(ierr); 945 } 946 mbs = a->compressedrow.nrows; 947 ii = a->compressedrow.i; 948 ridx = a->compressedrow.rindex; 949 } else { 950 ii = a->i; 951 y = yarray; 952 z = zarray; 953 } 954 955 for (i=0; i<mbs; i++) { 956 n = ii[i+1] - ii[i]; 957 idx = ij + ii[i]; 958 959 if (usecprow) { 960 y = yarray + 12*ridx[i]; 961 z = zarray + 12*ridx[i]; 962 } 963 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 964 sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10]; sum12 = y[11]; 965 966 for (j=0; j<n; j++) { 967 xb = x + 12*(idx[j]); 968 969 for (k=0; k<12; k++) { 970 xv = xb[k]; 971 sum1 += v[0]*xv; 972 sum2 += v[1]*xv; 973 sum3 += v[2]*xv; 974 sum4 += v[3]*xv; 975 sum5 += v[4]*xv; 976 sum6 += v[5]*xv; 977 sum7 += v[6]*xv; 978 sum8 += v[7]*xv; 979 sum9 += v[8]*xv; 980 sum10 += v[9]*xv; 981 sum11 += v[10]*xv; 982 sum12 += v[11]*xv; 983 v += 12; 984 } 985 } 986 987 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 988 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; 989 if (!usecprow) { 990 y += 12; 991 z += 12; 992 } 993 } 994 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 995 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 996 ierr = PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt);CHKERRQ(ierr); 997 PetscFunctionReturn(0); 998 } 999 1000 /* MatMult_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */ 1001 PetscErrorCode MatMult_SeqBAIJ_12_ver2(Mat A,Vec xx,Vec zz) 1002 { 1003 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1004 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12; 1005 const PetscScalar *x,*xb; 1006 PetscScalar x1,x2,x3,x4,*zarray; 1007 const MatScalar *v; 1008 PetscErrorCode ierr; 1009 const PetscInt *ii,*ij=a->j,*idx,*ridx=NULL; 1010 PetscInt mbs,i,j,n; 1011 PetscBool usecprow=a->compressedrow.use; 1012 1013 PetscFunctionBegin; 1014 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1015 ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 1016 1017 v = a->a; 1018 if (usecprow) { 1019 mbs = a->compressedrow.nrows; 1020 ii = a->compressedrow.i; 1021 ridx = a->compressedrow.rindex; 1022 ierr = PetscArrayzero(zarray,12*a->mbs);CHKERRQ(ierr); 1023 } else { 1024 mbs = a->mbs; 1025 ii = a->i; 1026 z = zarray; 1027 } 1028 1029 for (i=0; i<mbs; i++) { 1030 n = ii[i+1] - ii[i]; 1031 idx = ij + ii[i]; 1032 1033 sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = sum8 = sum9 = sum10 = sum11 = sum12 = 0; 1034 for (j=0; j<n; j++) { 1035 xb = x + 12*(idx[j]); 1036 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1037 1038 sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4; 1039 sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4; 1040 sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4; 1041 sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4; 1042 sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4; 1043 sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4; 1044 sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4; 1045 sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4; 1046 sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4; 1047 sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4; 1048 sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4; 1049 sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4; 1050 v += 48; 1051 1052 x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7]; 1053 1054 sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4; 1055 sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4; 1056 sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4; 1057 sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4; 1058 sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4; 1059 sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4; 1060 sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4; 1061 sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4; 1062 sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4; 1063 sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4; 1064 sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4; 1065 sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4; 1066 v += 48; 1067 1068 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; 1069 sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4; 1070 sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4; 1071 sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4; 1072 sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4; 1073 sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4; 1074 sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4; 1075 sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4; 1076 sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4; 1077 sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4; 1078 sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4; 1079 sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4; 1080 sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4; 1081 v += 48; 1082 1083 } 1084 if (usecprow) z = zarray + 12*ridx[i]; 1085 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1086 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; 1087 if (!usecprow) z += 12; 1088 } 1089 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1090 ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 1091 ierr = PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt);CHKERRQ(ierr); 1092 PetscFunctionReturn(0); 1093 } 1094 1095 /* MatMultAdd_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */ 1096 PetscErrorCode MatMultAdd_SeqBAIJ_12_ver2(Mat A,Vec xx,Vec yy,Vec zz) 1097 { 1098 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1099 PetscScalar *z = NULL,*y = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12; 1100 const PetscScalar *x,*xb; 1101 PetscScalar x1,x2,x3,x4,*zarray,*yarray; 1102 const MatScalar *v; 1103 PetscErrorCode ierr; 1104 const PetscInt *ii,*ij=a->j,*idx,*ridx=NULL; 1105 PetscInt mbs = a->mbs,i,j,n; 1106 PetscBool usecprow=a->compressedrow.use; 1107 1108 PetscFunctionBegin; 1109 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1110 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1111 1112 v = a->a; 1113 if (usecprow) { 1114 if (zz != yy) { 1115 ierr = PetscArraycpy(zarray,yarray,12*mbs);CHKERRQ(ierr); 1116 } 1117 mbs = a->compressedrow.nrows; 1118 ii = a->compressedrow.i; 1119 ridx = a->compressedrow.rindex; 1120 } else { 1121 ii = a->i; 1122 y = yarray; 1123 z = zarray; 1124 } 1125 1126 for (i=0; i<mbs; i++) { 1127 n = ii[i+1] - ii[i]; 1128 idx = ij + ii[i]; 1129 1130 if (usecprow) { 1131 y = yarray + 12*ridx[i]; 1132 z = zarray + 12*ridx[i]; 1133 } 1134 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 1135 sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10]; sum12 = y[11]; 1136 1137 for (j=0; j<n; j++) { 1138 xb = x + 12*(idx[j]); 1139 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1140 1141 sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4; 1142 sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4; 1143 sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4; 1144 sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4; 1145 sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4; 1146 sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4; 1147 sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4; 1148 sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4; 1149 sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4; 1150 sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4; 1151 sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4; 1152 sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4; 1153 v += 48; 1154 1155 x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7]; 1156 1157 sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4; 1158 sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4; 1159 sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4; 1160 sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4; 1161 sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4; 1162 sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4; 1163 sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4; 1164 sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4; 1165 sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4; 1166 sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4; 1167 sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4; 1168 sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4; 1169 v += 48; 1170 1171 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; 1172 sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4; 1173 sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4; 1174 sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4; 1175 sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4; 1176 sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4; 1177 sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4; 1178 sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4; 1179 sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4; 1180 sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4; 1181 sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4; 1182 sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4; 1183 sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4; 1184 v += 48; 1185 1186 } 1187 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1188 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; 1189 if (!usecprow) { 1190 y += 12; 1191 z += 12; 1192 } 1193 } 1194 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1195 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1196 ierr = PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt);CHKERRQ(ierr); 1197 PetscFunctionReturn(0); 1198 } 1199 1200 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 1201 PetscErrorCode MatMult_SeqBAIJ_12_AVX2(Mat A,Vec xx,Vec zz) 1202 { 1203 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1204 PetscScalar *z = NULL,*zarray; 1205 const PetscScalar *x,*work; 1206 const MatScalar *v = a->a; 1207 PetscErrorCode ierr; 1208 PetscInt mbs,i,j,n; 1209 const PetscInt *idx = a->j,*ii,*ridx=NULL; 1210 PetscBool usecprow=a->compressedrow.use; 1211 const PetscInt bs = 12, bs2 = 144; 1212 1213 __m256d a0,a1,a2,a3,a4,a5; 1214 __m256d w0,w1,w2,w3; 1215 __m256d z0,z1,z2; 1216 1217 PetscFunctionBegin; 1218 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1219 ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 1220 1221 if (usecprow) { 1222 mbs = a->compressedrow.nrows; 1223 ii = a->compressedrow.i; 1224 ridx = a->compressedrow.rindex; 1225 ierr = PetscArrayzero(zarray,bs*a->mbs);CHKERRQ(ierr); 1226 } else { 1227 mbs = a->mbs; 1228 ii = a->i; 1229 z = zarray; 1230 } 1231 1232 for (i=0; i<mbs; i++) { 1233 z0 = _mm256_setzero_pd(); z1 = _mm256_setzero_pd(); z2 = _mm256_setzero_pd(); 1234 1235 n = ii[1] - ii[0]; ii++; 1236 for (j=0; j<n; j++) { 1237 work = x + bs*(*idx++); 1238 1239 /* first column of a */ 1240 w0 = _mm256_set1_pd(work[0]); 1241 a0 = _mm256_loadu_pd(v+0); z0 = _mm256_fmadd_pd(a0,w0,z0); 1242 a1 = _mm256_loadu_pd(v+4); z1 = _mm256_fmadd_pd(a1,w0,z1); 1243 a2 = _mm256_loadu_pd(v+8); z2 = _mm256_fmadd_pd(a2,w0,z2); 1244 1245 /* second column of a */ 1246 w1 = _mm256_set1_pd(work[1]); 1247 a3 = _mm256_loadu_pd(v+12); z0 = _mm256_fmadd_pd(a3,w1,z0); 1248 a4 = _mm256_loadu_pd(v+16); z1 = _mm256_fmadd_pd(a4,w1,z1); 1249 a5 = _mm256_loadu_pd(v+20); z2 = _mm256_fmadd_pd(a5,w1,z2); 1250 1251 /* third column of a */ 1252 w2 = _mm256_set1_pd(work[2]); 1253 a0 = _mm256_loadu_pd(v+24); z0 = _mm256_fmadd_pd(a0,w2,z0); 1254 a1 = _mm256_loadu_pd(v+28); z1 = _mm256_fmadd_pd(a1,w2,z1); 1255 a2 = _mm256_loadu_pd(v+32); z2 = _mm256_fmadd_pd(a2,w2,z2); 1256 1257 /* fourth column of a */ 1258 w3 = _mm256_set1_pd(work[3]); 1259 a3 = _mm256_loadu_pd(v+36); z0 = _mm256_fmadd_pd(a3,w3,z0); 1260 a4 = _mm256_loadu_pd(v+40); z1 = _mm256_fmadd_pd(a4,w3,z1); 1261 a5 = _mm256_loadu_pd(v+44); z2 = _mm256_fmadd_pd(a5,w3,z2); 1262 1263 /* fifth column of a */ 1264 w0 = _mm256_set1_pd(work[4]); 1265 a0 = _mm256_loadu_pd(v+48); z0 = _mm256_fmadd_pd(a0,w0,z0); 1266 a1 = _mm256_loadu_pd(v+52); z1 = _mm256_fmadd_pd(a1,w0,z1); 1267 a2 = _mm256_loadu_pd(v+56); z2 = _mm256_fmadd_pd(a2,w0,z2); 1268 1269 /* sixth column of a */ 1270 w1 = _mm256_set1_pd(work[5]); 1271 a3 = _mm256_loadu_pd(v+60); z0 = _mm256_fmadd_pd(a3,w1,z0); 1272 a4 = _mm256_loadu_pd(v+64); z1 = _mm256_fmadd_pd(a4,w1,z1); 1273 a5 = _mm256_loadu_pd(v+68); z2 = _mm256_fmadd_pd(a5,w1,z2); 1274 1275 /* seventh column of a */ 1276 w2 = _mm256_set1_pd(work[6]); 1277 a0 = _mm256_loadu_pd(v+72); z0 = _mm256_fmadd_pd(a0,w2,z0); 1278 a1 = _mm256_loadu_pd(v+76); z1 = _mm256_fmadd_pd(a1,w2,z1); 1279 a2 = _mm256_loadu_pd(v+80); z2 = _mm256_fmadd_pd(a2,w2,z2); 1280 1281 /* eigth column of a */ 1282 w3 = _mm256_set1_pd(work[7]); 1283 a3 = _mm256_loadu_pd(v+84); z0 = _mm256_fmadd_pd(a3,w3,z0); 1284 a4 = _mm256_loadu_pd(v+88); z1 = _mm256_fmadd_pd(a4,w3,z1); 1285 a5 = _mm256_loadu_pd(v+92); z2 = _mm256_fmadd_pd(a5,w3,z2); 1286 1287 /* ninth column of a */ 1288 w0 = _mm256_set1_pd(work[8]); 1289 a0 = _mm256_loadu_pd(v+96); z0 = _mm256_fmadd_pd(a0,w0,z0); 1290 a1 = _mm256_loadu_pd(v+100); z1 = _mm256_fmadd_pd(a1,w0,z1); 1291 a2 = _mm256_loadu_pd(v+104); z2 = _mm256_fmadd_pd(a2,w0,z2); 1292 1293 /* tenth column of a */ 1294 w1 = _mm256_set1_pd(work[9]); 1295 a3 = _mm256_loadu_pd(v+108); z0 = _mm256_fmadd_pd(a3,w1,z0); 1296 a4 = _mm256_loadu_pd(v+112); z1 = _mm256_fmadd_pd(a4,w1,z1); 1297 a5 = _mm256_loadu_pd(v+116); z2 = _mm256_fmadd_pd(a5,w1,z2); 1298 1299 /* eleventh column of a */ 1300 w2 = _mm256_set1_pd(work[10]); 1301 a0 = _mm256_loadu_pd(v+120); z0 = _mm256_fmadd_pd(a0,w2,z0); 1302 a1 = _mm256_loadu_pd(v+124); z1 = _mm256_fmadd_pd(a1,w2,z1); 1303 a2 = _mm256_loadu_pd(v+128); z2 = _mm256_fmadd_pd(a2,w2,z2); 1304 1305 /* twelveth column of a */ 1306 w3 = _mm256_set1_pd(work[11]); 1307 a3 = _mm256_loadu_pd(v+132); z0 = _mm256_fmadd_pd(a3,w3,z0); 1308 a4 = _mm256_loadu_pd(v+136); z1 = _mm256_fmadd_pd(a4,w3,z1); 1309 a5 = _mm256_loadu_pd(v+140); z2 = _mm256_fmadd_pd(a5,w3,z2); 1310 1311 v += bs2; 1312 } 1313 if (usecprow) z = zarray + bs*ridx[i]; 1314 _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_storeu_pd(&z[ 8], z2); 1315 if (!usecprow) z += bs; 1316 } 1317 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1318 ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 1319 ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr); 1320 PetscFunctionReturn(0); 1321 } 1322 #endif 1323 1324 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */ 1325 /* Default MatMult for block size 15 */ 1326 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz) 1327 { 1328 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1329 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1330 const PetscScalar *x,*xb; 1331 PetscScalar *zarray,xv; 1332 const MatScalar *v; 1333 PetscErrorCode ierr; 1334 const PetscInt *ii,*ij=a->j,*idx; 1335 PetscInt mbs,i,j,k,n,*ridx=NULL; 1336 PetscBool usecprow=a->compressedrow.use; 1337 1338 PetscFunctionBegin; 1339 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1340 ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 1341 1342 v = a->a; 1343 if (usecprow) { 1344 mbs = a->compressedrow.nrows; 1345 ii = a->compressedrow.i; 1346 ridx = a->compressedrow.rindex; 1347 ierr = PetscArrayzero(zarray,15*a->mbs);CHKERRQ(ierr); 1348 } else { 1349 mbs = a->mbs; 1350 ii = a->i; 1351 z = zarray; 1352 } 1353 1354 for (i=0; i<mbs; i++) { 1355 n = ii[i+1] - ii[i]; 1356 idx = ij + ii[i]; 1357 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1358 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1359 1360 for (j=0; j<n; j++) { 1361 xb = x + 15*(idx[j]); 1362 1363 for (k=0; k<15; k++) { 1364 xv = xb[k]; 1365 sum1 += v[0]*xv; 1366 sum2 += v[1]*xv; 1367 sum3 += v[2]*xv; 1368 sum4 += v[3]*xv; 1369 sum5 += v[4]*xv; 1370 sum6 += v[5]*xv; 1371 sum7 += v[6]*xv; 1372 sum8 += v[7]*xv; 1373 sum9 += v[8]*xv; 1374 sum10 += v[9]*xv; 1375 sum11 += v[10]*xv; 1376 sum12 += v[11]*xv; 1377 sum13 += v[12]*xv; 1378 sum14 += v[13]*xv; 1379 sum15 += v[14]*xv; 1380 v += 15; 1381 } 1382 } 1383 if (usecprow) z = zarray + 15*ridx[i]; 1384 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1385 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1386 1387 if (!usecprow) z += 15; 1388 } 1389 1390 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1391 ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 1392 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 1393 PetscFunctionReturn(0); 1394 } 1395 1396 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */ 1397 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz) 1398 { 1399 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1400 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1401 const PetscScalar *x,*xb; 1402 PetscScalar x1,x2,x3,x4,*zarray; 1403 const MatScalar *v; 1404 PetscErrorCode ierr; 1405 const PetscInt *ii,*ij=a->j,*idx; 1406 PetscInt mbs,i,j,n,*ridx=NULL; 1407 PetscBool usecprow=a->compressedrow.use; 1408 1409 PetscFunctionBegin; 1410 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1411 ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 1412 1413 v = a->a; 1414 if (usecprow) { 1415 mbs = a->compressedrow.nrows; 1416 ii = a->compressedrow.i; 1417 ridx = a->compressedrow.rindex; 1418 ierr = PetscArrayzero(zarray,15*a->mbs);CHKERRQ(ierr); 1419 } else { 1420 mbs = a->mbs; 1421 ii = a->i; 1422 z = zarray; 1423 } 1424 1425 for (i=0; i<mbs; i++) { 1426 n = ii[i+1] - ii[i]; 1427 idx = ij + ii[i]; 1428 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1429 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1430 1431 for (j=0; j<n; j++) { 1432 xb = x + 15*(idx[j]); 1433 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1434 1435 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 1436 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 1437 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 1438 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 1439 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 1440 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 1441 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 1442 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 1443 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 1444 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 1445 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 1446 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 1447 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 1448 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 1449 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 1450 1451 v += 60; 1452 1453 x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7]; 1454 1455 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 1456 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 1457 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 1458 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 1459 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 1460 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 1461 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 1462 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 1463 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 1464 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 1465 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 1466 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 1467 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 1468 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 1469 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 1470 v += 60; 1471 1472 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; 1473 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 1474 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 1475 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 1476 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 1477 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 1478 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 1479 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 1480 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 1481 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 1482 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 1483 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 1484 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 1485 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 1486 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 1487 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 1488 v += 60; 1489 1490 x1 = xb[12]; x2 = xb[13]; x3 = xb[14]; 1491 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3; 1492 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3; 1493 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3; 1494 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3; 1495 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3; 1496 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3; 1497 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3; 1498 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3; 1499 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3; 1500 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3; 1501 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3; 1502 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3; 1503 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3; 1504 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3; 1505 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3; 1506 v += 45; 1507 } 1508 if (usecprow) z = zarray + 15*ridx[i]; 1509 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1510 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1511 1512 if (!usecprow) z += 15; 1513 } 1514 1515 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1516 ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 1517 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 1518 PetscFunctionReturn(0); 1519 } 1520 1521 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */ 1522 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz) 1523 { 1524 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1525 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1526 const PetscScalar *x,*xb; 1527 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,*zarray; 1528 const MatScalar *v; 1529 PetscErrorCode ierr; 1530 const PetscInt *ii,*ij=a->j,*idx; 1531 PetscInt mbs,i,j,n,*ridx=NULL; 1532 PetscBool usecprow=a->compressedrow.use; 1533 1534 PetscFunctionBegin; 1535 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1536 ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 1537 1538 v = a->a; 1539 if (usecprow) { 1540 mbs = a->compressedrow.nrows; 1541 ii = a->compressedrow.i; 1542 ridx = a->compressedrow.rindex; 1543 ierr = PetscArrayzero(zarray,15*a->mbs);CHKERRQ(ierr); 1544 } else { 1545 mbs = a->mbs; 1546 ii = a->i; 1547 z = zarray; 1548 } 1549 1550 for (i=0; i<mbs; i++) { 1551 n = ii[i+1] - ii[i]; 1552 idx = ij + ii[i]; 1553 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1554 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1555 1556 for (j=0; j<n; j++) { 1557 xb = x + 15*(idx[j]); 1558 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1559 x8 = xb[7]; 1560 1561 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8; 1562 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8; 1563 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8; 1564 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8; 1565 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8; 1566 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8; 1567 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8; 1568 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8; 1569 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8; 1570 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8; 1571 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8; 1572 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8; 1573 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8; 1574 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8; 1575 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8; 1576 v += 120; 1577 1578 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14]; 1579 1580 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7; 1581 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7; 1582 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7; 1583 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7; 1584 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7; 1585 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7; 1586 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7; 1587 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7; 1588 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7; 1589 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7; 1590 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7; 1591 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7; 1592 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7; 1593 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7; 1594 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7; 1595 v += 105; 1596 } 1597 if (usecprow) z = zarray + 15*ridx[i]; 1598 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1599 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1600 1601 if (!usecprow) z += 15; 1602 } 1603 1604 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1605 ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 1606 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 1607 PetscFunctionReturn(0); 1608 } 1609 1610 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */ 1611 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz) 1612 { 1613 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1614 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1615 const PetscScalar *x,*xb; 1616 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray; 1617 const MatScalar *v; 1618 PetscErrorCode ierr; 1619 const PetscInt *ii,*ij=a->j,*idx; 1620 PetscInt mbs,i,j,n,*ridx=NULL; 1621 PetscBool usecprow=a->compressedrow.use; 1622 1623 PetscFunctionBegin; 1624 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1625 ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 1626 1627 v = a->a; 1628 if (usecprow) { 1629 mbs = a->compressedrow.nrows; 1630 ii = a->compressedrow.i; 1631 ridx = a->compressedrow.rindex; 1632 ierr = PetscArrayzero(zarray,15*a->mbs);CHKERRQ(ierr); 1633 } else { 1634 mbs = a->mbs; 1635 ii = a->i; 1636 z = zarray; 1637 } 1638 1639 for (i=0; i<mbs; i++) { 1640 n = ii[i+1] - ii[i]; 1641 idx = ij + ii[i]; 1642 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1643 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1644 1645 for (j=0; j<n; j++) { 1646 xb = x + 15*(idx[j]); 1647 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1648 x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14]; 1649 1650 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 1651 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 1652 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 1653 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 1654 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 1655 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 1656 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 1657 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 1658 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 1659 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 1660 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 1661 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 1662 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 1663 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 1664 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 1665 v += 225; 1666 } 1667 if (usecprow) z = zarray + 15*ridx[i]; 1668 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1669 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1670 1671 if (!usecprow) z += 15; 1672 } 1673 1674 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1675 ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 1676 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 1677 PetscFunctionReturn(0); 1678 } 1679 1680 /* 1681 This will not work with MatScalar == float because it calls the BLAS 1682 */ 1683 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz) 1684 { 1685 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1686 PetscScalar *z = NULL,*work,*workt,*zarray; 1687 const PetscScalar *x,*xb; 1688 const MatScalar *v; 1689 PetscErrorCode ierr; 1690 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 1691 const PetscInt *idx,*ii,*ridx=NULL; 1692 PetscInt ncols,k; 1693 PetscBool usecprow=a->compressedrow.use; 1694 1695 PetscFunctionBegin; 1696 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1697 ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 1698 1699 idx = a->j; 1700 v = a->a; 1701 if (usecprow) { 1702 mbs = a->compressedrow.nrows; 1703 ii = a->compressedrow.i; 1704 ridx = a->compressedrow.rindex; 1705 ierr = PetscArrayzero(zarray,bs*a->mbs);CHKERRQ(ierr); 1706 } else { 1707 mbs = a->mbs; 1708 ii = a->i; 1709 z = zarray; 1710 } 1711 1712 if (!a->mult_work) { 1713 k = PetscMax(A->rmap->n,A->cmap->n); 1714 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 1715 } 1716 work = a->mult_work; 1717 for (i=0; i<mbs; i++) { 1718 n = ii[1] - ii[0]; ii++; 1719 ncols = n*bs; 1720 workt = work; 1721 for (j=0; j<n; j++) { 1722 xb = x + bs*(*idx++); 1723 for (k=0; k<bs; k++) workt[k] = xb[k]; 1724 workt += bs; 1725 } 1726 if (usecprow) z = zarray + bs*ridx[i]; 1727 PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z); 1728 v += n*bs2; 1729 if (!usecprow) z += bs; 1730 } 1731 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1732 ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 1733 ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr); 1734 PetscFunctionReturn(0); 1735 } 1736 1737 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 1738 { 1739 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1740 const PetscScalar *x; 1741 PetscScalar *y,*z,sum; 1742 const MatScalar *v; 1743 PetscErrorCode ierr; 1744 PetscInt mbs=a->mbs,i,n,*ridx=NULL; 1745 const PetscInt *idx,*ii; 1746 PetscBool usecprow=a->compressedrow.use; 1747 1748 PetscFunctionBegin; 1749 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1750 ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr); 1751 1752 idx = a->j; 1753 v = a->a; 1754 if (usecprow) { 1755 if (zz != yy) { 1756 ierr = PetscArraycpy(z,y,mbs);CHKERRQ(ierr); 1757 } 1758 mbs = a->compressedrow.nrows; 1759 ii = a->compressedrow.i; 1760 ridx = a->compressedrow.rindex; 1761 } else { 1762 ii = a->i; 1763 } 1764 1765 for (i=0; i<mbs; i++) { 1766 n = ii[1] - ii[0]; 1767 ii++; 1768 if (!usecprow) { 1769 sum = y[i]; 1770 } else { 1771 sum = y[ridx[i]]; 1772 } 1773 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1774 PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1775 PetscSparseDensePlusDot(sum,x,v,idx,n); 1776 v += n; 1777 idx += n; 1778 if (usecprow) { 1779 z[ridx[i]] = sum; 1780 } else { 1781 z[i] = sum; 1782 } 1783 } 1784 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1785 ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr); 1786 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1787 PetscFunctionReturn(0); 1788 } 1789 1790 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 1791 { 1792 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1793 PetscScalar *y = NULL,*z = NULL,sum1,sum2; 1794 const PetscScalar *x,*xb; 1795 PetscScalar x1,x2,*yarray,*zarray; 1796 const MatScalar *v; 1797 PetscErrorCode ierr; 1798 PetscInt mbs = a->mbs,i,n,j; 1799 const PetscInt *idx,*ii,*ridx = NULL; 1800 PetscBool usecprow = a->compressedrow.use; 1801 1802 PetscFunctionBegin; 1803 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1804 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1805 1806 idx = a->j; 1807 v = a->a; 1808 if (usecprow) { 1809 if (zz != yy) { 1810 ierr = PetscArraycpy(zarray,yarray,2*mbs);CHKERRQ(ierr); 1811 } 1812 mbs = a->compressedrow.nrows; 1813 ii = a->compressedrow.i; 1814 ridx = a->compressedrow.rindex; 1815 } else { 1816 ii = a->i; 1817 y = yarray; 1818 z = zarray; 1819 } 1820 1821 for (i=0; i<mbs; i++) { 1822 n = ii[1] - ii[0]; ii++; 1823 if (usecprow) { 1824 z = zarray + 2*ridx[i]; 1825 y = yarray + 2*ridx[i]; 1826 } 1827 sum1 = y[0]; sum2 = y[1]; 1828 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1829 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1830 for (j=0; j<n; j++) { 1831 xb = x + 2*(*idx++); 1832 x1 = xb[0]; 1833 x2 = xb[1]; 1834 1835 sum1 += v[0]*x1 + v[2]*x2; 1836 sum2 += v[1]*x1 + v[3]*x2; 1837 v += 4; 1838 } 1839 z[0] = sum1; z[1] = sum2; 1840 if (!usecprow) { 1841 z += 2; y += 2; 1842 } 1843 } 1844 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1845 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1846 ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr); 1847 PetscFunctionReturn(0); 1848 } 1849 1850 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 1851 { 1852 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1853 PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray; 1854 const PetscScalar *x,*xb; 1855 const MatScalar *v; 1856 PetscErrorCode ierr; 1857 PetscInt mbs = a->mbs,i,j,n; 1858 const PetscInt *idx,*ii,*ridx = NULL; 1859 PetscBool usecprow = a->compressedrow.use; 1860 1861 PetscFunctionBegin; 1862 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1863 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1864 1865 idx = a->j; 1866 v = a->a; 1867 if (usecprow) { 1868 if (zz != yy) { 1869 ierr = PetscArraycpy(zarray,yarray,3*mbs);CHKERRQ(ierr); 1870 } 1871 mbs = a->compressedrow.nrows; 1872 ii = a->compressedrow.i; 1873 ridx = a->compressedrow.rindex; 1874 } else { 1875 ii = a->i; 1876 y = yarray; 1877 z = zarray; 1878 } 1879 1880 for (i=0; i<mbs; i++) { 1881 n = ii[1] - ii[0]; ii++; 1882 if (usecprow) { 1883 z = zarray + 3*ridx[i]; 1884 y = yarray + 3*ridx[i]; 1885 } 1886 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; 1887 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1888 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1889 for (j=0; j<n; j++) { 1890 xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1891 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 1892 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 1893 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 1894 v += 9; 1895 } 1896 z[0] = sum1; z[1] = sum2; z[2] = sum3; 1897 if (!usecprow) { 1898 z += 3; y += 3; 1899 } 1900 } 1901 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1902 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1903 ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr); 1904 PetscFunctionReturn(0); 1905 } 1906 1907 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 1908 { 1909 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1910 PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray; 1911 const PetscScalar *x,*xb; 1912 const MatScalar *v; 1913 PetscErrorCode ierr; 1914 PetscInt mbs = a->mbs,i,j,n; 1915 const PetscInt *idx,*ii,*ridx=NULL; 1916 PetscBool usecprow=a->compressedrow.use; 1917 1918 PetscFunctionBegin; 1919 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1920 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1921 1922 idx = a->j; 1923 v = a->a; 1924 if (usecprow) { 1925 if (zz != yy) { 1926 ierr = PetscArraycpy(zarray,yarray,4*mbs);CHKERRQ(ierr); 1927 } 1928 mbs = a->compressedrow.nrows; 1929 ii = a->compressedrow.i; 1930 ridx = a->compressedrow.rindex; 1931 } else { 1932 ii = a->i; 1933 y = yarray; 1934 z = zarray; 1935 } 1936 1937 for (i=0; i<mbs; i++) { 1938 n = ii[1] - ii[0]; ii++; 1939 if (usecprow) { 1940 z = zarray + 4*ridx[i]; 1941 y = yarray + 4*ridx[i]; 1942 } 1943 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; 1944 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1945 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1946 for (j=0; j<n; j++) { 1947 xb = x + 4*(*idx++); 1948 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1949 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1950 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1951 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1952 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1953 v += 16; 1954 } 1955 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 1956 if (!usecprow) { 1957 z += 4; y += 4; 1958 } 1959 } 1960 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1961 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1962 ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr); 1963 PetscFunctionReturn(0); 1964 } 1965 1966 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 1967 { 1968 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1969 PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5; 1970 const PetscScalar *x,*xb; 1971 PetscScalar *yarray,*zarray; 1972 const MatScalar *v; 1973 PetscErrorCode ierr; 1974 PetscInt mbs = a->mbs,i,j,n; 1975 const PetscInt *idx,*ii,*ridx = NULL; 1976 PetscBool usecprow=a->compressedrow.use; 1977 1978 PetscFunctionBegin; 1979 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1980 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1981 1982 idx = a->j; 1983 v = a->a; 1984 if (usecprow) { 1985 if (zz != yy) { 1986 ierr = PetscArraycpy(zarray,yarray,5*mbs);CHKERRQ(ierr); 1987 } 1988 mbs = a->compressedrow.nrows; 1989 ii = a->compressedrow.i; 1990 ridx = a->compressedrow.rindex; 1991 } else { 1992 ii = a->i; 1993 y = yarray; 1994 z = zarray; 1995 } 1996 1997 for (i=0; i<mbs; i++) { 1998 n = ii[1] - ii[0]; ii++; 1999 if (usecprow) { 2000 z = zarray + 5*ridx[i]; 2001 y = yarray + 5*ridx[i]; 2002 } 2003 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; 2004 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2005 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2006 for (j=0; j<n; j++) { 2007 xb = x + 5*(*idx++); 2008 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 2009 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2010 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2011 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2012 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2013 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2014 v += 25; 2015 } 2016 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 2017 if (!usecprow) { 2018 z += 5; y += 5; 2019 } 2020 } 2021 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 2022 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 2023 ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr); 2024 PetscFunctionReturn(0); 2025 } 2026 2027 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 2028 { 2029 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2030 PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6; 2031 const PetscScalar *x,*xb; 2032 PetscScalar x1,x2,x3,x4,x5,x6,*yarray,*zarray; 2033 const MatScalar *v; 2034 PetscErrorCode ierr; 2035 PetscInt mbs = a->mbs,i,j,n; 2036 const PetscInt *idx,*ii,*ridx=NULL; 2037 PetscBool usecprow=a->compressedrow.use; 2038 2039 PetscFunctionBegin; 2040 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 2041 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 2042 2043 idx = a->j; 2044 v = a->a; 2045 if (usecprow) { 2046 if (zz != yy) { 2047 ierr = PetscArraycpy(zarray,yarray,6*mbs);CHKERRQ(ierr); 2048 } 2049 mbs = a->compressedrow.nrows; 2050 ii = a->compressedrow.i; 2051 ridx = a->compressedrow.rindex; 2052 } else { 2053 ii = a->i; 2054 y = yarray; 2055 z = zarray; 2056 } 2057 2058 for (i=0; i<mbs; i++) { 2059 n = ii[1] - ii[0]; ii++; 2060 if (usecprow) { 2061 z = zarray + 6*ridx[i]; 2062 y = yarray + 6*ridx[i]; 2063 } 2064 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; 2065 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2066 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2067 for (j=0; j<n; j++) { 2068 xb = x + 6*(*idx++); 2069 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 2070 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2071 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2072 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2073 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2074 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2075 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2076 v += 36; 2077 } 2078 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 2079 if (!usecprow) { 2080 z += 6; y += 6; 2081 } 2082 } 2083 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 2084 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 2085 ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr); 2086 PetscFunctionReturn(0); 2087 } 2088 2089 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 2090 { 2091 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2092 PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 2093 const PetscScalar *x,*xb; 2094 PetscScalar x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray; 2095 const MatScalar *v; 2096 PetscErrorCode ierr; 2097 PetscInt mbs = a->mbs,i,j,n; 2098 const PetscInt *idx,*ii,*ridx = NULL; 2099 PetscBool usecprow=a->compressedrow.use; 2100 2101 PetscFunctionBegin; 2102 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 2103 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 2104 2105 idx = a->j; 2106 v = a->a; 2107 if (usecprow) { 2108 if (zz != yy) { 2109 ierr = PetscArraycpy(zarray,yarray,7*mbs);CHKERRQ(ierr); 2110 } 2111 mbs = a->compressedrow.nrows; 2112 ii = a->compressedrow.i; 2113 ridx = a->compressedrow.rindex; 2114 } else { 2115 ii = a->i; 2116 y = yarray; 2117 z = zarray; 2118 } 2119 2120 for (i=0; i<mbs; i++) { 2121 n = ii[1] - ii[0]; ii++; 2122 if (usecprow) { 2123 z = zarray + 7*ridx[i]; 2124 y = yarray + 7*ridx[i]; 2125 } 2126 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 2127 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2128 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2129 for (j=0; j<n; j++) { 2130 xb = x + 7*(*idx++); 2131 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 2132 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2133 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2134 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2135 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2136 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2137 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2138 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2139 v += 49; 2140 } 2141 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 2142 if (!usecprow) { 2143 z += 7; y += 7; 2144 } 2145 } 2146 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 2147 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 2148 ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr); 2149 PetscFunctionReturn(0); 2150 } 2151 2152 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 2153 PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec yy,Vec zz) 2154 { 2155 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2156 PetscScalar *z = NULL,*work,*workt,*zarray; 2157 const PetscScalar *x,*xb; 2158 const MatScalar *v; 2159 PetscErrorCode ierr; 2160 PetscInt mbs,i,j,n; 2161 PetscInt k; 2162 PetscBool usecprow=a->compressedrow.use; 2163 const PetscInt *idx,*ii,*ridx=NULL,bs = 9, bs2 = 81; 2164 2165 __m256d a0,a1,a2,a3,a4,a5; 2166 __m256d w0,w1,w2,w3; 2167 __m256d z0,z1,z2; 2168 __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63); 2169 2170 PetscFunctionBegin; 2171 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 2172 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 2173 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 2174 2175 idx = a->j; 2176 v = a->a; 2177 if (usecprow) { 2178 mbs = a->compressedrow.nrows; 2179 ii = a->compressedrow.i; 2180 ridx = a->compressedrow.rindex; 2181 } else { 2182 mbs = a->mbs; 2183 ii = a->i; 2184 z = zarray; 2185 } 2186 2187 if (!a->mult_work) { 2188 k = PetscMax(A->rmap->n,A->cmap->n); 2189 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 2190 } 2191 2192 work = a->mult_work; 2193 for (i=0; i<mbs; i++) { 2194 n = ii[1] - ii[0]; ii++; 2195 workt = work; 2196 for (j=0; j<n; j++) { 2197 xb = x + bs*(*idx++); 2198 for (k=0; k<bs; k++) workt[k] = xb[k]; 2199 workt += bs; 2200 } 2201 if (usecprow) z = zarray + bs*ridx[i]; 2202 2203 z0 = _mm256_loadu_pd(&z[ 0]); z1 = _mm256_loadu_pd(&z[ 4]); z2 = _mm256_set1_pd(z[ 8]); 2204 2205 for (j=0; j<n; j++) { 2206 /* first column of a */ 2207 w0 = _mm256_set1_pd(work[j*9 ]); 2208 a0 = _mm256_loadu_pd(&v[j*81 ]); z0 = _mm256_fmadd_pd(a0,w0,z0); 2209 a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1); 2210 a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2); 2211 2212 /* second column of a */ 2213 w1 = _mm256_set1_pd(work[j*9+ 1]); 2214 a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0); 2215 a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1); 2216 a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2); 2217 2218 /* third column of a */ 2219 w2 = _mm256_set1_pd(work[j*9+ 2]); 2220 a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0); 2221 a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1); 2222 a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2); 2223 2224 /* fourth column of a */ 2225 w3 = _mm256_set1_pd(work[j*9+ 3]); 2226 a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0); 2227 a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1); 2228 a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2); 2229 2230 /* fifth column of a */ 2231 w0 = _mm256_set1_pd(work[j*9+ 4]); 2232 a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0); 2233 a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1); 2234 a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2); 2235 2236 /* sixth column of a */ 2237 w1 = _mm256_set1_pd(work[j*9+ 5]); 2238 a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0); 2239 a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1); 2240 a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2); 2241 2242 /* seventh column of a */ 2243 w2 = _mm256_set1_pd(work[j*9+ 6]); 2244 a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0); 2245 a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1); 2246 a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2); 2247 2248 /* eigth column of a */ 2249 w3 = _mm256_set1_pd(work[j*9+ 7]); 2250 a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0); 2251 a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1); 2252 a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2); 2253 2254 /* ninth column of a */ 2255 w0 = _mm256_set1_pd(work[j*9+ 8]); 2256 a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0); 2257 a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1); 2258 a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2); 2259 } 2260 2261 _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2); 2262 2263 v += n*bs2; 2264 if (!usecprow) z += bs; 2265 } 2266 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 2267 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 2268 ierr = PetscLogFlops(162.0*a->nz);CHKERRQ(ierr); 2269 PetscFunctionReturn(0); 2270 } 2271 #endif 2272 2273 PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A,Vec xx,Vec yy,Vec zz) 2274 { 2275 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2276 PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11; 2277 const PetscScalar *x,*xb; 2278 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,*yarray,*zarray; 2279 const MatScalar *v; 2280 PetscErrorCode ierr; 2281 PetscInt mbs = a->mbs,i,j,n; 2282 const PetscInt *idx,*ii,*ridx = NULL; 2283 PetscBool usecprow=a->compressedrow.use; 2284 2285 PetscFunctionBegin; 2286 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 2287 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 2288 2289 idx = a->j; 2290 v = a->a; 2291 if (usecprow) { 2292 if (zz != yy) { 2293 ierr = PetscArraycpy(zarray,yarray,7*mbs);CHKERRQ(ierr); 2294 } 2295 mbs = a->compressedrow.nrows; 2296 ii = a->compressedrow.i; 2297 ridx = a->compressedrow.rindex; 2298 } else { 2299 ii = a->i; 2300 y = yarray; 2301 z = zarray; 2302 } 2303 2304 for (i=0; i<mbs; i++) { 2305 n = ii[1] - ii[0]; ii++; 2306 if (usecprow) { 2307 z = zarray + 11*ridx[i]; 2308 y = yarray + 11*ridx[i]; 2309 } 2310 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 2311 sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10]; 2312 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2313 PetscPrefetchBlock(v+121*n,121*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2314 for (j=0; j<n; j++) { 2315 xb = x + 11*(*idx++); 2316 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; 2317 sum1 += v[ 0]*x1 + v[ 11]*x2 + v[2*11]*x3 + v[3*11]*x4 + v[4*11]*x5 + v[5*11]*x6 + v[6*11]*x7+ v[7*11]*x8 + v[8*11]*x9 + v[9*11]*x10 + v[10*11]*x11; 2318 sum2 += v[1+0]*x1 + v[1+11]*x2 + v[1+2*11]*x3 + v[1+3*11]*x4 + v[1+4*11]*x5 + v[1+5*11]*x6 + v[1+6*11]*x7+ v[1+7*11]*x8 + v[1+8*11]*x9 + v[1+9*11]*x10 + v[1+10*11]*x11; 2319 sum3 += v[2+0]*x1 + v[2+11]*x2 + v[2+2*11]*x3 + v[2+3*11]*x4 + v[2+4*11]*x5 + v[2+5*11]*x6 + v[2+6*11]*x7+ v[2+7*11]*x8 + v[2+8*11]*x9 + v[2+9*11]*x10 + v[2+10*11]*x11; 2320 sum4 += v[3+0]*x1 + v[3+11]*x2 + v[3+2*11]*x3 + v[3+3*11]*x4 + v[3+4*11]*x5 + v[3+5*11]*x6 + v[3+6*11]*x7+ v[3+7*11]*x8 + v[3+8*11]*x9 + v[3+9*11]*x10 + v[3+10*11]*x11; 2321 sum5 += v[4+0]*x1 + v[4+11]*x2 + v[4+2*11]*x3 + v[4+3*11]*x4 + v[4+4*11]*x5 + v[4+5*11]*x6 + v[4+6*11]*x7+ v[4+7*11]*x8 + v[4+8*11]*x9 + v[4+9*11]*x10 + v[4+10*11]*x11; 2322 sum6 += v[5+0]*x1 + v[5+11]*x2 + v[5+2*11]*x3 + v[5+3*11]*x4 + v[5+4*11]*x5 + v[5+5*11]*x6 + v[5+6*11]*x7+ v[5+7*11]*x8 + v[5+8*11]*x9 + v[5+9*11]*x10 + v[5+10*11]*x11; 2323 sum7 += v[6+0]*x1 + v[6+11]*x2 + v[6+2*11]*x3 + v[6+3*11]*x4 + v[6+4*11]*x5 + v[6+5*11]*x6 + v[6+6*11]*x7+ v[6+7*11]*x8 + v[6+8*11]*x9 + v[6+9*11]*x10 + v[6+10*11]*x11; 2324 sum8 += v[7+0]*x1 + v[7+11]*x2 + v[7+2*11]*x3 + v[7+3*11]*x4 + v[7+4*11]*x5 + v[7+5*11]*x6 + v[7+6*11]*x7+ v[7+7*11]*x8 + v[7+8*11]*x9 + v[7+9*11]*x10 + v[7+10*11]*x11; 2325 sum9 += v[8+0]*x1 + v[8+11]*x2 + v[8+2*11]*x3 + v[8+3*11]*x4 + v[8+4*11]*x5 + v[8+5*11]*x6 + v[8+6*11]*x7+ v[8+7*11]*x8 + v[8+8*11]*x9 + v[8+9*11]*x10 + v[8+10*11]*x11; 2326 sum10 += v[9+0]*x1 + v[9+11]*x2 + v[9+2*11]*x3 + v[9+3*11]*x4 + v[9+4*11]*x5 + v[9+5*11]*x6 + v[9+6*11]*x7+ v[9+7*11]*x8 + v[9+8*11]*x9 + v[9+9*11]*x10 + v[9+10*11]*x11; 2327 sum11 += v[10+0]*x1 + v[10+11]*x2 + v[10+2*11]*x3 + v[10+3*11]*x4 + v[10+4*11]*x5 + v[10+5*11]*x6 + v[10+6*11]*x7+ v[10+7*11]*x8 + v[10+8*11]*x9 + v[10+9*11]*x10 + v[10+10*11]*x11; 2328 v += 121; 2329 } 2330 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 2331 z[7] = sum6; z[8] = sum7; z[9] = sum8; z[10] = sum9; z[11] = sum10; 2332 if (!usecprow) { 2333 z += 11; y += 11; 2334 } 2335 } 2336 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 2337 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 2338 ierr = PetscLogFlops(242.0*a->nz);CHKERRQ(ierr); 2339 PetscFunctionReturn(0); 2340 } 2341 2342 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 2343 { 2344 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2345 PetscScalar *z = NULL,*work,*workt,*zarray; 2346 const PetscScalar *x,*xb; 2347 const MatScalar *v; 2348 PetscErrorCode ierr; 2349 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 2350 PetscInt ncols,k; 2351 const PetscInt *ridx = NULL,*idx,*ii; 2352 PetscBool usecprow = a->compressedrow.use; 2353 2354 PetscFunctionBegin; 2355 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 2356 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 2357 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 2358 2359 idx = a->j; 2360 v = a->a; 2361 if (usecprow) { 2362 mbs = a->compressedrow.nrows; 2363 ii = a->compressedrow.i; 2364 ridx = a->compressedrow.rindex; 2365 } else { 2366 mbs = a->mbs; 2367 ii = a->i; 2368 z = zarray; 2369 } 2370 2371 if (!a->mult_work) { 2372 k = PetscMax(A->rmap->n,A->cmap->n); 2373 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 2374 } 2375 work = a->mult_work; 2376 for (i=0; i<mbs; i++) { 2377 n = ii[1] - ii[0]; ii++; 2378 ncols = n*bs; 2379 workt = work; 2380 for (j=0; j<n; j++) { 2381 xb = x + bs*(*idx++); 2382 for (k=0; k<bs; k++) workt[k] = xb[k]; 2383 workt += bs; 2384 } 2385 if (usecprow) z = zarray + bs*ridx[i]; 2386 PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 2387 v += n*bs2; 2388 if (!usecprow) z += bs; 2389 } 2390 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 2391 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 2392 ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr); 2393 PetscFunctionReturn(0); 2394 } 2395 2396 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 2397 { 2398 PetscScalar zero = 0.0; 2399 PetscErrorCode ierr; 2400 2401 PetscFunctionBegin; 2402 ierr = VecSet(zz,zero);CHKERRQ(ierr); 2403 ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 2404 PetscFunctionReturn(0); 2405 } 2406 2407 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 2408 { 2409 PetscScalar zero = 0.0; 2410 PetscErrorCode ierr; 2411 2412 PetscFunctionBegin; 2413 ierr = VecSet(zz,zero);CHKERRQ(ierr); 2414 ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 2415 PetscFunctionReturn(0); 2416 } 2417 2418 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 2419 { 2420 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2421 PetscScalar *z,x1,x2,x3,x4,x5; 2422 const PetscScalar *x,*xb = NULL; 2423 const MatScalar *v; 2424 PetscErrorCode ierr; 2425 PetscInt mbs,i,rval,bs=A->rmap->bs,j,n; 2426 const PetscInt *idx,*ii,*ib,*ridx = NULL; 2427 Mat_CompressedRow cprow = a->compressedrow; 2428 PetscBool usecprow = cprow.use; 2429 2430 PetscFunctionBegin; 2431 if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); } 2432 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 2433 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 2434 2435 idx = a->j; 2436 v = a->a; 2437 if (usecprow) { 2438 mbs = cprow.nrows; 2439 ii = cprow.i; 2440 ridx = cprow.rindex; 2441 } else { 2442 mbs=a->mbs; 2443 ii = a->i; 2444 xb = x; 2445 } 2446 2447 switch (bs) { 2448 case 1: 2449 for (i=0; i<mbs; i++) { 2450 if (usecprow) xb = x + ridx[i]; 2451 x1 = xb[0]; 2452 ib = idx + ii[0]; 2453 n = ii[1] - ii[0]; ii++; 2454 for (j=0; j<n; j++) { 2455 rval = ib[j]; 2456 z[rval] += PetscConj(*v) * x1; 2457 v++; 2458 } 2459 if (!usecprow) xb++; 2460 } 2461 break; 2462 case 2: 2463 for (i=0; i<mbs; i++) { 2464 if (usecprow) xb = x + 2*ridx[i]; 2465 x1 = xb[0]; x2 = xb[1]; 2466 ib = idx + ii[0]; 2467 n = ii[1] - ii[0]; ii++; 2468 for (j=0; j<n; j++) { 2469 rval = ib[j]*2; 2470 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2; 2471 z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2; 2472 v += 4; 2473 } 2474 if (!usecprow) xb += 2; 2475 } 2476 break; 2477 case 3: 2478 for (i=0; i<mbs; i++) { 2479 if (usecprow) xb = x + 3*ridx[i]; 2480 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2481 ib = idx + ii[0]; 2482 n = ii[1] - ii[0]; ii++; 2483 for (j=0; j<n; j++) { 2484 rval = ib[j]*3; 2485 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3; 2486 z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3; 2487 z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3; 2488 v += 9; 2489 } 2490 if (!usecprow) xb += 3; 2491 } 2492 break; 2493 case 4: 2494 for (i=0; i<mbs; i++) { 2495 if (usecprow) xb = x + 4*ridx[i]; 2496 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 2497 ib = idx + ii[0]; 2498 n = ii[1] - ii[0]; ii++; 2499 for (j=0; j<n; j++) { 2500 rval = ib[j]*4; 2501 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4; 2502 z[rval++] += PetscConj(v[4])*x1 + PetscConj(v[5])*x2 + PetscConj(v[6])*x3 + PetscConj(v[7])*x4; 2503 z[rval++] += PetscConj(v[8])*x1 + PetscConj(v[9])*x2 + PetscConj(v[10])*x3 + PetscConj(v[11])*x4; 2504 z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4; 2505 v += 16; 2506 } 2507 if (!usecprow) xb += 4; 2508 } 2509 break; 2510 case 5: 2511 for (i=0; i<mbs; i++) { 2512 if (usecprow) xb = x + 5*ridx[i]; 2513 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2514 x4 = xb[3]; x5 = xb[4]; 2515 ib = idx + ii[0]; 2516 n = ii[1] - ii[0]; ii++; 2517 for (j=0; j<n; j++) { 2518 rval = ib[j]*5; 2519 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4 + PetscConj(v[4])*x5; 2520 z[rval++] += PetscConj(v[5])*x1 + PetscConj(v[6])*x2 + PetscConj(v[7])*x3 + PetscConj(v[8])*x4 + PetscConj(v[9])*x5; 2521 z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5; 2522 z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5; 2523 z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5; 2524 v += 25; 2525 } 2526 if (!usecprow) xb += 5; 2527 } 2528 break; 2529 default: /* block sizes larger than 5 by 5 are handled by BLAS */ 2530 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet"); 2531 #if 0 2532 { 2533 PetscInt ncols,k,bs2=a->bs2; 2534 PetscScalar *work,*workt,zb; 2535 const PetscScalar *xtmp; 2536 if (!a->mult_work) { 2537 k = PetscMax(A->rmap->n,A->cmap->n); 2538 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 2539 } 2540 work = a->mult_work; 2541 xtmp = x; 2542 for (i=0; i<mbs; i++) { 2543 n = ii[1] - ii[0]; ii++; 2544 ncols = n*bs; 2545 ierr = PetscArrayzero(work,ncols);CHKERRQ(ierr); 2546 if (usecprow) xtmp = x + bs*ridx[i]; 2547 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 2548 v += n*bs2; 2549 if (!usecprow) xtmp += bs; 2550 workt = work; 2551 for (j=0; j<n; j++) { 2552 zb = z + bs*(*idx++); 2553 for (k=0; k<bs; k++) zb[k] += workt[k] ; 2554 workt += bs; 2555 } 2556 } 2557 } 2558 #endif 2559 } 2560 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 2561 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 2562 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 2563 PetscFunctionReturn(0); 2564 } 2565 2566 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 2567 { 2568 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2569 PetscScalar *zb,*z,x1,x2,x3,x4,x5; 2570 const PetscScalar *x,*xb = NULL; 2571 const MatScalar *v; 2572 PetscErrorCode ierr; 2573 PetscInt mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2; 2574 const PetscInt *idx,*ii,*ib,*ridx = NULL; 2575 Mat_CompressedRow cprow = a->compressedrow; 2576 PetscBool usecprow=cprow.use; 2577 2578 PetscFunctionBegin; 2579 if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); } 2580 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 2581 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 2582 2583 idx = a->j; 2584 v = a->a; 2585 if (usecprow) { 2586 mbs = cprow.nrows; 2587 ii = cprow.i; 2588 ridx = cprow.rindex; 2589 } else { 2590 mbs=a->mbs; 2591 ii = a->i; 2592 xb = x; 2593 } 2594 2595 switch (bs) { 2596 case 1: 2597 for (i=0; i<mbs; i++) { 2598 if (usecprow) xb = x + ridx[i]; 2599 x1 = xb[0]; 2600 ib = idx + ii[0]; 2601 n = ii[1] - ii[0]; ii++; 2602 for (j=0; j<n; j++) { 2603 rval = ib[j]; 2604 z[rval] += *v * x1; 2605 v++; 2606 } 2607 if (!usecprow) xb++; 2608 } 2609 break; 2610 case 2: 2611 for (i=0; i<mbs; i++) { 2612 if (usecprow) xb = x + 2*ridx[i]; 2613 x1 = xb[0]; x2 = xb[1]; 2614 ib = idx + ii[0]; 2615 n = ii[1] - ii[0]; ii++; 2616 for (j=0; j<n; j++) { 2617 rval = ib[j]*2; 2618 z[rval++] += v[0]*x1 + v[1]*x2; 2619 z[rval++] += v[2]*x1 + v[3]*x2; 2620 v += 4; 2621 } 2622 if (!usecprow) xb += 2; 2623 } 2624 break; 2625 case 3: 2626 for (i=0; i<mbs; i++) { 2627 if (usecprow) xb = x + 3*ridx[i]; 2628 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2629 ib = idx + ii[0]; 2630 n = ii[1] - ii[0]; ii++; 2631 for (j=0; j<n; j++) { 2632 rval = ib[j]*3; 2633 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3; 2634 z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3; 2635 z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3; 2636 v += 9; 2637 } 2638 if (!usecprow) xb += 3; 2639 } 2640 break; 2641 case 4: 2642 for (i=0; i<mbs; i++) { 2643 if (usecprow) xb = x + 4*ridx[i]; 2644 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 2645 ib = idx + ii[0]; 2646 n = ii[1] - ii[0]; ii++; 2647 for (j=0; j<n; j++) { 2648 rval = ib[j]*4; 2649 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 2650 z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 2651 z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 2652 z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 2653 v += 16; 2654 } 2655 if (!usecprow) xb += 4; 2656 } 2657 break; 2658 case 5: 2659 for (i=0; i<mbs; i++) { 2660 if (usecprow) xb = x + 5*ridx[i]; 2661 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2662 x4 = xb[3]; x5 = xb[4]; 2663 ib = idx + ii[0]; 2664 n = ii[1] - ii[0]; ii++; 2665 for (j=0; j<n; j++) { 2666 rval = ib[j]*5; 2667 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 2668 z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 2669 z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 2670 z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 2671 z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 2672 v += 25; 2673 } 2674 if (!usecprow) xb += 5; 2675 } 2676 break; 2677 default: { /* block sizes larger then 5 by 5 are handled by BLAS */ 2678 PetscInt ncols,k; 2679 PetscScalar *work,*workt; 2680 const PetscScalar *xtmp; 2681 if (!a->mult_work) { 2682 k = PetscMax(A->rmap->n,A->cmap->n); 2683 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 2684 } 2685 work = a->mult_work; 2686 xtmp = x; 2687 for (i=0; i<mbs; i++) { 2688 n = ii[1] - ii[0]; ii++; 2689 ncols = n*bs; 2690 ierr = PetscArrayzero(work,ncols);CHKERRQ(ierr); 2691 if (usecprow) xtmp = x + bs*ridx[i]; 2692 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 2693 v += n*bs2; 2694 if (!usecprow) xtmp += bs; 2695 workt = work; 2696 for (j=0; j<n; j++) { 2697 zb = z + bs*(*idx++); 2698 for (k=0; k<bs; k++) zb[k] += workt[k]; 2699 workt += bs; 2700 } 2701 } 2702 } 2703 } 2704 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 2705 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 2706 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 2707 PetscFunctionReturn(0); 2708 } 2709 2710 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha) 2711 { 2712 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data; 2713 PetscInt totalnz = a->bs2*a->nz; 2714 PetscScalar oalpha = alpha; 2715 PetscErrorCode ierr; 2716 PetscBLASInt one = 1,tnz; 2717 2718 PetscFunctionBegin; 2719 ierr = PetscBLASIntCast(totalnz,&tnz);CHKERRQ(ierr); 2720 PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one)); 2721 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 2722 PetscFunctionReturn(0); 2723 } 2724 2725 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm) 2726 { 2727 PetscErrorCode ierr; 2728 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2729 MatScalar *v = a->a; 2730 PetscReal sum = 0.0; 2731 PetscInt i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1; 2732 2733 PetscFunctionBegin; 2734 if (type == NORM_FROBENIUS) { 2735 #if defined(PETSC_USE_REAL___FP16) 2736 PetscBLASInt one = 1,cnt = bs2*nz; 2737 PetscStackCallBLAS("BLASnrm2",*norm = BLASnrm2_(&cnt,v,&one)); 2738 #else 2739 for (i=0; i<bs2*nz; i++) { 2740 sum += PetscRealPart(PetscConj(*v)*(*v)); v++; 2741 } 2742 #endif 2743 *norm = PetscSqrtReal(sum); 2744 ierr = PetscLogFlops(2.0*bs2*nz);CHKERRQ(ierr); 2745 } else if (type == NORM_1) { /* maximum column sum */ 2746 PetscReal *tmp; 2747 PetscInt *bcol = a->j; 2748 ierr = PetscCalloc1(A->cmap->n+1,&tmp);CHKERRQ(ierr); 2749 for (i=0; i<nz; i++) { 2750 for (j=0; j<bs; j++) { 2751 k1 = bs*(*bcol) + j; /* column index */ 2752 for (k=0; k<bs; k++) { 2753 tmp[k1] += PetscAbsScalar(*v); v++; 2754 } 2755 } 2756 bcol++; 2757 } 2758 *norm = 0.0; 2759 for (j=0; j<A->cmap->n; j++) { 2760 if (tmp[j] > *norm) *norm = tmp[j]; 2761 } 2762 ierr = PetscFree(tmp);CHKERRQ(ierr); 2763 ierr = PetscLogFlops(PetscMax(bs2*nz-1,0));CHKERRQ(ierr); 2764 } else if (type == NORM_INFINITY) { /* maximum row sum */ 2765 *norm = 0.0; 2766 for (k=0; k<bs; k++) { 2767 for (j=0; j<a->mbs; j++) { 2768 v = a->a + bs2*a->i[j] + k; 2769 sum = 0.0; 2770 for (i=0; i<a->i[j+1]-a->i[j]; i++) { 2771 for (k1=0; k1<bs; k1++) { 2772 sum += PetscAbsScalar(*v); 2773 v += bs; 2774 } 2775 } 2776 if (sum > *norm) *norm = sum; 2777 } 2778 } 2779 ierr = PetscLogFlops(PetscMax(bs2*nz-1,0));CHKERRQ(ierr); 2780 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet"); 2781 PetscFunctionReturn(0); 2782 } 2783 2784 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg) 2785 { 2786 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data; 2787 PetscErrorCode ierr; 2788 2789 PetscFunctionBegin; 2790 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 2791 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 2792 *flg = PETSC_FALSE; 2793 PetscFunctionReturn(0); 2794 } 2795 2796 /* if the a->i are the same */ 2797 ierr = PetscArraycmp(a->i,b->i,a->mbs+1,flg);CHKERRQ(ierr); 2798 if (!*flg) PetscFunctionReturn(0); 2799 2800 /* if a->j are the same */ 2801 ierr = PetscArraycmp(a->j,b->j,a->nz,flg);CHKERRQ(ierr); 2802 if (!*flg) PetscFunctionReturn(0); 2803 2804 /* if a->a are the same */ 2805 ierr = PetscArraycmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs),flg);CHKERRQ(ierr); 2806 PetscFunctionReturn(0); 2807 2808 } 2809 2810 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v) 2811 { 2812 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2813 PetscErrorCode ierr; 2814 PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 2815 PetscScalar *x,zero = 0.0; 2816 MatScalar *aa,*aa_j; 2817 2818 PetscFunctionBegin; 2819 if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 2820 bs = A->rmap->bs; 2821 aa = a->a; 2822 ai = a->i; 2823 aj = a->j; 2824 ambs = a->mbs; 2825 bs2 = a->bs2; 2826 2827 ierr = VecSet(v,zero);CHKERRQ(ierr); 2828 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 2829 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 2830 if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 2831 for (i=0; i<ambs; i++) { 2832 for (j=ai[i]; j<ai[i+1]; j++) { 2833 if (aj[j] == i) { 2834 row = i*bs; 2835 aa_j = aa+j*bs2; 2836 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 2837 break; 2838 } 2839 } 2840 } 2841 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 2842 PetscFunctionReturn(0); 2843 } 2844 2845 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr) 2846 { 2847 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2848 const PetscScalar *l,*r,*li,*ri; 2849 PetscScalar x; 2850 MatScalar *aa, *v; 2851 PetscErrorCode ierr; 2852 PetscInt i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai; 2853 const PetscInt *ai,*aj; 2854 2855 PetscFunctionBegin; 2856 ai = a->i; 2857 aj = a->j; 2858 aa = a->a; 2859 m = A->rmap->n; 2860 n = A->cmap->n; 2861 bs = A->rmap->bs; 2862 mbs = a->mbs; 2863 bs2 = a->bs2; 2864 if (ll) { 2865 ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr); 2866 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 2867 if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 2868 for (i=0; i<mbs; i++) { /* for each block row */ 2869 M = ai[i+1] - ai[i]; 2870 li = l + i*bs; 2871 v = aa + bs2*ai[i]; 2872 for (j=0; j<M; j++) { /* for each block */ 2873 for (k=0; k<bs2; k++) { 2874 (*v++) *= li[k%bs]; 2875 } 2876 } 2877 } 2878 ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr); 2879 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2880 } 2881 2882 if (rr) { 2883 ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr); 2884 ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr); 2885 if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length"); 2886 for (i=0; i<mbs; i++) { /* for each block row */ 2887 iai = ai[i]; 2888 M = ai[i+1] - iai; 2889 v = aa + bs2*iai; 2890 for (j=0; j<M; j++) { /* for each block */ 2891 ri = r + bs*aj[iai+j]; 2892 for (k=0; k<bs; k++) { 2893 x = ri[k]; 2894 for (tmp=0; tmp<bs; tmp++) v[tmp] *= x; 2895 v += bs; 2896 } 2897 } 2898 } 2899 ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr); 2900 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2901 } 2902 PetscFunctionReturn(0); 2903 } 2904 2905 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info) 2906 { 2907 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2908 2909 PetscFunctionBegin; 2910 info->block_size = a->bs2; 2911 info->nz_allocated = a->bs2*a->maxnz; 2912 info->nz_used = a->bs2*a->nz; 2913 info->nz_unneeded = info->nz_allocated - info->nz_used; 2914 info->assemblies = A->num_ass; 2915 info->mallocs = A->info.mallocs; 2916 info->memory = ((PetscObject)A)->mem; 2917 if (A->factortype) { 2918 info->fill_ratio_given = A->info.fill_ratio_given; 2919 info->fill_ratio_needed = A->info.fill_ratio_needed; 2920 info->factor_mallocs = A->info.factor_mallocs; 2921 } else { 2922 info->fill_ratio_given = 0; 2923 info->fill_ratio_needed = 0; 2924 info->factor_mallocs = 0; 2925 } 2926 PetscFunctionReturn(0); 2927 } 2928 2929 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) 2930 { 2931 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2932 PetscErrorCode ierr; 2933 2934 PetscFunctionBegin; 2935 ierr = PetscArrayzero(a->a,a->bs2*a->i[a->mbs]);CHKERRQ(ierr); 2936 PetscFunctionReturn(0); 2937 } 2938 2939 PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C) 2940 { 2941 PetscErrorCode ierr; 2942 2943 PetscFunctionBegin; 2944 ierr = MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);CHKERRQ(ierr); 2945 C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense; 2946 PetscFunctionReturn(0); 2947 } 2948 2949 PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 2950 { 2951 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2952 PetscScalar *z = NULL,sum1; 2953 const PetscScalar *xb; 2954 PetscScalar x1; 2955 const MatScalar *v,*vv; 2956 PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL; 2957 PetscBool usecprow=a->compressedrow.use; 2958 2959 PetscFunctionBegin; 2960 idx = a->j; 2961 v = a->a; 2962 if (usecprow) { 2963 mbs = a->compressedrow.nrows; 2964 ii = a->compressedrow.i; 2965 ridx = a->compressedrow.rindex; 2966 } else { 2967 mbs = a->mbs; 2968 ii = a->i; 2969 z = c; 2970 } 2971 2972 for (i=0; i<mbs; i++) { 2973 n = ii[1] - ii[0]; ii++; 2974 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2975 PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2976 if (usecprow) z = c + ridx[i]; 2977 jj = idx; 2978 vv = v; 2979 for (k=0; k<cn; k++) { 2980 idx = jj; 2981 v = vv; 2982 sum1 = 0.0; 2983 for (j=0; j<n; j++) { 2984 xb = b + (*idx++); x1 = xb[0+k*bm]; 2985 sum1 += v[0]*x1; 2986 v += 1; 2987 } 2988 z[0+k*cm] = sum1; 2989 } 2990 if (!usecprow) z += 1; 2991 } 2992 PetscFunctionReturn(0); 2993 } 2994 2995 PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 2996 { 2997 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2998 PetscScalar *z = NULL,sum1,sum2; 2999 const PetscScalar *xb; 3000 PetscScalar x1,x2; 3001 const MatScalar *v,*vv; 3002 PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL; 3003 PetscBool usecprow=a->compressedrow.use; 3004 3005 PetscFunctionBegin; 3006 idx = a->j; 3007 v = a->a; 3008 if (usecprow) { 3009 mbs = a->compressedrow.nrows; 3010 ii = a->compressedrow.i; 3011 ridx = a->compressedrow.rindex; 3012 } else { 3013 mbs = a->mbs; 3014 ii = a->i; 3015 z = c; 3016 } 3017 3018 for (i=0; i<mbs; i++) { 3019 n = ii[1] - ii[0]; ii++; 3020 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 3021 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3022 if (usecprow) z = c + 2*ridx[i]; 3023 jj = idx; 3024 vv = v; 3025 for (k=0; k<cn; k++) { 3026 idx = jj; 3027 v = vv; 3028 sum1 = 0.0; sum2 = 0.0; 3029 for (j=0; j<n; j++) { 3030 xb = b + 2*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; 3031 sum1 += v[0]*x1 + v[2]*x2; 3032 sum2 += v[1]*x1 + v[3]*x2; 3033 v += 4; 3034 } 3035 z[0+k*cm] = sum1; z[1+k*cm] = sum2; 3036 } 3037 if (!usecprow) z += 2; 3038 } 3039 PetscFunctionReturn(0); 3040 } 3041 3042 PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 3043 { 3044 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 3045 PetscScalar *z = NULL,sum1,sum2,sum3; 3046 const PetscScalar *xb; 3047 PetscScalar x1,x2,x3; 3048 const MatScalar *v,*vv; 3049 PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL; 3050 PetscBool usecprow=a->compressedrow.use; 3051 3052 PetscFunctionBegin; 3053 idx = a->j; 3054 v = a->a; 3055 if (usecprow) { 3056 mbs = a->compressedrow.nrows; 3057 ii = a->compressedrow.i; 3058 ridx = a->compressedrow.rindex; 3059 } else { 3060 mbs = a->mbs; 3061 ii = a->i; 3062 z = c; 3063 } 3064 3065 for (i=0; i<mbs; i++) { 3066 n = ii[1] - ii[0]; ii++; 3067 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 3068 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3069 if (usecprow) z = c + 3*ridx[i]; 3070 jj = idx; 3071 vv = v; 3072 for (k=0; k<cn; k++) { 3073 idx = jj; 3074 v = vv; 3075 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 3076 for (j=0; j<n; j++) { 3077 xb = b + 3*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; 3078 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 3079 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 3080 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 3081 v += 9; 3082 } 3083 z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; 3084 } 3085 if (!usecprow) z += 3; 3086 } 3087 PetscFunctionReturn(0); 3088 } 3089 3090 PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 3091 { 3092 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 3093 PetscScalar *z = NULL,sum1,sum2,sum3,sum4; 3094 const PetscScalar *xb; 3095 PetscScalar x1,x2,x3,x4; 3096 const MatScalar *v,*vv; 3097 PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL; 3098 PetscBool usecprow=a->compressedrow.use; 3099 3100 PetscFunctionBegin; 3101 idx = a->j; 3102 v = a->a; 3103 if (usecprow) { 3104 mbs = a->compressedrow.nrows; 3105 ii = a->compressedrow.i; 3106 ridx = a->compressedrow.rindex; 3107 } else { 3108 mbs = a->mbs; 3109 ii = a->i; 3110 z = c; 3111 } 3112 3113 for (i=0; i<mbs; i++) { 3114 n = ii[1] - ii[0]; ii++; 3115 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 3116 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3117 if (usecprow) z = c + 4*ridx[i]; 3118 jj = idx; 3119 vv = v; 3120 for (k=0; k<cn; k++) { 3121 idx = jj; 3122 v = vv; 3123 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; 3124 for (j=0; j<n; j++) { 3125 xb = b + 4*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm]; 3126 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3127 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3128 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3129 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3130 v += 16; 3131 } 3132 z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; z[3+k*cm] = sum4; 3133 } 3134 if (!usecprow) z += 4; 3135 } 3136 PetscFunctionReturn(0); 3137 } 3138 3139 PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 3140 { 3141 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 3142 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5; 3143 const PetscScalar *xb; 3144 PetscScalar x1,x2,x3,x4,x5; 3145 const MatScalar *v,*vv; 3146 PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL; 3147 PetscBool usecprow=a->compressedrow.use; 3148 3149 PetscFunctionBegin; 3150 idx = a->j; 3151 v = a->a; 3152 if (usecprow) { 3153 mbs = a->compressedrow.nrows; 3154 ii = a->compressedrow.i; 3155 ridx = a->compressedrow.rindex; 3156 } else { 3157 mbs = a->mbs; 3158 ii = a->i; 3159 z = c; 3160 } 3161 3162 for (i=0; i<mbs; i++) { 3163 n = ii[1] - ii[0]; ii++; 3164 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 3165 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3166 if (usecprow) z = c + 5*ridx[i]; 3167 jj = idx; 3168 vv = v; 3169 for (k=0; k<cn; k++) { 3170 idx = jj; 3171 v = vv; 3172 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 3173 for (j=0; j<n; j++) { 3174 xb = b + 5*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm]; x5 = xb[4+k*bm]; 3175 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3176 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3177 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3178 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3179 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3180 v += 25; 3181 } 3182 z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; z[3+k*cm] = sum4; z[4+k*cm] = sum5; 3183 } 3184 if (!usecprow) z += 5; 3185 } 3186 PetscFunctionReturn(0); 3187 } 3188 3189 PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A,Mat B,Mat C) 3190 { 3191 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 3192 Mat_SeqDense *bd = (Mat_SeqDense*)B->data; 3193 Mat_SeqDense *cd = (Mat_SeqDense*)C->data; 3194 PetscInt cm=cd->lda,cn=B->cmap->n,bm=bd->lda; 3195 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 3196 PetscBLASInt bbs,bcn,bbm,bcm; 3197 PetscScalar *z = NULL; 3198 PetscScalar *c,*b; 3199 const MatScalar *v; 3200 const PetscInt *idx,*ii,*ridx=NULL; 3201 PetscScalar _DZero=0.0,_DOne=1.0; 3202 PetscBool usecprow=a->compressedrow.use; 3203 PetscErrorCode ierr; 3204 3205 PetscFunctionBegin; 3206 if (!cm || !cn) PetscFunctionReturn(0); 3207 if (B->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,B->rmap->n); 3208 if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n); 3209 if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n); 3210 b = bd->v; 3211 if (a->nonzerorowcnt != A->rmap->n) { 3212 ierr = MatZeroEntries(C);CHKERRQ(ierr); 3213 } 3214 ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr); 3215 switch (bs) { 3216 case 1: 3217 ierr = MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn);CHKERRQ(ierr); 3218 break; 3219 case 2: 3220 ierr = MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn);CHKERRQ(ierr); 3221 break; 3222 case 3: 3223 ierr = MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn);CHKERRQ(ierr); 3224 break; 3225 case 4: 3226 ierr = MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn);CHKERRQ(ierr); 3227 break; 3228 case 5: 3229 ierr = MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn);CHKERRQ(ierr); 3230 break; 3231 default: /* block sizes larger than 5 by 5 are handled by BLAS */ 3232 ierr = PetscBLASIntCast(bs,&bbs);CHKERRQ(ierr); 3233 ierr = PetscBLASIntCast(cn,&bcn);CHKERRQ(ierr); 3234 ierr = PetscBLASIntCast(bm,&bbm);CHKERRQ(ierr); 3235 ierr = PetscBLASIntCast(cm,&bcm);CHKERRQ(ierr); 3236 idx = a->j; 3237 v = a->a; 3238 if (usecprow) { 3239 mbs = a->compressedrow.nrows; 3240 ii = a->compressedrow.i; 3241 ridx = a->compressedrow.rindex; 3242 } else { 3243 mbs = a->mbs; 3244 ii = a->i; 3245 z = c; 3246 } 3247 for (i=0; i<mbs; i++) { 3248 n = ii[1] - ii[0]; ii++; 3249 if (usecprow) z = c + bs*ridx[i]; 3250 if (n) { 3251 PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DZero,z,&bcm)); 3252 v += bs2; 3253 } 3254 for (j=1; j<n; j++) { 3255 PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DOne,z,&bcm)); 3256 v += bs2; 3257 } 3258 if (!usecprow) z += bs; 3259 } 3260 } 3261 ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr); 3262 ierr = PetscLogFlops((2.0*a->nz*bs2 - bs*a->nonzerorowcnt)*cn);CHKERRQ(ierr); 3263 PetscFunctionReturn(0); 3264 } 3265