1 #include <../src/mat/impls/baij/seq/baij.h> 2 #include <../src/mat/impls/dense/seq/dense.h> 3 #include <petsc/private/kernels/blockinvert.h> 4 #include <petscbt.h> 5 #include <petscblaslapack.h> 6 7 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 8 #include <immintrin.h> 9 #endif 10 11 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 12 { 13 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 14 PetscErrorCode ierr; 15 PetscInt row,i,j,k,l,m,n,*nidx,isz,val,ival; 16 const PetscInt *idx; 17 PetscInt start,end,*ai,*aj,bs,*nidx2; 18 PetscBT table; 19 20 PetscFunctionBegin; 21 m = a->mbs; 22 ai = a->i; 23 aj = a->j; 24 bs = A->rmap->bs; 25 26 if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 27 28 ierr = PetscBTCreate(m,&table);CHKERRQ(ierr); 29 ierr = PetscMalloc1(m+1,&nidx);CHKERRQ(ierr); 30 ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr); 31 32 for (i=0; i<is_max; i++) { 33 /* Initialise the two local arrays */ 34 isz = 0; 35 ierr = PetscBTMemzero(m,table);CHKERRQ(ierr); 36 37 /* Extract the indices, assume there can be duplicate entries */ 38 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 39 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 40 41 /* Enter these into the temp arrays i.e mark table[row], enter row into new index */ 42 for (j=0; j<n; ++j) { 43 ival = idx[j]/bs; /* convert the indices into block indices */ 44 if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 45 if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival; 46 } 47 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 48 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 49 50 k = 0; 51 for (j=0; j<ov; j++) { /* for each overlap*/ 52 n = isz; 53 for (; k<n; k++) { /* do only those rows in nidx[k], which are not done yet */ 54 row = nidx[k]; 55 start = ai[row]; 56 end = ai[row+1]; 57 for (l = start; l<end; l++) { 58 val = aj[l]; 59 if (!PetscBTLookupSet(table,val)) nidx[isz++] = val; 60 } 61 } 62 } 63 /* expand the Index Set */ 64 for (j=0; j<isz; j++) { 65 for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k; 66 } 67 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 68 } 69 ierr = PetscBTDestroy(&table);CHKERRQ(ierr); 70 ierr = PetscFree(nidx);CHKERRQ(ierr); 71 ierr = PetscFree(nidx2);CHKERRQ(ierr); 72 PetscFunctionReturn(0); 73 } 74 75 PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 76 { 77 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c; 78 PetscErrorCode ierr; 79 PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens; 80 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 81 const PetscInt *irow,*icol; 82 PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 83 PetscInt *aj = a->j,*ai = a->i; 84 MatScalar *mat_a; 85 Mat C; 86 PetscBool flag; 87 88 PetscFunctionBegin; 89 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 90 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 91 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 92 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 93 94 ierr = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr); 95 ssmap = smap; 96 ierr = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr); 97 for (i=0; i<ncols; i++) smap[icol[i]] = i+1; 98 /* determine lens of each row */ 99 for (i=0; i<nrows; i++) { 100 kstart = ai[irow[i]]; 101 kend = kstart + a->ilen[irow[i]]; 102 lens[i] = 0; 103 for (k=kstart; k<kend; k++) { 104 if (ssmap[aj[k]]) lens[i]++; 105 } 106 } 107 /* Create and fill new matrix */ 108 if (scall == MAT_REUSE_MATRIX) { 109 c = (Mat_SeqBAIJ*)((*B)->data); 110 111 if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 112 ierr = PetscArraycmp(c->ilen,lens,c->mbs,&flag);CHKERRQ(ierr); 113 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 114 ierr = PetscArrayzero(c->ilen,c->mbs);CHKERRQ(ierr); 115 C = *B; 116 } else { 117 ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr); 118 ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 119 ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 120 ierr = MatSeqBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr); 121 } 122 c = (Mat_SeqBAIJ*)(C->data); 123 for (i=0; i<nrows; i++) { 124 row = irow[i]; 125 kstart = ai[row]; 126 kend = kstart + a->ilen[row]; 127 mat_i = c->i[i]; 128 mat_j = c->j + mat_i; 129 mat_a = c->a + mat_i*bs2; 130 mat_ilen = c->ilen + i; 131 for (k=kstart; k<kend; k++) { 132 if ((tcol=ssmap[a->j[k]])) { 133 *mat_j++ = tcol - 1; 134 ierr = PetscArraycpy(mat_a,a->a+k*bs2,bs2);CHKERRQ(ierr); 135 mat_a += bs2; 136 (*mat_ilen)++; 137 } 138 } 139 } 140 /* sort */ 141 { 142 MatScalar *work; 143 ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr); 144 for (i=0; i<nrows; i++) { 145 PetscInt ilen; 146 mat_i = c->i[i]; 147 mat_j = c->j + mat_i; 148 mat_a = c->a + mat_i*bs2; 149 ilen = c->ilen[i]; 150 ierr = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr); 151 } 152 ierr = PetscFree(work);CHKERRQ(ierr); 153 } 154 155 /* Free work space */ 156 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 157 ierr = PetscFree(smap);CHKERRQ(ierr); 158 ierr = PetscFree(lens);CHKERRQ(ierr); 159 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 160 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 161 162 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 163 *B = C; 164 PetscFunctionReturn(0); 165 } 166 167 PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 168 { 169 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 170 IS is1,is2; 171 PetscErrorCode ierr; 172 PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs,j; 173 const PetscInt *irow,*icol; 174 175 PetscFunctionBegin; 176 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 177 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 178 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 179 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 180 181 /* Verify if the indices corespond to each element in a block 182 and form the IS with compressed IS */ 183 maxmnbs = PetscMax(a->mbs,a->nbs); 184 ierr = PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);CHKERRQ(ierr); 185 ierr = PetscArrayzero(vary,a->mbs);CHKERRQ(ierr); 186 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 187 for (i=0; i<a->mbs; i++) { 188 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks"); 189 } 190 count = 0; 191 for (i=0; i<nrows; i++) { 192 j = irow[i] / bs; 193 if ((vary[j]--)==bs) iary[count++] = j; 194 } 195 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr); 196 197 ierr = PetscArrayzero(vary,a->nbs);CHKERRQ(ierr); 198 for (i=0; i<ncols; i++) vary[icol[i]/bs]++; 199 for (i=0; i<a->nbs; i++) { 200 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc"); 201 } 202 count = 0; 203 for (i=0; i<ncols; i++) { 204 j = icol[i] / bs; 205 if ((vary[j]--)==bs) iary[count++] = j; 206 } 207 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr); 208 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 209 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 210 ierr = PetscFree2(vary,iary);CHKERRQ(ierr); 211 212 ierr = MatCreateSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr); 213 ierr = ISDestroy(&is1);CHKERRQ(ierr); 214 ierr = ISDestroy(&is2);CHKERRQ(ierr); 215 PetscFunctionReturn(0); 216 } 217 218 PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C) 219 { 220 PetscErrorCode ierr; 221 Mat_SeqBAIJ *c = (Mat_SeqBAIJ*)C->data; 222 Mat_SubSppt *submatj = c->submatis1; 223 224 PetscFunctionBegin; 225 ierr = (*submatj->destroy)(C);CHKERRQ(ierr); 226 ierr = MatDestroySubMatrix_Private(submatj);CHKERRQ(ierr); 227 PetscFunctionReturn(0); 228 } 229 230 PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n,Mat *mat[]) 231 { 232 PetscErrorCode ierr; 233 PetscInt i; 234 Mat C; 235 Mat_SeqBAIJ *c; 236 Mat_SubSppt *submatj; 237 238 PetscFunctionBegin; 239 for (i=0; i<n; i++) { 240 C = (*mat)[i]; 241 c = (Mat_SeqBAIJ*)C->data; 242 submatj = c->submatis1; 243 if (submatj) { 244 if (--((PetscObject)C)->refct <= 0) { 245 ierr = (*submatj->destroy)(C);CHKERRQ(ierr); 246 ierr = MatDestroySubMatrix_Private(submatj);CHKERRQ(ierr); 247 ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr); 248 ierr = PetscLayoutDestroy(&C->rmap);CHKERRQ(ierr); 249 ierr = PetscLayoutDestroy(&C->cmap);CHKERRQ(ierr); 250 ierr = PetscHeaderDestroy(&C);CHKERRQ(ierr); 251 } 252 } else { 253 ierr = MatDestroy(&C);CHKERRQ(ierr); 254 } 255 } 256 257 /* Destroy Dummy submatrices created for reuse */ 258 ierr = MatDestroySubMatrices_Dummy(n,mat);CHKERRQ(ierr); 259 260 ierr = PetscFree(*mat);CHKERRQ(ierr); 261 PetscFunctionReturn(0); 262 } 263 264 PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 265 { 266 PetscErrorCode ierr; 267 PetscInt i; 268 269 PetscFunctionBegin; 270 if (scall == MAT_INITIAL_MATRIX) { 271 ierr = PetscCalloc1(n+1,B);CHKERRQ(ierr); 272 } 273 274 for (i=0; i<n; i++) { 275 ierr = MatCreateSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr); 276 } 277 PetscFunctionReturn(0); 278 } 279 280 /* -------------------------------------------------------*/ 281 /* Should check that shapes of vectors and matrices match */ 282 /* -------------------------------------------------------*/ 283 284 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz) 285 { 286 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 287 PetscScalar *z,sum; 288 const PetscScalar *x; 289 const MatScalar *v; 290 PetscErrorCode ierr; 291 PetscInt mbs,i,n; 292 const PetscInt *idx,*ii,*ridx=NULL; 293 PetscBool usecprow=a->compressedrow.use; 294 295 PetscFunctionBegin; 296 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 297 ierr = VecGetArrayWrite(zz,&z);CHKERRQ(ierr); 298 299 if (usecprow) { 300 mbs = a->compressedrow.nrows; 301 ii = a->compressedrow.i; 302 ridx = a->compressedrow.rindex; 303 ierr = PetscArrayzero(z,a->mbs);CHKERRQ(ierr); 304 } else { 305 mbs = a->mbs; 306 ii = a->i; 307 } 308 309 for (i=0; i<mbs; i++) { 310 n = ii[1] - ii[0]; 311 v = a->a + ii[0]; 312 idx = a->j + ii[0]; 313 ii++; 314 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 315 PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 316 sum = 0.0; 317 PetscSparseDensePlusDot(sum,x,v,idx,n); 318 if (usecprow) { 319 z[ridx[i]] = sum; 320 } else { 321 z[i] = sum; 322 } 323 } 324 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 325 ierr = VecRestoreArrayWrite(zz,&z);CHKERRQ(ierr); 326 ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr); 327 PetscFunctionReturn(0); 328 } 329 330 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz) 331 { 332 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 333 PetscScalar *z = NULL,sum1,sum2,*zarray; 334 const PetscScalar *x,*xb; 335 PetscScalar x1,x2; 336 const MatScalar *v; 337 PetscErrorCode ierr; 338 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 339 PetscBool usecprow=a->compressedrow.use; 340 341 PetscFunctionBegin; 342 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 343 ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 344 345 idx = a->j; 346 v = a->a; 347 if (usecprow) { 348 mbs = a->compressedrow.nrows; 349 ii = a->compressedrow.i; 350 ridx = a->compressedrow.rindex; 351 ierr = PetscArrayzero(zarray,2*a->mbs);CHKERRQ(ierr); 352 } else { 353 mbs = a->mbs; 354 ii = a->i; 355 z = zarray; 356 } 357 358 for (i=0; i<mbs; i++) { 359 n = ii[1] - ii[0]; ii++; 360 sum1 = 0.0; sum2 = 0.0; 361 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 362 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 363 for (j=0; j<n; j++) { 364 xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1]; 365 sum1 += v[0]*x1 + v[2]*x2; 366 sum2 += v[1]*x1 + v[3]*x2; 367 v += 4; 368 } 369 if (usecprow) z = zarray + 2*ridx[i]; 370 z[0] = sum1; z[1] = sum2; 371 if (!usecprow) z += 2; 372 } 373 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 374 ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 375 ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr); 376 PetscFunctionReturn(0); 377 } 378 379 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz) 380 { 381 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 382 PetscScalar *z = NULL,sum1,sum2,sum3,x1,x2,x3,*zarray; 383 const PetscScalar *x,*xb; 384 const MatScalar *v; 385 PetscErrorCode ierr; 386 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 387 PetscBool usecprow=a->compressedrow.use; 388 389 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 390 #pragma disjoint(*v,*z,*xb) 391 #endif 392 393 PetscFunctionBegin; 394 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 395 ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 396 397 idx = a->j; 398 v = a->a; 399 if (usecprow) { 400 mbs = a->compressedrow.nrows; 401 ii = a->compressedrow.i; 402 ridx = a->compressedrow.rindex; 403 ierr = PetscArrayzero(zarray,3*a->mbs);CHKERRQ(ierr); 404 } else { 405 mbs = a->mbs; 406 ii = a->i; 407 z = zarray; 408 } 409 410 for (i=0; i<mbs; i++) { 411 n = ii[1] - ii[0]; ii++; 412 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 413 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 414 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 415 for (j=0; j<n; j++) { 416 xb = x + 3*(*idx++); 417 x1 = xb[0]; 418 x2 = xb[1]; 419 x3 = xb[2]; 420 421 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 422 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 423 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 424 v += 9; 425 } 426 if (usecprow) z = zarray + 3*ridx[i]; 427 z[0] = sum1; z[1] = sum2; z[2] = sum3; 428 if (!usecprow) z += 3; 429 } 430 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 431 ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 432 ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr); 433 PetscFunctionReturn(0); 434 } 435 436 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz) 437 { 438 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 439 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray; 440 const PetscScalar *x,*xb; 441 const MatScalar *v; 442 PetscErrorCode ierr; 443 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 444 PetscBool usecprow=a->compressedrow.use; 445 446 PetscFunctionBegin; 447 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 448 ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 449 450 idx = a->j; 451 v = a->a; 452 if (usecprow) { 453 mbs = a->compressedrow.nrows; 454 ii = a->compressedrow.i; 455 ridx = a->compressedrow.rindex; 456 ierr = PetscArrayzero(zarray,4*a->mbs);CHKERRQ(ierr); 457 } else { 458 mbs = a->mbs; 459 ii = a->i; 460 z = zarray; 461 } 462 463 for (i=0; i<mbs; i++) { 464 n = ii[1] - ii[0]; 465 ii++; 466 sum1 = 0.0; 467 sum2 = 0.0; 468 sum3 = 0.0; 469 sum4 = 0.0; 470 471 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 472 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 473 for (j=0; j<n; j++) { 474 xb = x + 4*(*idx++); 475 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 476 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 477 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 478 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 479 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 480 v += 16; 481 } 482 if (usecprow) z = zarray + 4*ridx[i]; 483 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 484 if (!usecprow) z += 4; 485 } 486 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 487 ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 488 ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr); 489 PetscFunctionReturn(0); 490 } 491 492 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz) 493 { 494 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 495 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*zarray; 496 const PetscScalar *xb,*x; 497 const MatScalar *v; 498 PetscErrorCode ierr; 499 const PetscInt *idx,*ii,*ridx=NULL; 500 PetscInt mbs,i,j,n; 501 PetscBool usecprow=a->compressedrow.use; 502 503 PetscFunctionBegin; 504 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 505 ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 506 507 idx = a->j; 508 v = a->a; 509 if (usecprow) { 510 mbs = a->compressedrow.nrows; 511 ii = a->compressedrow.i; 512 ridx = a->compressedrow.rindex; 513 ierr = PetscArrayzero(zarray,5*a->mbs);CHKERRQ(ierr); 514 } else { 515 mbs = a->mbs; 516 ii = a->i; 517 z = zarray; 518 } 519 520 for (i=0; i<mbs; i++) { 521 n = ii[1] - ii[0]; ii++; 522 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 523 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 524 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 525 for (j=0; j<n; j++) { 526 xb = x + 5*(*idx++); 527 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 528 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 529 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 530 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 531 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 532 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 533 v += 25; 534 } 535 if (usecprow) z = zarray + 5*ridx[i]; 536 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 537 if (!usecprow) z += 5; 538 } 539 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 540 ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 541 ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr); 542 PetscFunctionReturn(0); 543 } 544 545 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz) 546 { 547 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 548 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6; 549 const PetscScalar *x,*xb; 550 PetscScalar x1,x2,x3,x4,x5,x6,*zarray; 551 const MatScalar *v; 552 PetscErrorCode ierr; 553 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 554 PetscBool usecprow=a->compressedrow.use; 555 556 PetscFunctionBegin; 557 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 558 ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 559 560 idx = a->j; 561 v = a->a; 562 if (usecprow) { 563 mbs = a->compressedrow.nrows; 564 ii = a->compressedrow.i; 565 ridx = a->compressedrow.rindex; 566 ierr = PetscArrayzero(zarray,6*a->mbs);CHKERRQ(ierr); 567 } else { 568 mbs = a->mbs; 569 ii = a->i; 570 z = zarray; 571 } 572 573 for (i=0; i<mbs; i++) { 574 n = ii[1] - ii[0]; 575 ii++; 576 sum1 = 0.0; 577 sum2 = 0.0; 578 sum3 = 0.0; 579 sum4 = 0.0; 580 sum5 = 0.0; 581 sum6 = 0.0; 582 583 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 584 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 585 for (j=0; j<n; j++) { 586 xb = x + 6*(*idx++); 587 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 588 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 589 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 590 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 591 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 592 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 593 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 594 v += 36; 595 } 596 if (usecprow) z = zarray + 6*ridx[i]; 597 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 598 if (!usecprow) z += 6; 599 } 600 601 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 602 ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 603 ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr); 604 PetscFunctionReturn(0); 605 } 606 607 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz) 608 { 609 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 610 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 611 const PetscScalar *x,*xb; 612 PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray; 613 const MatScalar *v; 614 PetscErrorCode ierr; 615 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 616 PetscBool usecprow=a->compressedrow.use; 617 618 PetscFunctionBegin; 619 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 620 ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 621 622 idx = a->j; 623 v = a->a; 624 if (usecprow) { 625 mbs = a->compressedrow.nrows; 626 ii = a->compressedrow.i; 627 ridx = a->compressedrow.rindex; 628 ierr = PetscArrayzero(zarray,7*a->mbs);CHKERRQ(ierr); 629 } else { 630 mbs = a->mbs; 631 ii = a->i; 632 z = zarray; 633 } 634 635 for (i=0; i<mbs; i++) { 636 n = ii[1] - ii[0]; 637 ii++; 638 sum1 = 0.0; 639 sum2 = 0.0; 640 sum3 = 0.0; 641 sum4 = 0.0; 642 sum5 = 0.0; 643 sum6 = 0.0; 644 sum7 = 0.0; 645 646 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 647 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 648 for (j=0; j<n; j++) { 649 xb = x + 7*(*idx++); 650 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 651 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 652 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 653 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 654 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 655 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 656 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 657 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 658 v += 49; 659 } 660 if (usecprow) z = zarray + 7*ridx[i]; 661 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 662 if (!usecprow) z += 7; 663 } 664 665 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 666 ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 667 ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr); 668 PetscFunctionReturn(0); 669 } 670 671 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 672 PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec zz) 673 { 674 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 675 PetscScalar *z = NULL,*work,*workt,*zarray; 676 const PetscScalar *x,*xb; 677 const MatScalar *v; 678 PetscErrorCode ierr; 679 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 680 const PetscInt *idx,*ii,*ridx=NULL; 681 PetscInt k; 682 PetscBool usecprow=a->compressedrow.use; 683 684 __m256d a0,a1,a2,a3,a4,a5; 685 __m256d w0,w1,w2,w3; 686 __m256d z0,z1,z2; 687 __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63); 688 689 PetscFunctionBegin; 690 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 691 ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 692 693 idx = a->j; 694 v = a->a; 695 if (usecprow) { 696 mbs = a->compressedrow.nrows; 697 ii = a->compressedrow.i; 698 ridx = a->compressedrow.rindex; 699 ierr = PetscArrayzero(zarray,bs*a->mbs);CHKERRQ(ierr); 700 } else { 701 mbs = a->mbs; 702 ii = a->i; 703 z = zarray; 704 } 705 706 if (!a->mult_work) { 707 k = PetscMax(A->rmap->n,A->cmap->n); 708 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 709 } 710 711 work = a->mult_work; 712 for (i=0; i<mbs; i++) { 713 n = ii[1] - ii[0]; ii++; 714 workt = work; 715 for (j=0; j<n; j++) { 716 xb = x + bs*(*idx++); 717 for (k=0; k<bs; k++) workt[k] = xb[k]; 718 workt += bs; 719 } 720 if (usecprow) z = zarray + bs*ridx[i]; 721 722 z0 = _mm256_setzero_pd(); z1 = _mm256_setzero_pd(); z2 = _mm256_setzero_pd(); 723 724 for (j=0; j<n; j++) { 725 /* first column of a */ 726 w0 = _mm256_set1_pd(work[j*9 ]); 727 a0 = _mm256_loadu_pd(&v[j*81 ]); z0 = _mm256_fmadd_pd(a0,w0,z0); 728 a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1); 729 a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2); 730 731 /* second column of a */ 732 w1 = _mm256_set1_pd(work[j*9+ 1]); 733 a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0); 734 a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1); 735 a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2); 736 737 /* third column of a */ 738 w2 = _mm256_set1_pd(work[j*9 +2]); 739 a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0); 740 a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1); 741 a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2); 742 743 /* fourth column of a */ 744 w3 = _mm256_set1_pd(work[j*9+ 3]); 745 a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0); 746 a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1); 747 a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2); 748 749 /* fifth column of a */ 750 w0 = _mm256_set1_pd(work[j*9+ 4]); 751 a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0); 752 a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1); 753 a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2); 754 755 /* sixth column of a */ 756 w1 = _mm256_set1_pd(work[j*9+ 5]); 757 a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0); 758 a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1); 759 a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2); 760 761 /* seventh column of a */ 762 w2 = _mm256_set1_pd(work[j*9+ 6]); 763 a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0); 764 a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1); 765 a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2); 766 767 /* eigth column of a */ 768 w3 = _mm256_set1_pd(work[j*9+ 7]); 769 a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0); 770 a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1); 771 a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2); 772 773 /* ninth column of a */ 774 w0 = _mm256_set1_pd(work[j*9+ 8]); 775 a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0); 776 a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1); 777 a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2); 778 } 779 780 _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2); 781 782 v += n*bs2; 783 if (!usecprow) z += bs; 784 } 785 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 786 ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 787 ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr); 788 PetscFunctionReturn(0); 789 } 790 #endif 791 792 PetscErrorCode MatMult_SeqBAIJ_11(Mat A,Vec xx,Vec zz) 793 { 794 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 795 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11; 796 const PetscScalar *x,*xb; 797 PetscScalar *zarray,xv; 798 const MatScalar *v; 799 PetscErrorCode ierr; 800 const PetscInt *ii,*ij=a->j,*idx; 801 PetscInt mbs,i,j,k,n,*ridx=NULL; 802 PetscBool usecprow=a->compressedrow.use; 803 804 PetscFunctionBegin; 805 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 806 ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 807 808 v = a->a; 809 if (usecprow) { 810 mbs = a->compressedrow.nrows; 811 ii = a->compressedrow.i; 812 ridx = a->compressedrow.rindex; 813 ierr = PetscArrayzero(zarray,11*a->mbs);CHKERRQ(ierr); 814 } else { 815 mbs = a->mbs; 816 ii = a->i; 817 z = zarray; 818 } 819 820 for (i=0; i<mbs; i++) { 821 n = ii[i+1] - ii[i]; 822 idx = ij + ii[i]; 823 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 824 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; 825 826 for (j=0; j<n; j++) { 827 xb = x + 11*(idx[j]); 828 829 for (k=0; k<11; k++) { 830 xv = xb[k]; 831 sum1 += v[0]*xv; 832 sum2 += v[1]*xv; 833 sum3 += v[2]*xv; 834 sum4 += v[3]*xv; 835 sum5 += v[4]*xv; 836 sum6 += v[5]*xv; 837 sum7 += v[6]*xv; 838 sum8 += v[7]*xv; 839 sum9 += v[8]*xv; 840 sum10 += v[9]*xv; 841 sum11 += v[10]*xv; 842 v += 11; 843 } 844 } 845 if (usecprow) z = zarray + 11*ridx[i]; 846 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 847 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; 848 849 if (!usecprow) z += 11; 850 } 851 852 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 853 ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 854 ierr = PetscLogFlops(242.0*a->nz - 11.0*a->nonzerorowcnt);CHKERRQ(ierr); 855 PetscFunctionReturn(0); 856 } 857 858 /* MatMult_SeqBAIJ_12 version 1: Columns in the block are accessed one at a time */ 859 PetscErrorCode MatMult_SeqBAIJ_12_ver1(Mat A,Vec xx,Vec zz) 860 { 861 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 862 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12; 863 const PetscScalar *x,*xb; 864 PetscScalar *zarray,xv; 865 const MatScalar *v; 866 PetscErrorCode ierr; 867 const PetscInt *ii,*ij=a->j,*idx; 868 PetscInt mbs,i,j,k,n,*ridx=NULL; 869 PetscBool usecprow=a->compressedrow.use; 870 871 PetscFunctionBegin; 872 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 873 ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 874 875 v = a->a; 876 if (usecprow) { 877 mbs = a->compressedrow.nrows; 878 ii = a->compressedrow.i; 879 ridx = a->compressedrow.rindex; 880 ierr = PetscArrayzero(zarray,12*a->mbs);CHKERRQ(ierr); 881 } else { 882 mbs = a->mbs; 883 ii = a->i; 884 z = zarray; 885 } 886 887 for (i=0; i<mbs; i++) { 888 n = ii[i+1] - ii[i]; 889 idx = ij + ii[i]; 890 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 891 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; 892 893 for (j=0; j<n; j++) { 894 xb = x + 12*(idx[j]); 895 896 for (k=0; k<12; k++) { 897 xv = xb[k]; 898 sum1 += v[0]*xv; 899 sum2 += v[1]*xv; 900 sum3 += v[2]*xv; 901 sum4 += v[3]*xv; 902 sum5 += v[4]*xv; 903 sum6 += v[5]*xv; 904 sum7 += v[6]*xv; 905 sum8 += v[7]*xv; 906 sum9 += v[8]*xv; 907 sum10 += v[9]*xv; 908 sum11 += v[10]*xv; 909 sum12 += v[11]*xv; 910 v += 12; 911 } 912 } 913 if (usecprow) z = zarray + 12*ridx[i]; 914 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 915 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; 916 if (!usecprow) z += 12; 917 } 918 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 919 ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 920 ierr = PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt);CHKERRQ(ierr); 921 PetscFunctionReturn(0); 922 } 923 924 PetscErrorCode MatMultAdd_SeqBAIJ_12_ver1(Mat A,Vec xx,Vec yy,Vec zz) 925 { 926 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 927 PetscScalar *z = NULL,*y = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12; 928 const PetscScalar *x,*xb; 929 PetscScalar *zarray,*yarray,xv; 930 const MatScalar *v; 931 PetscErrorCode ierr; 932 const PetscInt *ii,*ij=a->j,*idx; 933 PetscInt mbs = a->mbs,i,j,k,n,*ridx=NULL; 934 PetscBool usecprow=a->compressedrow.use; 935 936 PetscFunctionBegin; 937 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 938 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 939 940 v = a->a; 941 if (usecprow) { 942 if (zz != yy) { 943 ierr = PetscArraycpy(zarray,yarray,12*mbs);CHKERRQ(ierr); 944 } 945 mbs = a->compressedrow.nrows; 946 ii = a->compressedrow.i; 947 ridx = a->compressedrow.rindex; 948 } else { 949 ii = a->i; 950 y = yarray; 951 z = zarray; 952 } 953 954 for (i=0; i<mbs; i++) { 955 n = ii[i+1] - ii[i]; 956 idx = ij + ii[i]; 957 958 if (usecprow) { 959 y = yarray + 12*ridx[i]; 960 z = zarray + 12*ridx[i]; 961 } 962 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 963 sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10]; sum12 = y[11]; 964 965 for (j=0; j<n; j++) { 966 xb = x + 12*(idx[j]); 967 968 for (k=0; k<12; k++) { 969 xv = xb[k]; 970 sum1 += v[0]*xv; 971 sum2 += v[1]*xv; 972 sum3 += v[2]*xv; 973 sum4 += v[3]*xv; 974 sum5 += v[4]*xv; 975 sum6 += v[5]*xv; 976 sum7 += v[6]*xv; 977 sum8 += v[7]*xv; 978 sum9 += v[8]*xv; 979 sum10 += v[9]*xv; 980 sum11 += v[10]*xv; 981 sum12 += v[11]*xv; 982 v += 12; 983 } 984 } 985 986 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 987 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; 988 if (!usecprow) { 989 y += 12; 990 z += 12; 991 } 992 } 993 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 994 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 995 ierr = PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt);CHKERRQ(ierr); 996 PetscFunctionReturn(0); 997 } 998 999 /* MatMult_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */ 1000 PetscErrorCode MatMult_SeqBAIJ_12_ver2(Mat A,Vec xx,Vec zz) 1001 { 1002 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1003 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12; 1004 const PetscScalar *x,*xb; 1005 PetscScalar x1,x2,x3,x4,*zarray; 1006 const MatScalar *v; 1007 PetscErrorCode ierr; 1008 const PetscInt *ii,*ij=a->j,*idx,*ridx=NULL; 1009 PetscInt mbs,i,j,n; 1010 PetscBool usecprow=a->compressedrow.use; 1011 1012 PetscFunctionBegin; 1013 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1014 ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 1015 1016 v = a->a; 1017 if (usecprow) { 1018 mbs = a->compressedrow.nrows; 1019 ii = a->compressedrow.i; 1020 ridx = a->compressedrow.rindex; 1021 ierr = PetscArrayzero(zarray,12*a->mbs);CHKERRQ(ierr); 1022 } else { 1023 mbs = a->mbs; 1024 ii = a->i; 1025 z = zarray; 1026 } 1027 1028 for (i=0; i<mbs; i++) { 1029 n = ii[i+1] - ii[i]; 1030 idx = ij + ii[i]; 1031 1032 sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = sum8 = sum9 = sum10 = sum11 = sum12 = 0; 1033 for (j=0; j<n; j++) { 1034 xb = x + 12*(idx[j]); 1035 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1036 1037 sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4; 1038 sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4; 1039 sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4; 1040 sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4; 1041 sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4; 1042 sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4; 1043 sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4; 1044 sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4; 1045 sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4; 1046 sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4; 1047 sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4; 1048 sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4; 1049 v += 48; 1050 1051 x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7]; 1052 1053 sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4; 1054 sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4; 1055 sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4; 1056 sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4; 1057 sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4; 1058 sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4; 1059 sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4; 1060 sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4; 1061 sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4; 1062 sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4; 1063 sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4; 1064 sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4; 1065 v += 48; 1066 1067 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; 1068 sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4; 1069 sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4; 1070 sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4; 1071 sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4; 1072 sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4; 1073 sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4; 1074 sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4; 1075 sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4; 1076 sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4; 1077 sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4; 1078 sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4; 1079 sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4; 1080 v += 48; 1081 1082 } 1083 if (usecprow) z = zarray + 12*ridx[i]; 1084 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1085 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; 1086 if (!usecprow) z += 12; 1087 } 1088 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1089 ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 1090 ierr = PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt);CHKERRQ(ierr); 1091 PetscFunctionReturn(0); 1092 } 1093 1094 /* MatMultAdd_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */ 1095 PetscErrorCode MatMultAdd_SeqBAIJ_12_ver2(Mat A,Vec xx,Vec yy,Vec zz) 1096 { 1097 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1098 PetscScalar *z = NULL,*y = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12; 1099 const PetscScalar *x,*xb; 1100 PetscScalar x1,x2,x3,x4,*zarray,*yarray; 1101 const MatScalar *v; 1102 PetscErrorCode ierr; 1103 const PetscInt *ii,*ij=a->j,*idx,*ridx=NULL; 1104 PetscInt mbs = a->mbs,i,j,n; 1105 PetscBool usecprow=a->compressedrow.use; 1106 1107 PetscFunctionBegin; 1108 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1109 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1110 1111 v = a->a; 1112 if (usecprow) { 1113 if (zz != yy) { 1114 ierr = PetscArraycpy(zarray,yarray,12*mbs);CHKERRQ(ierr); 1115 } 1116 mbs = a->compressedrow.nrows; 1117 ii = a->compressedrow.i; 1118 ridx = a->compressedrow.rindex; 1119 } else { 1120 ii = a->i; 1121 y = yarray; 1122 z = zarray; 1123 } 1124 1125 for (i=0; i<mbs; i++) { 1126 n = ii[i+1] - ii[i]; 1127 idx = ij + ii[i]; 1128 1129 if (usecprow) { 1130 y = yarray + 12*ridx[i]; 1131 z = zarray + 12*ridx[i]; 1132 } 1133 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 1134 sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10]; sum12 = y[11]; 1135 1136 for (j=0; j<n; j++) { 1137 xb = x + 12*(idx[j]); 1138 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1139 1140 sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4; 1141 sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4; 1142 sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4; 1143 sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4; 1144 sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4; 1145 sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4; 1146 sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4; 1147 sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4; 1148 sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4; 1149 sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4; 1150 sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4; 1151 sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4; 1152 v += 48; 1153 1154 x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7]; 1155 1156 sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4; 1157 sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4; 1158 sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4; 1159 sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4; 1160 sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4; 1161 sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4; 1162 sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4; 1163 sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4; 1164 sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4; 1165 sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4; 1166 sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4; 1167 sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4; 1168 v += 48; 1169 1170 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; 1171 sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4; 1172 sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4; 1173 sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4; 1174 sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4; 1175 sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4; 1176 sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4; 1177 sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4; 1178 sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4; 1179 sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4; 1180 sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4; 1181 sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4; 1182 sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4; 1183 v += 48; 1184 1185 } 1186 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1187 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; 1188 if (!usecprow) { 1189 y += 12; 1190 z += 12; 1191 } 1192 } 1193 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1194 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1195 ierr = PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt);CHKERRQ(ierr); 1196 PetscFunctionReturn(0); 1197 } 1198 1199 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 1200 PetscErrorCode MatMult_SeqBAIJ_12_AVX2(Mat A,Vec xx,Vec zz) 1201 { 1202 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1203 PetscScalar *z = NULL,*zarray; 1204 const PetscScalar *x,*work; 1205 const MatScalar *v = a->a; 1206 PetscErrorCode ierr; 1207 PetscInt mbs,i,j,n; 1208 const PetscInt *idx = a->j,*ii,*ridx=NULL; 1209 PetscBool usecprow=a->compressedrow.use; 1210 const PetscInt bs = 12, bs2 = 144; 1211 1212 __m256d a0,a1,a2,a3,a4,a5; 1213 __m256d w0,w1,w2,w3; 1214 __m256d z0,z1,z2; 1215 1216 PetscFunctionBegin; 1217 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1218 ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 1219 1220 if (usecprow) { 1221 mbs = a->compressedrow.nrows; 1222 ii = a->compressedrow.i; 1223 ridx = a->compressedrow.rindex; 1224 ierr = PetscArrayzero(zarray,bs*a->mbs);CHKERRQ(ierr); 1225 } else { 1226 mbs = a->mbs; 1227 ii = a->i; 1228 z = zarray; 1229 } 1230 1231 for (i=0; i<mbs; i++) { 1232 z0 = _mm256_setzero_pd(); z1 = _mm256_setzero_pd(); z2 = _mm256_setzero_pd(); 1233 1234 n = ii[1] - ii[0]; ii++; 1235 for (j=0; j<n; j++) { 1236 work = x + bs*(*idx++); 1237 1238 /* first column of a */ 1239 w0 = _mm256_set1_pd(work[0]); 1240 a0 = _mm256_loadu_pd(v+0); z0 = _mm256_fmadd_pd(a0,w0,z0); 1241 a1 = _mm256_loadu_pd(v+4); z1 = _mm256_fmadd_pd(a1,w0,z1); 1242 a2 = _mm256_loadu_pd(v+8); z2 = _mm256_fmadd_pd(a2,w0,z2); 1243 1244 /* second column of a */ 1245 w1 = _mm256_set1_pd(work[1]); 1246 a3 = _mm256_loadu_pd(v+12); z0 = _mm256_fmadd_pd(a3,w1,z0); 1247 a4 = _mm256_loadu_pd(v+16); z1 = _mm256_fmadd_pd(a4,w1,z1); 1248 a5 = _mm256_loadu_pd(v+20); z2 = _mm256_fmadd_pd(a5,w1,z2); 1249 1250 /* third column of a */ 1251 w2 = _mm256_set1_pd(work[2]); 1252 a0 = _mm256_loadu_pd(v+24); z0 = _mm256_fmadd_pd(a0,w2,z0); 1253 a1 = _mm256_loadu_pd(v+28); z1 = _mm256_fmadd_pd(a1,w2,z1); 1254 a2 = _mm256_loadu_pd(v+32); z2 = _mm256_fmadd_pd(a2,w2,z2); 1255 1256 /* fourth column of a */ 1257 w3 = _mm256_set1_pd(work[3]); 1258 a3 = _mm256_loadu_pd(v+36); z0 = _mm256_fmadd_pd(a3,w3,z0); 1259 a4 = _mm256_loadu_pd(v+40); z1 = _mm256_fmadd_pd(a4,w3,z1); 1260 a5 = _mm256_loadu_pd(v+44); z2 = _mm256_fmadd_pd(a5,w3,z2); 1261 1262 /* fifth column of a */ 1263 w0 = _mm256_set1_pd(work[4]); 1264 a0 = _mm256_loadu_pd(v+48); z0 = _mm256_fmadd_pd(a0,w0,z0); 1265 a1 = _mm256_loadu_pd(v+52); z1 = _mm256_fmadd_pd(a1,w0,z1); 1266 a2 = _mm256_loadu_pd(v+56); z2 = _mm256_fmadd_pd(a2,w0,z2); 1267 1268 /* sixth column of a */ 1269 w1 = _mm256_set1_pd(work[5]); 1270 a3 = _mm256_loadu_pd(v+60); z0 = _mm256_fmadd_pd(a3,w1,z0); 1271 a4 = _mm256_loadu_pd(v+64); z1 = _mm256_fmadd_pd(a4,w1,z1); 1272 a5 = _mm256_loadu_pd(v+68); z2 = _mm256_fmadd_pd(a5,w1,z2); 1273 1274 /* seventh column of a */ 1275 w2 = _mm256_set1_pd(work[6]); 1276 a0 = _mm256_loadu_pd(v+72); z0 = _mm256_fmadd_pd(a0,w2,z0); 1277 a1 = _mm256_loadu_pd(v+76); z1 = _mm256_fmadd_pd(a1,w2,z1); 1278 a2 = _mm256_loadu_pd(v+80); z2 = _mm256_fmadd_pd(a2,w2,z2); 1279 1280 /* eigth column of a */ 1281 w3 = _mm256_set1_pd(work[7]); 1282 a3 = _mm256_loadu_pd(v+84); z0 = _mm256_fmadd_pd(a3,w3,z0); 1283 a4 = _mm256_loadu_pd(v+88); z1 = _mm256_fmadd_pd(a4,w3,z1); 1284 a5 = _mm256_loadu_pd(v+92); z2 = _mm256_fmadd_pd(a5,w3,z2); 1285 1286 /* ninth column of a */ 1287 w0 = _mm256_set1_pd(work[8]); 1288 a0 = _mm256_loadu_pd(v+96); z0 = _mm256_fmadd_pd(a0,w0,z0); 1289 a1 = _mm256_loadu_pd(v+100); z1 = _mm256_fmadd_pd(a1,w0,z1); 1290 a2 = _mm256_loadu_pd(v+104); z2 = _mm256_fmadd_pd(a2,w0,z2); 1291 1292 /* tenth column of a */ 1293 w1 = _mm256_set1_pd(work[9]); 1294 a3 = _mm256_loadu_pd(v+108); z0 = _mm256_fmadd_pd(a3,w1,z0); 1295 a4 = _mm256_loadu_pd(v+112); z1 = _mm256_fmadd_pd(a4,w1,z1); 1296 a5 = _mm256_loadu_pd(v+116); z2 = _mm256_fmadd_pd(a5,w1,z2); 1297 1298 /* eleventh column of a */ 1299 w2 = _mm256_set1_pd(work[10]); 1300 a0 = _mm256_loadu_pd(v+120); z0 = _mm256_fmadd_pd(a0,w2,z0); 1301 a1 = _mm256_loadu_pd(v+124); z1 = _mm256_fmadd_pd(a1,w2,z1); 1302 a2 = _mm256_loadu_pd(v+128); z2 = _mm256_fmadd_pd(a2,w2,z2); 1303 1304 /* twelveth column of a */ 1305 w3 = _mm256_set1_pd(work[11]); 1306 a3 = _mm256_loadu_pd(v+132); z0 = _mm256_fmadd_pd(a3,w3,z0); 1307 a4 = _mm256_loadu_pd(v+136); z1 = _mm256_fmadd_pd(a4,w3,z1); 1308 a5 = _mm256_loadu_pd(v+140); z2 = _mm256_fmadd_pd(a5,w3,z2); 1309 1310 v += bs2; 1311 } 1312 if (usecprow) z = zarray + bs*ridx[i]; 1313 _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_storeu_pd(&z[ 8], z2); 1314 if (!usecprow) z += bs; 1315 } 1316 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1317 ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 1318 ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr); 1319 PetscFunctionReturn(0); 1320 } 1321 #endif 1322 1323 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */ 1324 /* Default MatMult for block size 15 */ 1325 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz) 1326 { 1327 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1328 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1329 const PetscScalar *x,*xb; 1330 PetscScalar *zarray,xv; 1331 const MatScalar *v; 1332 PetscErrorCode ierr; 1333 const PetscInt *ii,*ij=a->j,*idx; 1334 PetscInt mbs,i,j,k,n,*ridx=NULL; 1335 PetscBool usecprow=a->compressedrow.use; 1336 1337 PetscFunctionBegin; 1338 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1339 ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 1340 1341 v = a->a; 1342 if (usecprow) { 1343 mbs = a->compressedrow.nrows; 1344 ii = a->compressedrow.i; 1345 ridx = a->compressedrow.rindex; 1346 ierr = PetscArrayzero(zarray,15*a->mbs);CHKERRQ(ierr); 1347 } else { 1348 mbs = a->mbs; 1349 ii = a->i; 1350 z = zarray; 1351 } 1352 1353 for (i=0; i<mbs; i++) { 1354 n = ii[i+1] - ii[i]; 1355 idx = ij + ii[i]; 1356 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1357 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1358 1359 for (j=0; j<n; j++) { 1360 xb = x + 15*(idx[j]); 1361 1362 for (k=0; k<15; k++) { 1363 xv = xb[k]; 1364 sum1 += v[0]*xv; 1365 sum2 += v[1]*xv; 1366 sum3 += v[2]*xv; 1367 sum4 += v[3]*xv; 1368 sum5 += v[4]*xv; 1369 sum6 += v[5]*xv; 1370 sum7 += v[6]*xv; 1371 sum8 += v[7]*xv; 1372 sum9 += v[8]*xv; 1373 sum10 += v[9]*xv; 1374 sum11 += v[10]*xv; 1375 sum12 += v[11]*xv; 1376 sum13 += v[12]*xv; 1377 sum14 += v[13]*xv; 1378 sum15 += v[14]*xv; 1379 v += 15; 1380 } 1381 } 1382 if (usecprow) z = zarray + 15*ridx[i]; 1383 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1384 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1385 1386 if (!usecprow) z += 15; 1387 } 1388 1389 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1390 ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 1391 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 1392 PetscFunctionReturn(0); 1393 } 1394 1395 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */ 1396 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz) 1397 { 1398 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1399 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1400 const PetscScalar *x,*xb; 1401 PetscScalar x1,x2,x3,x4,*zarray; 1402 const MatScalar *v; 1403 PetscErrorCode ierr; 1404 const PetscInt *ii,*ij=a->j,*idx; 1405 PetscInt mbs,i,j,n,*ridx=NULL; 1406 PetscBool usecprow=a->compressedrow.use; 1407 1408 PetscFunctionBegin; 1409 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1410 ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 1411 1412 v = a->a; 1413 if (usecprow) { 1414 mbs = a->compressedrow.nrows; 1415 ii = a->compressedrow.i; 1416 ridx = a->compressedrow.rindex; 1417 ierr = PetscArrayzero(zarray,15*a->mbs);CHKERRQ(ierr); 1418 } else { 1419 mbs = a->mbs; 1420 ii = a->i; 1421 z = zarray; 1422 } 1423 1424 for (i=0; i<mbs; i++) { 1425 n = ii[i+1] - ii[i]; 1426 idx = ij + ii[i]; 1427 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1428 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1429 1430 for (j=0; j<n; j++) { 1431 xb = x + 15*(idx[j]); 1432 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1433 1434 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 1435 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 1436 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 1437 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 1438 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 1439 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 1440 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 1441 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 1442 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 1443 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 1444 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 1445 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 1446 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 1447 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 1448 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 1449 1450 v += 60; 1451 1452 x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7]; 1453 1454 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 1455 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 1456 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 1457 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 1458 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 1459 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 1460 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 1461 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 1462 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 1463 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 1464 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 1465 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 1466 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 1467 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 1468 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 1469 v += 60; 1470 1471 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; 1472 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 1473 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 1474 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 1475 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 1476 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 1477 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 1478 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 1479 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 1480 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 1481 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 1482 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 1483 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 1484 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 1485 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 1486 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 1487 v += 60; 1488 1489 x1 = xb[12]; x2 = xb[13]; x3 = xb[14]; 1490 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3; 1491 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3; 1492 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3; 1493 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3; 1494 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3; 1495 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3; 1496 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3; 1497 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3; 1498 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3; 1499 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3; 1500 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3; 1501 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3; 1502 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3; 1503 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3; 1504 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3; 1505 v += 45; 1506 } 1507 if (usecprow) z = zarray + 15*ridx[i]; 1508 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1509 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1510 1511 if (!usecprow) z += 15; 1512 } 1513 1514 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1515 ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 1516 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 1517 PetscFunctionReturn(0); 1518 } 1519 1520 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */ 1521 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz) 1522 { 1523 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1524 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1525 const PetscScalar *x,*xb; 1526 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,*zarray; 1527 const MatScalar *v; 1528 PetscErrorCode ierr; 1529 const PetscInt *ii,*ij=a->j,*idx; 1530 PetscInt mbs,i,j,n,*ridx=NULL; 1531 PetscBool usecprow=a->compressedrow.use; 1532 1533 PetscFunctionBegin; 1534 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1535 ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 1536 1537 v = a->a; 1538 if (usecprow) { 1539 mbs = a->compressedrow.nrows; 1540 ii = a->compressedrow.i; 1541 ridx = a->compressedrow.rindex; 1542 ierr = PetscArrayzero(zarray,15*a->mbs);CHKERRQ(ierr); 1543 } else { 1544 mbs = a->mbs; 1545 ii = a->i; 1546 z = zarray; 1547 } 1548 1549 for (i=0; i<mbs; i++) { 1550 n = ii[i+1] - ii[i]; 1551 idx = ij + ii[i]; 1552 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1553 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1554 1555 for (j=0; j<n; j++) { 1556 xb = x + 15*(idx[j]); 1557 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1558 x8 = xb[7]; 1559 1560 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8; 1561 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8; 1562 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8; 1563 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8; 1564 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8; 1565 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8; 1566 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8; 1567 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8; 1568 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8; 1569 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8; 1570 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8; 1571 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8; 1572 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8; 1573 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8; 1574 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8; 1575 v += 120; 1576 1577 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14]; 1578 1579 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7; 1580 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7; 1581 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7; 1582 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7; 1583 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7; 1584 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7; 1585 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7; 1586 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7; 1587 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7; 1588 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7; 1589 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7; 1590 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7; 1591 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7; 1592 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7; 1593 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7; 1594 v += 105; 1595 } 1596 if (usecprow) z = zarray + 15*ridx[i]; 1597 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1598 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1599 1600 if (!usecprow) z += 15; 1601 } 1602 1603 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1604 ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 1605 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 1606 PetscFunctionReturn(0); 1607 } 1608 1609 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */ 1610 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz) 1611 { 1612 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1613 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1614 const PetscScalar *x,*xb; 1615 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray; 1616 const MatScalar *v; 1617 PetscErrorCode ierr; 1618 const PetscInt *ii,*ij=a->j,*idx; 1619 PetscInt mbs,i,j,n,*ridx=NULL; 1620 PetscBool usecprow=a->compressedrow.use; 1621 1622 PetscFunctionBegin; 1623 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1624 ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 1625 1626 v = a->a; 1627 if (usecprow) { 1628 mbs = a->compressedrow.nrows; 1629 ii = a->compressedrow.i; 1630 ridx = a->compressedrow.rindex; 1631 ierr = PetscArrayzero(zarray,15*a->mbs);CHKERRQ(ierr); 1632 } else { 1633 mbs = a->mbs; 1634 ii = a->i; 1635 z = zarray; 1636 } 1637 1638 for (i=0; i<mbs; i++) { 1639 n = ii[i+1] - ii[i]; 1640 idx = ij + ii[i]; 1641 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1642 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1643 1644 for (j=0; j<n; j++) { 1645 xb = x + 15*(idx[j]); 1646 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1647 x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14]; 1648 1649 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 1650 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 1651 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 1652 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 1653 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 1654 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 1655 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 1656 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 1657 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 1658 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 1659 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 1660 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 1661 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 1662 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 1663 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 1664 v += 225; 1665 } 1666 if (usecprow) z = zarray + 15*ridx[i]; 1667 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1668 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1669 1670 if (!usecprow) z += 15; 1671 } 1672 1673 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1674 ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 1675 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 1676 PetscFunctionReturn(0); 1677 } 1678 1679 /* 1680 This will not work with MatScalar == float because it calls the BLAS 1681 */ 1682 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz) 1683 { 1684 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1685 PetscScalar *z = NULL,*work,*workt,*zarray; 1686 const PetscScalar *x,*xb; 1687 const MatScalar *v; 1688 PetscErrorCode ierr; 1689 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 1690 const PetscInt *idx,*ii,*ridx=NULL; 1691 PetscInt ncols,k; 1692 PetscBool usecprow=a->compressedrow.use; 1693 1694 PetscFunctionBegin; 1695 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1696 ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 1697 1698 idx = a->j; 1699 v = a->a; 1700 if (usecprow) { 1701 mbs = a->compressedrow.nrows; 1702 ii = a->compressedrow.i; 1703 ridx = a->compressedrow.rindex; 1704 ierr = PetscArrayzero(zarray,bs*a->mbs);CHKERRQ(ierr); 1705 } else { 1706 mbs = a->mbs; 1707 ii = a->i; 1708 z = zarray; 1709 } 1710 1711 if (!a->mult_work) { 1712 k = PetscMax(A->rmap->n,A->cmap->n); 1713 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 1714 } 1715 work = a->mult_work; 1716 for (i=0; i<mbs; i++) { 1717 n = ii[1] - ii[0]; ii++; 1718 ncols = n*bs; 1719 workt = work; 1720 for (j=0; j<n; j++) { 1721 xb = x + bs*(*idx++); 1722 for (k=0; k<bs; k++) workt[k] = xb[k]; 1723 workt += bs; 1724 } 1725 if (usecprow) z = zarray + bs*ridx[i]; 1726 PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z); 1727 v += n*bs2; 1728 if (!usecprow) z += bs; 1729 } 1730 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1731 ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 1732 ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr); 1733 PetscFunctionReturn(0); 1734 } 1735 1736 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 1737 { 1738 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1739 const PetscScalar *x; 1740 PetscScalar *y,*z,sum; 1741 const MatScalar *v; 1742 PetscErrorCode ierr; 1743 PetscInt mbs=a->mbs,i,n,*ridx=NULL; 1744 const PetscInt *idx,*ii; 1745 PetscBool usecprow=a->compressedrow.use; 1746 1747 PetscFunctionBegin; 1748 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1749 ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr); 1750 1751 idx = a->j; 1752 v = a->a; 1753 if (usecprow) { 1754 if (zz != yy) { 1755 ierr = PetscArraycpy(z,y,mbs);CHKERRQ(ierr); 1756 } 1757 mbs = a->compressedrow.nrows; 1758 ii = a->compressedrow.i; 1759 ridx = a->compressedrow.rindex; 1760 } else { 1761 ii = a->i; 1762 } 1763 1764 for (i=0; i<mbs; i++) { 1765 n = ii[1] - ii[0]; 1766 ii++; 1767 if (!usecprow) { 1768 sum = y[i]; 1769 } else { 1770 sum = y[ridx[i]]; 1771 } 1772 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1773 PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1774 PetscSparseDensePlusDot(sum,x,v,idx,n); 1775 v += n; 1776 idx += n; 1777 if (usecprow) { 1778 z[ridx[i]] = sum; 1779 } else { 1780 z[i] = sum; 1781 } 1782 } 1783 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1784 ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr); 1785 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1786 PetscFunctionReturn(0); 1787 } 1788 1789 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 1790 { 1791 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1792 PetscScalar *y = NULL,*z = NULL,sum1,sum2; 1793 const PetscScalar *x,*xb; 1794 PetscScalar x1,x2,*yarray,*zarray; 1795 const MatScalar *v; 1796 PetscErrorCode ierr; 1797 PetscInt mbs = a->mbs,i,n,j; 1798 const PetscInt *idx,*ii,*ridx = NULL; 1799 PetscBool usecprow = a->compressedrow.use; 1800 1801 PetscFunctionBegin; 1802 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1803 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1804 1805 idx = a->j; 1806 v = a->a; 1807 if (usecprow) { 1808 if (zz != yy) { 1809 ierr = PetscArraycpy(zarray,yarray,2*mbs);CHKERRQ(ierr); 1810 } 1811 mbs = a->compressedrow.nrows; 1812 ii = a->compressedrow.i; 1813 ridx = a->compressedrow.rindex; 1814 } else { 1815 ii = a->i; 1816 y = yarray; 1817 z = zarray; 1818 } 1819 1820 for (i=0; i<mbs; i++) { 1821 n = ii[1] - ii[0]; ii++; 1822 if (usecprow) { 1823 z = zarray + 2*ridx[i]; 1824 y = yarray + 2*ridx[i]; 1825 } 1826 sum1 = y[0]; sum2 = y[1]; 1827 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1828 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1829 for (j=0; j<n; j++) { 1830 xb = x + 2*(*idx++); 1831 x1 = xb[0]; 1832 x2 = xb[1]; 1833 1834 sum1 += v[0]*x1 + v[2]*x2; 1835 sum2 += v[1]*x1 + v[3]*x2; 1836 v += 4; 1837 } 1838 z[0] = sum1; z[1] = sum2; 1839 if (!usecprow) { 1840 z += 2; y += 2; 1841 } 1842 } 1843 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1844 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1845 ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr); 1846 PetscFunctionReturn(0); 1847 } 1848 1849 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 1850 { 1851 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1852 PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray; 1853 const PetscScalar *x,*xb; 1854 const MatScalar *v; 1855 PetscErrorCode ierr; 1856 PetscInt mbs = a->mbs,i,j,n; 1857 const PetscInt *idx,*ii,*ridx = NULL; 1858 PetscBool usecprow = a->compressedrow.use; 1859 1860 PetscFunctionBegin; 1861 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1862 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1863 1864 idx = a->j; 1865 v = a->a; 1866 if (usecprow) { 1867 if (zz != yy) { 1868 ierr = PetscArraycpy(zarray,yarray,3*mbs);CHKERRQ(ierr); 1869 } 1870 mbs = a->compressedrow.nrows; 1871 ii = a->compressedrow.i; 1872 ridx = a->compressedrow.rindex; 1873 } else { 1874 ii = a->i; 1875 y = yarray; 1876 z = zarray; 1877 } 1878 1879 for (i=0; i<mbs; i++) { 1880 n = ii[1] - ii[0]; ii++; 1881 if (usecprow) { 1882 z = zarray + 3*ridx[i]; 1883 y = yarray + 3*ridx[i]; 1884 } 1885 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; 1886 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1887 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1888 for (j=0; j<n; j++) { 1889 xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1890 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 1891 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 1892 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 1893 v += 9; 1894 } 1895 z[0] = sum1; z[1] = sum2; z[2] = sum3; 1896 if (!usecprow) { 1897 z += 3; y += 3; 1898 } 1899 } 1900 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1901 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1902 ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr); 1903 PetscFunctionReturn(0); 1904 } 1905 1906 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 1907 { 1908 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1909 PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray; 1910 const PetscScalar *x,*xb; 1911 const MatScalar *v; 1912 PetscErrorCode ierr; 1913 PetscInt mbs = a->mbs,i,j,n; 1914 const PetscInt *idx,*ii,*ridx=NULL; 1915 PetscBool usecprow=a->compressedrow.use; 1916 1917 PetscFunctionBegin; 1918 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1919 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1920 1921 idx = a->j; 1922 v = a->a; 1923 if (usecprow) { 1924 if (zz != yy) { 1925 ierr = PetscArraycpy(zarray,yarray,4*mbs);CHKERRQ(ierr); 1926 } 1927 mbs = a->compressedrow.nrows; 1928 ii = a->compressedrow.i; 1929 ridx = a->compressedrow.rindex; 1930 } else { 1931 ii = a->i; 1932 y = yarray; 1933 z = zarray; 1934 } 1935 1936 for (i=0; i<mbs; i++) { 1937 n = ii[1] - ii[0]; ii++; 1938 if (usecprow) { 1939 z = zarray + 4*ridx[i]; 1940 y = yarray + 4*ridx[i]; 1941 } 1942 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; 1943 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1944 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1945 for (j=0; j<n; j++) { 1946 xb = x + 4*(*idx++); 1947 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1948 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1949 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1950 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1951 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1952 v += 16; 1953 } 1954 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 1955 if (!usecprow) { 1956 z += 4; y += 4; 1957 } 1958 } 1959 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1960 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1961 ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr); 1962 PetscFunctionReturn(0); 1963 } 1964 1965 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 1966 { 1967 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1968 PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5; 1969 const PetscScalar *x,*xb; 1970 PetscScalar *yarray,*zarray; 1971 const MatScalar *v; 1972 PetscErrorCode ierr; 1973 PetscInt mbs = a->mbs,i,j,n; 1974 const PetscInt *idx,*ii,*ridx = NULL; 1975 PetscBool usecprow=a->compressedrow.use; 1976 1977 PetscFunctionBegin; 1978 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1979 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1980 1981 idx = a->j; 1982 v = a->a; 1983 if (usecprow) { 1984 if (zz != yy) { 1985 ierr = PetscArraycpy(zarray,yarray,5*mbs);CHKERRQ(ierr); 1986 } 1987 mbs = a->compressedrow.nrows; 1988 ii = a->compressedrow.i; 1989 ridx = a->compressedrow.rindex; 1990 } else { 1991 ii = a->i; 1992 y = yarray; 1993 z = zarray; 1994 } 1995 1996 for (i=0; i<mbs; i++) { 1997 n = ii[1] - ii[0]; ii++; 1998 if (usecprow) { 1999 z = zarray + 5*ridx[i]; 2000 y = yarray + 5*ridx[i]; 2001 } 2002 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; 2003 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2004 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2005 for (j=0; j<n; j++) { 2006 xb = x + 5*(*idx++); 2007 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 2008 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2009 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2010 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2011 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2012 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2013 v += 25; 2014 } 2015 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 2016 if (!usecprow) { 2017 z += 5; y += 5; 2018 } 2019 } 2020 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 2021 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 2022 ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr); 2023 PetscFunctionReturn(0); 2024 } 2025 2026 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 2027 { 2028 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2029 PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6; 2030 const PetscScalar *x,*xb; 2031 PetscScalar x1,x2,x3,x4,x5,x6,*yarray,*zarray; 2032 const MatScalar *v; 2033 PetscErrorCode ierr; 2034 PetscInt mbs = a->mbs,i,j,n; 2035 const PetscInt *idx,*ii,*ridx=NULL; 2036 PetscBool usecprow=a->compressedrow.use; 2037 2038 PetscFunctionBegin; 2039 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 2040 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 2041 2042 idx = a->j; 2043 v = a->a; 2044 if (usecprow) { 2045 if (zz != yy) { 2046 ierr = PetscArraycpy(zarray,yarray,6*mbs);CHKERRQ(ierr); 2047 } 2048 mbs = a->compressedrow.nrows; 2049 ii = a->compressedrow.i; 2050 ridx = a->compressedrow.rindex; 2051 } else { 2052 ii = a->i; 2053 y = yarray; 2054 z = zarray; 2055 } 2056 2057 for (i=0; i<mbs; i++) { 2058 n = ii[1] - ii[0]; ii++; 2059 if (usecprow) { 2060 z = zarray + 6*ridx[i]; 2061 y = yarray + 6*ridx[i]; 2062 } 2063 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; 2064 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2065 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2066 for (j=0; j<n; j++) { 2067 xb = x + 6*(*idx++); 2068 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 2069 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2070 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2071 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2072 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2073 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2074 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2075 v += 36; 2076 } 2077 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 2078 if (!usecprow) { 2079 z += 6; y += 6; 2080 } 2081 } 2082 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 2083 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 2084 ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr); 2085 PetscFunctionReturn(0); 2086 } 2087 2088 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 2089 { 2090 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2091 PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 2092 const PetscScalar *x,*xb; 2093 PetscScalar x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray; 2094 const MatScalar *v; 2095 PetscErrorCode ierr; 2096 PetscInt mbs = a->mbs,i,j,n; 2097 const PetscInt *idx,*ii,*ridx = NULL; 2098 PetscBool usecprow=a->compressedrow.use; 2099 2100 PetscFunctionBegin; 2101 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 2102 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 2103 2104 idx = a->j; 2105 v = a->a; 2106 if (usecprow) { 2107 if (zz != yy) { 2108 ierr = PetscArraycpy(zarray,yarray,7*mbs);CHKERRQ(ierr); 2109 } 2110 mbs = a->compressedrow.nrows; 2111 ii = a->compressedrow.i; 2112 ridx = a->compressedrow.rindex; 2113 } else { 2114 ii = a->i; 2115 y = yarray; 2116 z = zarray; 2117 } 2118 2119 for (i=0; i<mbs; i++) { 2120 n = ii[1] - ii[0]; ii++; 2121 if (usecprow) { 2122 z = zarray + 7*ridx[i]; 2123 y = yarray + 7*ridx[i]; 2124 } 2125 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 2126 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2127 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2128 for (j=0; j<n; j++) { 2129 xb = x + 7*(*idx++); 2130 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 2131 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2132 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2133 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2134 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2135 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2136 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2137 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2138 v += 49; 2139 } 2140 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 2141 if (!usecprow) { 2142 z += 7; y += 7; 2143 } 2144 } 2145 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 2146 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 2147 ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr); 2148 PetscFunctionReturn(0); 2149 } 2150 2151 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 2152 PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec yy,Vec zz) 2153 { 2154 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2155 PetscScalar *z = NULL,*work,*workt,*zarray; 2156 const PetscScalar *x,*xb; 2157 const MatScalar *v; 2158 PetscErrorCode ierr; 2159 PetscInt mbs,i,j,n; 2160 PetscInt k; 2161 PetscBool usecprow=a->compressedrow.use; 2162 const PetscInt *idx,*ii,*ridx=NULL,bs = 9, bs2 = 81; 2163 2164 __m256d a0,a1,a2,a3,a4,a5; 2165 __m256d w0,w1,w2,w3; 2166 __m256d z0,z1,z2; 2167 __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63); 2168 2169 PetscFunctionBegin; 2170 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 2171 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 2172 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 2173 2174 idx = a->j; 2175 v = a->a; 2176 if (usecprow) { 2177 mbs = a->compressedrow.nrows; 2178 ii = a->compressedrow.i; 2179 ridx = a->compressedrow.rindex; 2180 } else { 2181 mbs = a->mbs; 2182 ii = a->i; 2183 z = zarray; 2184 } 2185 2186 if (!a->mult_work) { 2187 k = PetscMax(A->rmap->n,A->cmap->n); 2188 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 2189 } 2190 2191 work = a->mult_work; 2192 for (i=0; i<mbs; i++) { 2193 n = ii[1] - ii[0]; ii++; 2194 workt = work; 2195 for (j=0; j<n; j++) { 2196 xb = x + bs*(*idx++); 2197 for (k=0; k<bs; k++) workt[k] = xb[k]; 2198 workt += bs; 2199 } 2200 if (usecprow) z = zarray + bs*ridx[i]; 2201 2202 z0 = _mm256_loadu_pd(&z[ 0]); z1 = _mm256_loadu_pd(&z[ 4]); z2 = _mm256_set1_pd(z[ 8]); 2203 2204 for (j=0; j<n; j++) { 2205 /* first column of a */ 2206 w0 = _mm256_set1_pd(work[j*9 ]); 2207 a0 = _mm256_loadu_pd(&v[j*81 ]); z0 = _mm256_fmadd_pd(a0,w0,z0); 2208 a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1); 2209 a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2); 2210 2211 /* second column of a */ 2212 w1 = _mm256_set1_pd(work[j*9+ 1]); 2213 a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0); 2214 a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1); 2215 a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2); 2216 2217 /* third column of a */ 2218 w2 = _mm256_set1_pd(work[j*9+ 2]); 2219 a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0); 2220 a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1); 2221 a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2); 2222 2223 /* fourth column of a */ 2224 w3 = _mm256_set1_pd(work[j*9+ 3]); 2225 a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0); 2226 a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1); 2227 a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2); 2228 2229 /* fifth column of a */ 2230 w0 = _mm256_set1_pd(work[j*9+ 4]); 2231 a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0); 2232 a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1); 2233 a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2); 2234 2235 /* sixth column of a */ 2236 w1 = _mm256_set1_pd(work[j*9+ 5]); 2237 a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0); 2238 a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1); 2239 a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2); 2240 2241 /* seventh column of a */ 2242 w2 = _mm256_set1_pd(work[j*9+ 6]); 2243 a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0); 2244 a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1); 2245 a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2); 2246 2247 /* eigth column of a */ 2248 w3 = _mm256_set1_pd(work[j*9+ 7]); 2249 a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0); 2250 a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1); 2251 a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2); 2252 2253 /* ninth column of a */ 2254 w0 = _mm256_set1_pd(work[j*9+ 8]); 2255 a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0); 2256 a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1); 2257 a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2); 2258 } 2259 2260 _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2); 2261 2262 v += n*bs2; 2263 if (!usecprow) z += bs; 2264 } 2265 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 2266 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 2267 ierr = PetscLogFlops(162.0*a->nz);CHKERRQ(ierr); 2268 PetscFunctionReturn(0); 2269 } 2270 #endif 2271 2272 PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A,Vec xx,Vec yy,Vec zz) 2273 { 2274 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2275 PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11; 2276 const PetscScalar *x,*xb; 2277 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,*yarray,*zarray; 2278 const MatScalar *v; 2279 PetscErrorCode ierr; 2280 PetscInt mbs = a->mbs,i,j,n; 2281 const PetscInt *idx,*ii,*ridx = NULL; 2282 PetscBool usecprow=a->compressedrow.use; 2283 2284 PetscFunctionBegin; 2285 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 2286 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 2287 2288 idx = a->j; 2289 v = a->a; 2290 if (usecprow) { 2291 if (zz != yy) { 2292 ierr = PetscArraycpy(zarray,yarray,7*mbs);CHKERRQ(ierr); 2293 } 2294 mbs = a->compressedrow.nrows; 2295 ii = a->compressedrow.i; 2296 ridx = a->compressedrow.rindex; 2297 } else { 2298 ii = a->i; 2299 y = yarray; 2300 z = zarray; 2301 } 2302 2303 for (i=0; i<mbs; i++) { 2304 n = ii[1] - ii[0]; ii++; 2305 if (usecprow) { 2306 z = zarray + 11*ridx[i]; 2307 y = yarray + 11*ridx[i]; 2308 } 2309 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 2310 sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10]; 2311 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2312 PetscPrefetchBlock(v+121*n,121*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2313 for (j=0; j<n; j++) { 2314 xb = x + 11*(*idx++); 2315 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; 2316 sum1 += v[ 0]*x1 + v[ 11]*x2 + v[2*11]*x3 + v[3*11]*x4 + v[4*11]*x5 + v[5*11]*x6 + v[6*11]*x7+ v[7*11]*x8 + v[8*11]*x9 + v[9*11]*x10 + v[10*11]*x11; 2317 sum2 += v[1+0]*x1 + v[1+11]*x2 + v[1+2*11]*x3 + v[1+3*11]*x4 + v[1+4*11]*x5 + v[1+5*11]*x6 + v[1+6*11]*x7+ v[1+7*11]*x8 + v[1+8*11]*x9 + v[1+9*11]*x10 + v[1+10*11]*x11; 2318 sum3 += v[2+0]*x1 + v[2+11]*x2 + v[2+2*11]*x3 + v[2+3*11]*x4 + v[2+4*11]*x5 + v[2+5*11]*x6 + v[2+6*11]*x7+ v[2+7*11]*x8 + v[2+8*11]*x9 + v[2+9*11]*x10 + v[2+10*11]*x11; 2319 sum4 += v[3+0]*x1 + v[3+11]*x2 + v[3+2*11]*x3 + v[3+3*11]*x4 + v[3+4*11]*x5 + v[3+5*11]*x6 + v[3+6*11]*x7+ v[3+7*11]*x8 + v[3+8*11]*x9 + v[3+9*11]*x10 + v[3+10*11]*x11; 2320 sum5 += v[4+0]*x1 + v[4+11]*x2 + v[4+2*11]*x3 + v[4+3*11]*x4 + v[4+4*11]*x5 + v[4+5*11]*x6 + v[4+6*11]*x7+ v[4+7*11]*x8 + v[4+8*11]*x9 + v[4+9*11]*x10 + v[4+10*11]*x11; 2321 sum6 += v[5+0]*x1 + v[5+11]*x2 + v[5+2*11]*x3 + v[5+3*11]*x4 + v[5+4*11]*x5 + v[5+5*11]*x6 + v[5+6*11]*x7+ v[5+7*11]*x8 + v[5+8*11]*x9 + v[5+9*11]*x10 + v[5+10*11]*x11; 2322 sum7 += v[6+0]*x1 + v[6+11]*x2 + v[6+2*11]*x3 + v[6+3*11]*x4 + v[6+4*11]*x5 + v[6+5*11]*x6 + v[6+6*11]*x7+ v[6+7*11]*x8 + v[6+8*11]*x9 + v[6+9*11]*x10 + v[6+10*11]*x11; 2323 sum8 += v[7+0]*x1 + v[7+11]*x2 + v[7+2*11]*x3 + v[7+3*11]*x4 + v[7+4*11]*x5 + v[7+5*11]*x6 + v[7+6*11]*x7+ v[7+7*11]*x8 + v[7+8*11]*x9 + v[7+9*11]*x10 + v[7+10*11]*x11; 2324 sum9 += v[8+0]*x1 + v[8+11]*x2 + v[8+2*11]*x3 + v[8+3*11]*x4 + v[8+4*11]*x5 + v[8+5*11]*x6 + v[8+6*11]*x7+ v[8+7*11]*x8 + v[8+8*11]*x9 + v[8+9*11]*x10 + v[8+10*11]*x11; 2325 sum10 += v[9+0]*x1 + v[9+11]*x2 + v[9+2*11]*x3 + v[9+3*11]*x4 + v[9+4*11]*x5 + v[9+5*11]*x6 + v[9+6*11]*x7+ v[9+7*11]*x8 + v[9+8*11]*x9 + v[9+9*11]*x10 + v[9+10*11]*x11; 2326 sum11 += v[10+0]*x1 + v[10+11]*x2 + v[10+2*11]*x3 + v[10+3*11]*x4 + v[10+4*11]*x5 + v[10+5*11]*x6 + v[10+6*11]*x7+ v[10+7*11]*x8 + v[10+8*11]*x9 + v[10+9*11]*x10 + v[10+10*11]*x11; 2327 v += 121; 2328 } 2329 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 2330 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; 2331 if (!usecprow) { 2332 z += 11; y += 11; 2333 } 2334 } 2335 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 2336 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 2337 ierr = PetscLogFlops(242.0*a->nz);CHKERRQ(ierr); 2338 PetscFunctionReturn(0); 2339 } 2340 2341 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 2342 { 2343 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2344 PetscScalar *z = NULL,*work,*workt,*zarray; 2345 const PetscScalar *x,*xb; 2346 const MatScalar *v; 2347 PetscErrorCode ierr; 2348 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 2349 PetscInt ncols,k; 2350 const PetscInt *ridx = NULL,*idx,*ii; 2351 PetscBool usecprow = a->compressedrow.use; 2352 2353 PetscFunctionBegin; 2354 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 2355 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 2356 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 2357 2358 idx = a->j; 2359 v = a->a; 2360 if (usecprow) { 2361 mbs = a->compressedrow.nrows; 2362 ii = a->compressedrow.i; 2363 ridx = a->compressedrow.rindex; 2364 } else { 2365 mbs = a->mbs; 2366 ii = a->i; 2367 z = zarray; 2368 } 2369 2370 if (!a->mult_work) { 2371 k = PetscMax(A->rmap->n,A->cmap->n); 2372 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 2373 } 2374 work = a->mult_work; 2375 for (i=0; i<mbs; i++) { 2376 n = ii[1] - ii[0]; ii++; 2377 ncols = n*bs; 2378 workt = work; 2379 for (j=0; j<n; j++) { 2380 xb = x + bs*(*idx++); 2381 for (k=0; k<bs; k++) workt[k] = xb[k]; 2382 workt += bs; 2383 } 2384 if (usecprow) z = zarray + bs*ridx[i]; 2385 PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 2386 v += n*bs2; 2387 if (!usecprow) z += bs; 2388 } 2389 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 2390 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 2391 ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr); 2392 PetscFunctionReturn(0); 2393 } 2394 2395 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 2396 { 2397 PetscScalar zero = 0.0; 2398 PetscErrorCode ierr; 2399 2400 PetscFunctionBegin; 2401 ierr = VecSet(zz,zero);CHKERRQ(ierr); 2402 ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 2403 PetscFunctionReturn(0); 2404 } 2405 2406 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 2407 { 2408 PetscScalar zero = 0.0; 2409 PetscErrorCode ierr; 2410 2411 PetscFunctionBegin; 2412 ierr = VecSet(zz,zero);CHKERRQ(ierr); 2413 ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 2414 PetscFunctionReturn(0); 2415 } 2416 2417 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 2418 { 2419 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2420 PetscScalar *z,x1,x2,x3,x4,x5; 2421 const PetscScalar *x,*xb = NULL; 2422 const MatScalar *v; 2423 PetscErrorCode ierr; 2424 PetscInt mbs,i,rval,bs=A->rmap->bs,j,n; 2425 const PetscInt *idx,*ii,*ib,*ridx = NULL; 2426 Mat_CompressedRow cprow = a->compressedrow; 2427 PetscBool usecprow = cprow.use; 2428 2429 PetscFunctionBegin; 2430 if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); } 2431 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 2432 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 2433 2434 idx = a->j; 2435 v = a->a; 2436 if (usecprow) { 2437 mbs = cprow.nrows; 2438 ii = cprow.i; 2439 ridx = cprow.rindex; 2440 } else { 2441 mbs=a->mbs; 2442 ii = a->i; 2443 xb = x; 2444 } 2445 2446 switch (bs) { 2447 case 1: 2448 for (i=0; i<mbs; i++) { 2449 if (usecprow) xb = x + ridx[i]; 2450 x1 = xb[0]; 2451 ib = idx + ii[0]; 2452 n = ii[1] - ii[0]; ii++; 2453 for (j=0; j<n; j++) { 2454 rval = ib[j]; 2455 z[rval] += PetscConj(*v) * x1; 2456 v++; 2457 } 2458 if (!usecprow) xb++; 2459 } 2460 break; 2461 case 2: 2462 for (i=0; i<mbs; i++) { 2463 if (usecprow) xb = x + 2*ridx[i]; 2464 x1 = xb[0]; x2 = xb[1]; 2465 ib = idx + ii[0]; 2466 n = ii[1] - ii[0]; ii++; 2467 for (j=0; j<n; j++) { 2468 rval = ib[j]*2; 2469 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2; 2470 z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2; 2471 v += 4; 2472 } 2473 if (!usecprow) xb += 2; 2474 } 2475 break; 2476 case 3: 2477 for (i=0; i<mbs; i++) { 2478 if (usecprow) xb = x + 3*ridx[i]; 2479 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2480 ib = idx + ii[0]; 2481 n = ii[1] - ii[0]; ii++; 2482 for (j=0; j<n; j++) { 2483 rval = ib[j]*3; 2484 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3; 2485 z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3; 2486 z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3; 2487 v += 9; 2488 } 2489 if (!usecprow) xb += 3; 2490 } 2491 break; 2492 case 4: 2493 for (i=0; i<mbs; i++) { 2494 if (usecprow) xb = x + 4*ridx[i]; 2495 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 2496 ib = idx + ii[0]; 2497 n = ii[1] - ii[0]; ii++; 2498 for (j=0; j<n; j++) { 2499 rval = ib[j]*4; 2500 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4; 2501 z[rval++] += PetscConj(v[4])*x1 + PetscConj(v[5])*x2 + PetscConj(v[6])*x3 + PetscConj(v[7])*x4; 2502 z[rval++] += PetscConj(v[8])*x1 + PetscConj(v[9])*x2 + PetscConj(v[10])*x3 + PetscConj(v[11])*x4; 2503 z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4; 2504 v += 16; 2505 } 2506 if (!usecprow) xb += 4; 2507 } 2508 break; 2509 case 5: 2510 for (i=0; i<mbs; i++) { 2511 if (usecprow) xb = x + 5*ridx[i]; 2512 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2513 x4 = xb[3]; x5 = xb[4]; 2514 ib = idx + ii[0]; 2515 n = ii[1] - ii[0]; ii++; 2516 for (j=0; j<n; j++) { 2517 rval = ib[j]*5; 2518 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4 + PetscConj(v[4])*x5; 2519 z[rval++] += PetscConj(v[5])*x1 + PetscConj(v[6])*x2 + PetscConj(v[7])*x3 + PetscConj(v[8])*x4 + PetscConj(v[9])*x5; 2520 z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5; 2521 z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5; 2522 z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5; 2523 v += 25; 2524 } 2525 if (!usecprow) xb += 5; 2526 } 2527 break; 2528 default: /* block sizes larger than 5 by 5 are handled by BLAS */ 2529 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet"); 2530 #if 0 2531 { 2532 PetscInt ncols,k,bs2=a->bs2; 2533 PetscScalar *work,*workt,zb; 2534 const PetscScalar *xtmp; 2535 if (!a->mult_work) { 2536 k = PetscMax(A->rmap->n,A->cmap->n); 2537 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 2538 } 2539 work = a->mult_work; 2540 xtmp = x; 2541 for (i=0; i<mbs; i++) { 2542 n = ii[1] - ii[0]; ii++; 2543 ncols = n*bs; 2544 ierr = PetscArrayzero(work,ncols);CHKERRQ(ierr); 2545 if (usecprow) xtmp = x + bs*ridx[i]; 2546 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 2547 v += n*bs2; 2548 if (!usecprow) xtmp += bs; 2549 workt = work; 2550 for (j=0; j<n; j++) { 2551 zb = z + bs*(*idx++); 2552 for (k=0; k<bs; k++) zb[k] += workt[k] ; 2553 workt += bs; 2554 } 2555 } 2556 } 2557 #endif 2558 } 2559 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 2560 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 2561 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 2562 PetscFunctionReturn(0); 2563 } 2564 2565 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 2566 { 2567 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2568 PetscScalar *zb,*z,x1,x2,x3,x4,x5; 2569 const PetscScalar *x,*xb = NULL; 2570 const MatScalar *v; 2571 PetscErrorCode ierr; 2572 PetscInt mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2; 2573 const PetscInt *idx,*ii,*ib,*ridx = NULL; 2574 Mat_CompressedRow cprow = a->compressedrow; 2575 PetscBool usecprow=cprow.use; 2576 2577 PetscFunctionBegin; 2578 if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); } 2579 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 2580 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 2581 2582 idx = a->j; 2583 v = a->a; 2584 if (usecprow) { 2585 mbs = cprow.nrows; 2586 ii = cprow.i; 2587 ridx = cprow.rindex; 2588 } else { 2589 mbs=a->mbs; 2590 ii = a->i; 2591 xb = x; 2592 } 2593 2594 switch (bs) { 2595 case 1: 2596 for (i=0; i<mbs; i++) { 2597 if (usecprow) xb = x + ridx[i]; 2598 x1 = xb[0]; 2599 ib = idx + ii[0]; 2600 n = ii[1] - ii[0]; ii++; 2601 for (j=0; j<n; j++) { 2602 rval = ib[j]; 2603 z[rval] += *v * x1; 2604 v++; 2605 } 2606 if (!usecprow) xb++; 2607 } 2608 break; 2609 case 2: 2610 for (i=0; i<mbs; i++) { 2611 if (usecprow) xb = x + 2*ridx[i]; 2612 x1 = xb[0]; x2 = xb[1]; 2613 ib = idx + ii[0]; 2614 n = ii[1] - ii[0]; ii++; 2615 for (j=0; j<n; j++) { 2616 rval = ib[j]*2; 2617 z[rval++] += v[0]*x1 + v[1]*x2; 2618 z[rval++] += v[2]*x1 + v[3]*x2; 2619 v += 4; 2620 } 2621 if (!usecprow) xb += 2; 2622 } 2623 break; 2624 case 3: 2625 for (i=0; i<mbs; i++) { 2626 if (usecprow) xb = x + 3*ridx[i]; 2627 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2628 ib = idx + ii[0]; 2629 n = ii[1] - ii[0]; ii++; 2630 for (j=0; j<n; j++) { 2631 rval = ib[j]*3; 2632 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3; 2633 z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3; 2634 z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3; 2635 v += 9; 2636 } 2637 if (!usecprow) xb += 3; 2638 } 2639 break; 2640 case 4: 2641 for (i=0; i<mbs; i++) { 2642 if (usecprow) xb = x + 4*ridx[i]; 2643 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 2644 ib = idx + ii[0]; 2645 n = ii[1] - ii[0]; ii++; 2646 for (j=0; j<n; j++) { 2647 rval = ib[j]*4; 2648 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 2649 z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 2650 z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 2651 z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 2652 v += 16; 2653 } 2654 if (!usecprow) xb += 4; 2655 } 2656 break; 2657 case 5: 2658 for (i=0; i<mbs; i++) { 2659 if (usecprow) xb = x + 5*ridx[i]; 2660 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2661 x4 = xb[3]; x5 = xb[4]; 2662 ib = idx + ii[0]; 2663 n = ii[1] - ii[0]; ii++; 2664 for (j=0; j<n; j++) { 2665 rval = ib[j]*5; 2666 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 2667 z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 2668 z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 2669 z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 2670 z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 2671 v += 25; 2672 } 2673 if (!usecprow) xb += 5; 2674 } 2675 break; 2676 default: { /* block sizes larger then 5 by 5 are handled by BLAS */ 2677 PetscInt ncols,k; 2678 PetscScalar *work,*workt; 2679 const PetscScalar *xtmp; 2680 if (!a->mult_work) { 2681 k = PetscMax(A->rmap->n,A->cmap->n); 2682 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 2683 } 2684 work = a->mult_work; 2685 xtmp = x; 2686 for (i=0; i<mbs; i++) { 2687 n = ii[1] - ii[0]; ii++; 2688 ncols = n*bs; 2689 ierr = PetscArrayzero(work,ncols);CHKERRQ(ierr); 2690 if (usecprow) xtmp = x + bs*ridx[i]; 2691 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 2692 v += n*bs2; 2693 if (!usecprow) xtmp += bs; 2694 workt = work; 2695 for (j=0; j<n; j++) { 2696 zb = z + bs*(*idx++); 2697 for (k=0; k<bs; k++) zb[k] += workt[k]; 2698 workt += bs; 2699 } 2700 } 2701 } 2702 } 2703 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 2704 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 2705 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 2706 PetscFunctionReturn(0); 2707 } 2708 2709 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha) 2710 { 2711 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data; 2712 PetscInt totalnz = a->bs2*a->nz; 2713 PetscScalar oalpha = alpha; 2714 PetscErrorCode ierr; 2715 PetscBLASInt one = 1,tnz; 2716 2717 PetscFunctionBegin; 2718 ierr = PetscBLASIntCast(totalnz,&tnz);CHKERRQ(ierr); 2719 PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one)); 2720 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 2721 PetscFunctionReturn(0); 2722 } 2723 2724 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm) 2725 { 2726 PetscErrorCode ierr; 2727 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2728 MatScalar *v = a->a; 2729 PetscReal sum = 0.0; 2730 PetscInt i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1; 2731 2732 PetscFunctionBegin; 2733 if (type == NORM_FROBENIUS) { 2734 #if defined(PETSC_USE_REAL___FP16) 2735 PetscBLASInt one = 1,cnt = bs2*nz; 2736 PetscStackCallBLAS("BLASnrm2",*norm = BLASnrm2_(&cnt,v,&one)); 2737 #else 2738 for (i=0; i<bs2*nz; i++) { 2739 sum += PetscRealPart(PetscConj(*v)*(*v)); v++; 2740 } 2741 #endif 2742 *norm = PetscSqrtReal(sum); 2743 ierr = PetscLogFlops(2.0*bs2*nz);CHKERRQ(ierr); 2744 } else if (type == NORM_1) { /* maximum column sum */ 2745 PetscReal *tmp; 2746 PetscInt *bcol = a->j; 2747 ierr = PetscCalloc1(A->cmap->n+1,&tmp);CHKERRQ(ierr); 2748 for (i=0; i<nz; i++) { 2749 for (j=0; j<bs; j++) { 2750 k1 = bs*(*bcol) + j; /* column index */ 2751 for (k=0; k<bs; k++) { 2752 tmp[k1] += PetscAbsScalar(*v); v++; 2753 } 2754 } 2755 bcol++; 2756 } 2757 *norm = 0.0; 2758 for (j=0; j<A->cmap->n; j++) { 2759 if (tmp[j] > *norm) *norm = tmp[j]; 2760 } 2761 ierr = PetscFree(tmp);CHKERRQ(ierr); 2762 ierr = PetscLogFlops(PetscMax(bs2*nz-1,0));CHKERRQ(ierr); 2763 } else if (type == NORM_INFINITY) { /* maximum row sum */ 2764 *norm = 0.0; 2765 for (k=0; k<bs; k++) { 2766 for (j=0; j<a->mbs; j++) { 2767 v = a->a + bs2*a->i[j] + k; 2768 sum = 0.0; 2769 for (i=0; i<a->i[j+1]-a->i[j]; i++) { 2770 for (k1=0; k1<bs; k1++) { 2771 sum += PetscAbsScalar(*v); 2772 v += bs; 2773 } 2774 } 2775 if (sum > *norm) *norm = sum; 2776 } 2777 } 2778 ierr = PetscLogFlops(PetscMax(bs2*nz-1,0));CHKERRQ(ierr); 2779 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet"); 2780 PetscFunctionReturn(0); 2781 } 2782 2783 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg) 2784 { 2785 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data; 2786 PetscErrorCode ierr; 2787 2788 PetscFunctionBegin; 2789 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 2790 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 2791 *flg = PETSC_FALSE; 2792 PetscFunctionReturn(0); 2793 } 2794 2795 /* if the a->i are the same */ 2796 ierr = PetscArraycmp(a->i,b->i,a->mbs+1,flg);CHKERRQ(ierr); 2797 if (!*flg) PetscFunctionReturn(0); 2798 2799 /* if a->j are the same */ 2800 ierr = PetscArraycmp(a->j,b->j,a->nz,flg);CHKERRQ(ierr); 2801 if (!*flg) PetscFunctionReturn(0); 2802 2803 /* if a->a are the same */ 2804 ierr = PetscArraycmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs),flg);CHKERRQ(ierr); 2805 PetscFunctionReturn(0); 2806 2807 } 2808 2809 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v) 2810 { 2811 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2812 PetscErrorCode ierr; 2813 PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 2814 PetscScalar *x,zero = 0.0; 2815 MatScalar *aa,*aa_j; 2816 2817 PetscFunctionBegin; 2818 if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 2819 bs = A->rmap->bs; 2820 aa = a->a; 2821 ai = a->i; 2822 aj = a->j; 2823 ambs = a->mbs; 2824 bs2 = a->bs2; 2825 2826 ierr = VecSet(v,zero);CHKERRQ(ierr); 2827 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 2828 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 2829 if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 2830 for (i=0; i<ambs; i++) { 2831 for (j=ai[i]; j<ai[i+1]; j++) { 2832 if (aj[j] == i) { 2833 row = i*bs; 2834 aa_j = aa+j*bs2; 2835 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 2836 break; 2837 } 2838 } 2839 } 2840 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 2841 PetscFunctionReturn(0); 2842 } 2843 2844 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr) 2845 { 2846 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2847 const PetscScalar *l,*r,*li,*ri; 2848 PetscScalar x; 2849 MatScalar *aa, *v; 2850 PetscErrorCode ierr; 2851 PetscInt i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai; 2852 const PetscInt *ai,*aj; 2853 2854 PetscFunctionBegin; 2855 ai = a->i; 2856 aj = a->j; 2857 aa = a->a; 2858 m = A->rmap->n; 2859 n = A->cmap->n; 2860 bs = A->rmap->bs; 2861 mbs = a->mbs; 2862 bs2 = a->bs2; 2863 if (ll) { 2864 ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr); 2865 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 2866 if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 2867 for (i=0; i<mbs; i++) { /* for each block row */ 2868 M = ai[i+1] - ai[i]; 2869 li = l + i*bs; 2870 v = aa + bs2*ai[i]; 2871 for (j=0; j<M; j++) { /* for each block */ 2872 for (k=0; k<bs2; k++) { 2873 (*v++) *= li[k%bs]; 2874 } 2875 } 2876 } 2877 ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr); 2878 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2879 } 2880 2881 if (rr) { 2882 ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr); 2883 ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr); 2884 if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length"); 2885 for (i=0; i<mbs; i++) { /* for each block row */ 2886 iai = ai[i]; 2887 M = ai[i+1] - iai; 2888 v = aa + bs2*iai; 2889 for (j=0; j<M; j++) { /* for each block */ 2890 ri = r + bs*aj[iai+j]; 2891 for (k=0; k<bs; k++) { 2892 x = ri[k]; 2893 for (tmp=0; tmp<bs; tmp++) v[tmp] *= x; 2894 v += bs; 2895 } 2896 } 2897 } 2898 ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr); 2899 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2900 } 2901 PetscFunctionReturn(0); 2902 } 2903 2904 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info) 2905 { 2906 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2907 2908 PetscFunctionBegin; 2909 info->block_size = a->bs2; 2910 info->nz_allocated = a->bs2*a->maxnz; 2911 info->nz_used = a->bs2*a->nz; 2912 info->nz_unneeded = info->nz_allocated - info->nz_used; 2913 info->assemblies = A->num_ass; 2914 info->mallocs = A->info.mallocs; 2915 info->memory = ((PetscObject)A)->mem; 2916 if (A->factortype) { 2917 info->fill_ratio_given = A->info.fill_ratio_given; 2918 info->fill_ratio_needed = A->info.fill_ratio_needed; 2919 info->factor_mallocs = A->info.factor_mallocs; 2920 } else { 2921 info->fill_ratio_given = 0; 2922 info->fill_ratio_needed = 0; 2923 info->factor_mallocs = 0; 2924 } 2925 PetscFunctionReturn(0); 2926 } 2927 2928 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) 2929 { 2930 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2931 PetscErrorCode ierr; 2932 2933 PetscFunctionBegin; 2934 ierr = PetscArrayzero(a->a,a->bs2*a->i[a->mbs]);CHKERRQ(ierr); 2935 PetscFunctionReturn(0); 2936 } 2937 2938 PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C) 2939 { 2940 PetscErrorCode ierr; 2941 2942 PetscFunctionBegin; 2943 ierr = MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);CHKERRQ(ierr); 2944 C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense; 2945 PetscFunctionReturn(0); 2946 } 2947 2948 PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 2949 { 2950 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2951 PetscScalar *z = NULL,sum1; 2952 const PetscScalar *xb; 2953 PetscScalar x1; 2954 const MatScalar *v,*vv; 2955 PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL; 2956 PetscBool usecprow=a->compressedrow.use; 2957 2958 PetscFunctionBegin; 2959 idx = a->j; 2960 v = a->a; 2961 if (usecprow) { 2962 mbs = a->compressedrow.nrows; 2963 ii = a->compressedrow.i; 2964 ridx = a->compressedrow.rindex; 2965 } else { 2966 mbs = a->mbs; 2967 ii = a->i; 2968 z = c; 2969 } 2970 2971 for (i=0; i<mbs; i++) { 2972 n = ii[1] - ii[0]; ii++; 2973 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2974 PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2975 if (usecprow) z = c + ridx[i]; 2976 jj = idx; 2977 vv = v; 2978 for (k=0; k<cn; k++) { 2979 idx = jj; 2980 v = vv; 2981 sum1 = 0.0; 2982 for (j=0; j<n; j++) { 2983 xb = b + (*idx++); x1 = xb[0+k*bm]; 2984 sum1 += v[0]*x1; 2985 v += 1; 2986 } 2987 z[0+k*cm] = sum1; 2988 } 2989 if (!usecprow) z += 1; 2990 } 2991 PetscFunctionReturn(0); 2992 } 2993 2994 PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 2995 { 2996 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2997 PetscScalar *z = NULL,sum1,sum2; 2998 const PetscScalar *xb; 2999 PetscScalar x1,x2; 3000 const MatScalar *v,*vv; 3001 PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL; 3002 PetscBool usecprow=a->compressedrow.use; 3003 3004 PetscFunctionBegin; 3005 idx = a->j; 3006 v = a->a; 3007 if (usecprow) { 3008 mbs = a->compressedrow.nrows; 3009 ii = a->compressedrow.i; 3010 ridx = a->compressedrow.rindex; 3011 } else { 3012 mbs = a->mbs; 3013 ii = a->i; 3014 z = c; 3015 } 3016 3017 for (i=0; i<mbs; i++) { 3018 n = ii[1] - ii[0]; ii++; 3019 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 3020 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3021 if (usecprow) z = c + 2*ridx[i]; 3022 jj = idx; 3023 vv = v; 3024 for (k=0; k<cn; k++) { 3025 idx = jj; 3026 v = vv; 3027 sum1 = 0.0; sum2 = 0.0; 3028 for (j=0; j<n; j++) { 3029 xb = b + 2*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; 3030 sum1 += v[0]*x1 + v[2]*x2; 3031 sum2 += v[1]*x1 + v[3]*x2; 3032 v += 4; 3033 } 3034 z[0+k*cm] = sum1; z[1+k*cm] = sum2; 3035 } 3036 if (!usecprow) z += 2; 3037 } 3038 PetscFunctionReturn(0); 3039 } 3040 3041 PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 3042 { 3043 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 3044 PetscScalar *z = NULL,sum1,sum2,sum3; 3045 const PetscScalar *xb; 3046 PetscScalar x1,x2,x3; 3047 const MatScalar *v,*vv; 3048 PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL; 3049 PetscBool usecprow=a->compressedrow.use; 3050 3051 PetscFunctionBegin; 3052 idx = a->j; 3053 v = a->a; 3054 if (usecprow) { 3055 mbs = a->compressedrow.nrows; 3056 ii = a->compressedrow.i; 3057 ridx = a->compressedrow.rindex; 3058 } else { 3059 mbs = a->mbs; 3060 ii = a->i; 3061 z = c; 3062 } 3063 3064 for (i=0; i<mbs; i++) { 3065 n = ii[1] - ii[0]; ii++; 3066 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 3067 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3068 if (usecprow) z = c + 3*ridx[i]; 3069 jj = idx; 3070 vv = v; 3071 for (k=0; k<cn; k++) { 3072 idx = jj; 3073 v = vv; 3074 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 3075 for (j=0; j<n; j++) { 3076 xb = b + 3*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; 3077 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 3078 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 3079 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 3080 v += 9; 3081 } 3082 z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; 3083 } 3084 if (!usecprow) z += 3; 3085 } 3086 PetscFunctionReturn(0); 3087 } 3088 3089 PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 3090 { 3091 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 3092 PetscScalar *z = NULL,sum1,sum2,sum3,sum4; 3093 const PetscScalar *xb; 3094 PetscScalar x1,x2,x3,x4; 3095 const MatScalar *v,*vv; 3096 PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL; 3097 PetscBool usecprow=a->compressedrow.use; 3098 3099 PetscFunctionBegin; 3100 idx = a->j; 3101 v = a->a; 3102 if (usecprow) { 3103 mbs = a->compressedrow.nrows; 3104 ii = a->compressedrow.i; 3105 ridx = a->compressedrow.rindex; 3106 } else { 3107 mbs = a->mbs; 3108 ii = a->i; 3109 z = c; 3110 } 3111 3112 for (i=0; i<mbs; i++) { 3113 n = ii[1] - ii[0]; ii++; 3114 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 3115 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3116 if (usecprow) z = c + 4*ridx[i]; 3117 jj = idx; 3118 vv = v; 3119 for (k=0; k<cn; k++) { 3120 idx = jj; 3121 v = vv; 3122 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; 3123 for (j=0; j<n; j++) { 3124 xb = b + 4*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm]; 3125 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3126 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3127 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3128 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3129 v += 16; 3130 } 3131 z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; z[3+k*cm] = sum4; 3132 } 3133 if (!usecprow) z += 4; 3134 } 3135 PetscFunctionReturn(0); 3136 } 3137 3138 PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 3139 { 3140 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 3141 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5; 3142 const PetscScalar *xb; 3143 PetscScalar x1,x2,x3,x4,x5; 3144 const MatScalar *v,*vv; 3145 PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL; 3146 PetscBool usecprow=a->compressedrow.use; 3147 3148 PetscFunctionBegin; 3149 idx = a->j; 3150 v = a->a; 3151 if (usecprow) { 3152 mbs = a->compressedrow.nrows; 3153 ii = a->compressedrow.i; 3154 ridx = a->compressedrow.rindex; 3155 } else { 3156 mbs = a->mbs; 3157 ii = a->i; 3158 z = c; 3159 } 3160 3161 for (i=0; i<mbs; i++) { 3162 n = ii[1] - ii[0]; ii++; 3163 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 3164 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3165 if (usecprow) z = c + 5*ridx[i]; 3166 jj = idx; 3167 vv = v; 3168 for (k=0; k<cn; k++) { 3169 idx = jj; 3170 v = vv; 3171 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 3172 for (j=0; j<n; j++) { 3173 xb = b + 5*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm]; x5 = xb[4+k*bm]; 3174 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3175 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3176 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3177 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3178 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3179 v += 25; 3180 } 3181 z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; z[3+k*cm] = sum4; z[4+k*cm] = sum5; 3182 } 3183 if (!usecprow) z += 5; 3184 } 3185 PetscFunctionReturn(0); 3186 } 3187 3188 PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A,Mat B,Mat C) 3189 { 3190 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 3191 Mat_SeqDense *bd = (Mat_SeqDense*)B->data; 3192 Mat_SeqDense *cd = (Mat_SeqDense*)C->data; 3193 PetscInt cm=cd->lda,cn=B->cmap->n,bm=bd->lda; 3194 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 3195 PetscBLASInt bbs,bcn,bbm,bcm; 3196 PetscScalar *z = NULL; 3197 PetscScalar *c,*b; 3198 const MatScalar *v; 3199 const PetscInt *idx,*ii,*ridx=NULL; 3200 PetscScalar _DZero=0.0,_DOne=1.0; 3201 PetscBool usecprow=a->compressedrow.use; 3202 PetscErrorCode ierr; 3203 3204 PetscFunctionBegin; 3205 if (!cm || !cn) PetscFunctionReturn(0); 3206 if (B->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,B->rmap->n); 3207 if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n); 3208 if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n); 3209 b = bd->v; 3210 if (a->nonzerorowcnt != A->rmap->n) { 3211 ierr = MatZeroEntries(C);CHKERRQ(ierr); 3212 } 3213 ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr); 3214 switch (bs) { 3215 case 1: 3216 ierr = MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn);CHKERRQ(ierr); 3217 break; 3218 case 2: 3219 ierr = MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn);CHKERRQ(ierr); 3220 break; 3221 case 3: 3222 ierr = MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn);CHKERRQ(ierr); 3223 break; 3224 case 4: 3225 ierr = MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn);CHKERRQ(ierr); 3226 break; 3227 case 5: 3228 ierr = MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn);CHKERRQ(ierr); 3229 break; 3230 default: /* block sizes larger than 5 by 5 are handled by BLAS */ 3231 ierr = PetscBLASIntCast(bs,&bbs);CHKERRQ(ierr); 3232 ierr = PetscBLASIntCast(cn,&bcn);CHKERRQ(ierr); 3233 ierr = PetscBLASIntCast(bm,&bbm);CHKERRQ(ierr); 3234 ierr = PetscBLASIntCast(cm,&bcm);CHKERRQ(ierr); 3235 idx = a->j; 3236 v = a->a; 3237 if (usecprow) { 3238 mbs = a->compressedrow.nrows; 3239 ii = a->compressedrow.i; 3240 ridx = a->compressedrow.rindex; 3241 } else { 3242 mbs = a->mbs; 3243 ii = a->i; 3244 z = c; 3245 } 3246 for (i=0; i<mbs; i++) { 3247 n = ii[1] - ii[0]; ii++; 3248 if (usecprow) z = c + bs*ridx[i]; 3249 if (n) { 3250 PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DZero,z,&bcm)); 3251 v += bs2; 3252 } 3253 for (j=1; j<n; j++) { 3254 PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DOne,z,&bcm)); 3255 v += bs2; 3256 } 3257 if (!usecprow) z += bs; 3258 } 3259 } 3260 ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr); 3261 ierr = PetscLogFlops((2.0*a->nz*bs2 - bs*a->nonzerorowcnt)*cn);CHKERRQ(ierr); 3262 PetscFunctionReturn(0); 3263 } 3264