1 2 #include <../src/mat/impls/baij/seq/baij.h> 3 #include <../src/mat/impls/dense/seq/dense.h> 4 #include <petsc/private/kernels/blockinvert.h> 5 #include <petscbt.h> 6 #include <petscblaslapack.h> 7 8 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 9 #include <immintrin.h> 10 #endif 11 12 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 13 { 14 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 15 PetscErrorCode ierr; 16 PetscInt row,i,j,k,l,m,n,*nidx,isz,val,ival; 17 const PetscInt *idx; 18 PetscInt start,end,*ai,*aj,bs,*nidx2; 19 PetscBT table; 20 21 PetscFunctionBegin; 22 m = a->mbs; 23 ai = a->i; 24 aj = a->j; 25 bs = A->rmap->bs; 26 27 if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 28 29 ierr = PetscBTCreate(m,&table);CHKERRQ(ierr); 30 ierr = PetscMalloc1(m+1,&nidx);CHKERRQ(ierr); 31 ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr); 32 33 for (i=0; i<is_max; i++) { 34 /* Initialise the two local arrays */ 35 isz = 0; 36 ierr = PetscBTMemzero(m,table);CHKERRQ(ierr); 37 38 /* Extract the indices, assume there can be duplicate entries */ 39 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 40 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 41 42 /* Enter these into the temp arrays i.e mark table[row], enter row into new index */ 43 for (j=0; j<n; ++j) { 44 ival = idx[j]/bs; /* convert the indices into block indices */ 45 if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 46 if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival; 47 } 48 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 49 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 50 51 k = 0; 52 for (j=0; j<ov; j++) { /* for each overlap*/ 53 n = isz; 54 for (; k<n; k++) { /* do only those rows in nidx[k], which are not done yet */ 55 row = nidx[k]; 56 start = ai[row]; 57 end = ai[row+1]; 58 for (l = start; l<end; l++) { 59 val = aj[l]; 60 if (!PetscBTLookupSet(table,val)) nidx[isz++] = val; 61 } 62 } 63 } 64 /* expand the Index Set */ 65 for (j=0; j<isz; j++) { 66 for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k; 67 } 68 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 69 } 70 ierr = PetscBTDestroy(&table);CHKERRQ(ierr); 71 ierr = PetscFree(nidx);CHKERRQ(ierr); 72 ierr = PetscFree(nidx2);CHKERRQ(ierr); 73 PetscFunctionReturn(0); 74 } 75 76 PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 77 { 78 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c; 79 PetscErrorCode ierr; 80 PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens; 81 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 82 const PetscInt *irow,*icol; 83 PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 84 PetscInt *aj = a->j,*ai = a->i; 85 MatScalar *mat_a; 86 Mat C; 87 PetscBool flag; 88 89 PetscFunctionBegin; 90 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 91 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 92 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 93 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 94 95 ierr = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr); 96 ssmap = smap; 97 ierr = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr); 98 for (i=0; i<ncols; i++) smap[icol[i]] = i+1; 99 /* determine lens of each row */ 100 for (i=0; i<nrows; i++) { 101 kstart = ai[irow[i]]; 102 kend = kstart + a->ilen[irow[i]]; 103 lens[i] = 0; 104 for (k=kstart; k<kend; k++) { 105 if (ssmap[aj[k]]) lens[i]++; 106 } 107 } 108 /* Create and fill new matrix */ 109 if (scall == MAT_REUSE_MATRIX) { 110 c = (Mat_SeqBAIJ*)((*B)->data); 111 112 if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 113 ierr = PetscArraycmp(c->ilen,lens,c->mbs,&flag);CHKERRQ(ierr); 114 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 115 ierr = PetscArrayzero(c->ilen,c->mbs);CHKERRQ(ierr); 116 C = *B; 117 } else { 118 ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr); 119 ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 120 ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 121 ierr = MatSeqBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr); 122 } 123 c = (Mat_SeqBAIJ*)(C->data); 124 for (i=0; i<nrows; i++) { 125 row = irow[i]; 126 kstart = ai[row]; 127 kend = kstart + a->ilen[row]; 128 mat_i = c->i[i]; 129 mat_j = c->j + mat_i; 130 mat_a = c->a + mat_i*bs2; 131 mat_ilen = c->ilen + i; 132 for (k=kstart; k<kend; k++) { 133 if ((tcol=ssmap[a->j[k]])) { 134 *mat_j++ = tcol - 1; 135 ierr = PetscArraycpy(mat_a,a->a+k*bs2,bs2);CHKERRQ(ierr); 136 mat_a += bs2; 137 (*mat_ilen)++; 138 } 139 } 140 } 141 /* sort */ 142 { 143 MatScalar *work; 144 ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr); 145 for (i=0; i<nrows; i++) { 146 PetscInt ilen; 147 mat_i = c->i[i]; 148 mat_j = c->j + mat_i; 149 mat_a = c->a + mat_i*bs2; 150 ilen = c->ilen[i]; 151 ierr = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr); 152 } 153 ierr = PetscFree(work);CHKERRQ(ierr); 154 } 155 156 /* Free work space */ 157 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 158 ierr = PetscFree(smap);CHKERRQ(ierr); 159 ierr = PetscFree(lens);CHKERRQ(ierr); 160 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 161 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 162 163 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 164 *B = C; 165 PetscFunctionReturn(0); 166 } 167 168 PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 169 { 170 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 171 IS is1,is2; 172 PetscErrorCode ierr; 173 PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs,j; 174 const PetscInt *irow,*icol; 175 176 PetscFunctionBegin; 177 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 178 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 179 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 180 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 181 182 /* Verify if the indices corespond to each element in a block 183 and form the IS with compressed IS */ 184 maxmnbs = PetscMax(a->mbs,a->nbs); 185 ierr = PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);CHKERRQ(ierr); 186 ierr = PetscArrayzero(vary,a->mbs);CHKERRQ(ierr); 187 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 188 for (i=0; i<a->mbs; i++) { 189 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks"); 190 } 191 count = 0; 192 for (i=0; i<nrows; i++) { 193 j = irow[i] / bs; 194 if ((vary[j]--)==bs) iary[count++] = j; 195 } 196 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr); 197 198 ierr = PetscArrayzero(vary,a->nbs);CHKERRQ(ierr); 199 for (i=0; i<ncols; i++) vary[icol[i]/bs]++; 200 for (i=0; i<a->nbs; i++) { 201 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc"); 202 } 203 count = 0; 204 for (i=0; i<ncols; i++) { 205 j = icol[i] / bs; 206 if ((vary[j]--)==bs) iary[count++] = j; 207 } 208 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr); 209 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 210 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 211 ierr = PetscFree2(vary,iary);CHKERRQ(ierr); 212 213 ierr = MatCreateSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr); 214 ierr = ISDestroy(&is1);CHKERRQ(ierr); 215 ierr = ISDestroy(&is2);CHKERRQ(ierr); 216 PetscFunctionReturn(0); 217 } 218 219 PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C) 220 { 221 PetscErrorCode ierr; 222 Mat_SeqBAIJ *c = (Mat_SeqBAIJ*)C->data; 223 Mat_SubSppt *submatj = c->submatis1; 224 225 PetscFunctionBegin; 226 ierr = (*submatj->destroy)(C);CHKERRQ(ierr); 227 ierr = MatDestroySubMatrix_Private(submatj);CHKERRQ(ierr); 228 PetscFunctionReturn(0); 229 } 230 231 PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n,Mat *mat[]) 232 { 233 PetscErrorCode ierr; 234 PetscInt i; 235 Mat C; 236 Mat_SeqBAIJ *c; 237 Mat_SubSppt *submatj; 238 239 PetscFunctionBegin; 240 for (i=0; i<n; i++) { 241 C = (*mat)[i]; 242 c = (Mat_SeqBAIJ*)C->data; 243 submatj = c->submatis1; 244 if (submatj) { 245 ierr = (*submatj->destroy)(C);CHKERRQ(ierr); 246 ierr = MatDestroySubMatrix_Private(submatj);CHKERRQ(ierr); 247 ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr); 248 ierr = PetscLayoutDestroy(&C->rmap);CHKERRQ(ierr); 249 ierr = PetscLayoutDestroy(&C->cmap);CHKERRQ(ierr); 250 ierr = PetscHeaderDestroy(&C);CHKERRQ(ierr); 251 } else { 252 ierr = MatDestroy(&C);CHKERRQ(ierr); 253 } 254 } 255 ierr = PetscFree(*mat);CHKERRQ(ierr); 256 PetscFunctionReturn(0); 257 } 258 259 PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 260 { 261 PetscErrorCode ierr; 262 PetscInt i; 263 264 PetscFunctionBegin; 265 if (scall == MAT_INITIAL_MATRIX) { 266 ierr = PetscCalloc1(n+1,B);CHKERRQ(ierr); 267 } 268 269 for (i=0; i<n; i++) { 270 ierr = MatCreateSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr); 271 } 272 PetscFunctionReturn(0); 273 } 274 275 /* -------------------------------------------------------*/ 276 /* Should check that shapes of vectors and matrices match */ 277 /* -------------------------------------------------------*/ 278 279 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz) 280 { 281 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 282 PetscScalar *z,sum; 283 const PetscScalar *x; 284 const MatScalar *v; 285 PetscErrorCode ierr; 286 PetscInt mbs,i,n; 287 const PetscInt *idx,*ii,*ridx=NULL; 288 PetscBool usecprow=a->compressedrow.use; 289 290 PetscFunctionBegin; 291 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 292 ierr = VecGetArrayWrite(zz,&z);CHKERRQ(ierr); 293 294 if (usecprow) { 295 mbs = a->compressedrow.nrows; 296 ii = a->compressedrow.i; 297 ridx = a->compressedrow.rindex; 298 ierr = PetscArrayzero(z,a->mbs);CHKERRQ(ierr); 299 } else { 300 mbs = a->mbs; 301 ii = a->i; 302 } 303 304 for (i=0; i<mbs; i++) { 305 n = ii[1] - ii[0]; 306 v = a->a + ii[0]; 307 idx = a->j + ii[0]; 308 ii++; 309 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 310 PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 311 sum = 0.0; 312 PetscSparseDensePlusDot(sum,x,v,idx,n); 313 if (usecprow) { 314 z[ridx[i]] = sum; 315 } else { 316 z[i] = sum; 317 } 318 } 319 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 320 ierr = VecRestoreArrayWrite(zz,&z);CHKERRQ(ierr); 321 ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr); 322 PetscFunctionReturn(0); 323 } 324 325 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz) 326 { 327 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 328 PetscScalar *z = NULL,sum1,sum2,*zarray; 329 const PetscScalar *x,*xb; 330 PetscScalar x1,x2; 331 const MatScalar *v; 332 PetscErrorCode ierr; 333 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 334 PetscBool usecprow=a->compressedrow.use; 335 336 PetscFunctionBegin; 337 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 338 ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 339 340 idx = a->j; 341 v = a->a; 342 if (usecprow) { 343 mbs = a->compressedrow.nrows; 344 ii = a->compressedrow.i; 345 ridx = a->compressedrow.rindex; 346 ierr = PetscArrayzero(zarray,2*a->mbs);CHKERRQ(ierr); 347 } else { 348 mbs = a->mbs; 349 ii = a->i; 350 z = zarray; 351 } 352 353 for (i=0; i<mbs; i++) { 354 n = ii[1] - ii[0]; ii++; 355 sum1 = 0.0; sum2 = 0.0; 356 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 357 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 358 for (j=0; j<n; j++) { 359 xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1]; 360 sum1 += v[0]*x1 + v[2]*x2; 361 sum2 += v[1]*x1 + v[3]*x2; 362 v += 4; 363 } 364 if (usecprow) z = zarray + 2*ridx[i]; 365 z[0] = sum1; z[1] = sum2; 366 if (!usecprow) z += 2; 367 } 368 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 369 ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 370 ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr); 371 PetscFunctionReturn(0); 372 } 373 374 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz) 375 { 376 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 377 PetscScalar *z = NULL,sum1,sum2,sum3,x1,x2,x3,*zarray; 378 const PetscScalar *x,*xb; 379 const MatScalar *v; 380 PetscErrorCode ierr; 381 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 382 PetscBool usecprow=a->compressedrow.use; 383 384 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 385 #pragma disjoint(*v,*z,*xb) 386 #endif 387 388 PetscFunctionBegin; 389 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 390 ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 391 392 idx = a->j; 393 v = a->a; 394 if (usecprow) { 395 mbs = a->compressedrow.nrows; 396 ii = a->compressedrow.i; 397 ridx = a->compressedrow.rindex; 398 ierr = PetscArrayzero(zarray,3*a->mbs);CHKERRQ(ierr); 399 } else { 400 mbs = a->mbs; 401 ii = a->i; 402 z = zarray; 403 } 404 405 for (i=0; i<mbs; i++) { 406 n = ii[1] - ii[0]; ii++; 407 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 408 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 409 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 410 for (j=0; j<n; j++) { 411 xb = x + 3*(*idx++); 412 x1 = xb[0]; 413 x2 = xb[1]; 414 x3 = xb[2]; 415 416 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 417 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 418 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 419 v += 9; 420 } 421 if (usecprow) z = zarray + 3*ridx[i]; 422 z[0] = sum1; z[1] = sum2; z[2] = sum3; 423 if (!usecprow) z += 3; 424 } 425 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 426 ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 427 ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr); 428 PetscFunctionReturn(0); 429 } 430 431 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz) 432 { 433 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 434 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray; 435 const PetscScalar *x,*xb; 436 const MatScalar *v; 437 PetscErrorCode ierr; 438 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 439 PetscBool usecprow=a->compressedrow.use; 440 441 PetscFunctionBegin; 442 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 443 ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 444 445 idx = a->j; 446 v = a->a; 447 if (usecprow) { 448 mbs = a->compressedrow.nrows; 449 ii = a->compressedrow.i; 450 ridx = a->compressedrow.rindex; 451 ierr = PetscArrayzero(zarray,4*a->mbs);CHKERRQ(ierr); 452 } else { 453 mbs = a->mbs; 454 ii = a->i; 455 z = zarray; 456 } 457 458 for (i=0; i<mbs; i++) { 459 n = ii[1] - ii[0]; 460 ii++; 461 sum1 = 0.0; 462 sum2 = 0.0; 463 sum3 = 0.0; 464 sum4 = 0.0; 465 466 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 467 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 468 for (j=0; j<n; j++) { 469 xb = x + 4*(*idx++); 470 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 471 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 472 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 473 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 474 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 475 v += 16; 476 } 477 if (usecprow) z = zarray + 4*ridx[i]; 478 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 479 if (!usecprow) z += 4; 480 } 481 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 482 ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 483 ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr); 484 PetscFunctionReturn(0); 485 } 486 487 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz) 488 { 489 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 490 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*zarray; 491 const PetscScalar *xb,*x; 492 const MatScalar *v; 493 PetscErrorCode ierr; 494 const PetscInt *idx,*ii,*ridx=NULL; 495 PetscInt mbs,i,j,n; 496 PetscBool usecprow=a->compressedrow.use; 497 498 PetscFunctionBegin; 499 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 500 ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 501 502 idx = a->j; 503 v = a->a; 504 if (usecprow) { 505 mbs = a->compressedrow.nrows; 506 ii = a->compressedrow.i; 507 ridx = a->compressedrow.rindex; 508 ierr = PetscArrayzero(zarray,5*a->mbs);CHKERRQ(ierr); 509 } else { 510 mbs = a->mbs; 511 ii = a->i; 512 z = zarray; 513 } 514 515 for (i=0; i<mbs; i++) { 516 n = ii[1] - ii[0]; ii++; 517 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 518 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 519 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 520 for (j=0; j<n; j++) { 521 xb = x + 5*(*idx++); 522 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 523 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 524 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 525 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 526 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 527 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 528 v += 25; 529 } 530 if (usecprow) z = zarray + 5*ridx[i]; 531 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 532 if (!usecprow) z += 5; 533 } 534 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 535 ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 536 ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr); 537 PetscFunctionReturn(0); 538 } 539 540 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz) 541 { 542 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 543 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6; 544 const PetscScalar *x,*xb; 545 PetscScalar x1,x2,x3,x4,x5,x6,*zarray; 546 const MatScalar *v; 547 PetscErrorCode ierr; 548 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 549 PetscBool usecprow=a->compressedrow.use; 550 551 PetscFunctionBegin; 552 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 553 ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 554 555 idx = a->j; 556 v = a->a; 557 if (usecprow) { 558 mbs = a->compressedrow.nrows; 559 ii = a->compressedrow.i; 560 ridx = a->compressedrow.rindex; 561 ierr = PetscArrayzero(zarray,6*a->mbs);CHKERRQ(ierr); 562 } else { 563 mbs = a->mbs; 564 ii = a->i; 565 z = zarray; 566 } 567 568 for (i=0; i<mbs; i++) { 569 n = ii[1] - ii[0]; 570 ii++; 571 sum1 = 0.0; 572 sum2 = 0.0; 573 sum3 = 0.0; 574 sum4 = 0.0; 575 sum5 = 0.0; 576 sum6 = 0.0; 577 578 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 579 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 580 for (j=0; j<n; j++) { 581 xb = x + 6*(*idx++); 582 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 583 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 584 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 585 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 586 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 587 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 588 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 589 v += 36; 590 } 591 if (usecprow) z = zarray + 6*ridx[i]; 592 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 593 if (!usecprow) z += 6; 594 } 595 596 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 597 ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 598 ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr); 599 PetscFunctionReturn(0); 600 } 601 602 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz) 603 { 604 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 605 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 606 const PetscScalar *x,*xb; 607 PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray; 608 const MatScalar *v; 609 PetscErrorCode ierr; 610 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 611 PetscBool usecprow=a->compressedrow.use; 612 613 PetscFunctionBegin; 614 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 615 ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 616 617 idx = a->j; 618 v = a->a; 619 if (usecprow) { 620 mbs = a->compressedrow.nrows; 621 ii = a->compressedrow.i; 622 ridx = a->compressedrow.rindex; 623 ierr = PetscArrayzero(zarray,7*a->mbs);CHKERRQ(ierr); 624 } else { 625 mbs = a->mbs; 626 ii = a->i; 627 z = zarray; 628 } 629 630 for (i=0; i<mbs; i++) { 631 n = ii[1] - ii[0]; 632 ii++; 633 sum1 = 0.0; 634 sum2 = 0.0; 635 sum3 = 0.0; 636 sum4 = 0.0; 637 sum5 = 0.0; 638 sum6 = 0.0; 639 sum7 = 0.0; 640 641 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 642 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 643 for (j=0; j<n; j++) { 644 xb = x + 7*(*idx++); 645 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 646 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 647 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 648 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 649 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 650 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 651 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 652 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 653 v += 49; 654 } 655 if (usecprow) z = zarray + 7*ridx[i]; 656 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 657 if (!usecprow) z += 7; 658 } 659 660 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 661 ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 662 ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr); 663 PetscFunctionReturn(0); 664 } 665 666 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 667 PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec zz) 668 { 669 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 670 PetscScalar *z = NULL,*work,*workt,*zarray; 671 const PetscScalar *x,*xb; 672 const MatScalar *v; 673 PetscErrorCode ierr; 674 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 675 const PetscInt *idx,*ii,*ridx=NULL; 676 PetscInt k; 677 PetscBool usecprow=a->compressedrow.use; 678 679 __m256d a0,a1,a2,a3,a4,a5; 680 __m256d w0,w1,w2,w3; 681 __m256d z0,z1,z2; 682 __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63); 683 684 PetscFunctionBegin; 685 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 686 ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 687 688 idx = a->j; 689 v = a->a; 690 if (usecprow) { 691 mbs = a->compressedrow.nrows; 692 ii = a->compressedrow.i; 693 ridx = a->compressedrow.rindex; 694 ierr = PetscArrayzero(zarray,bs*a->mbs);CHKERRQ(ierr); 695 } else { 696 mbs = a->mbs; 697 ii = a->i; 698 z = zarray; 699 } 700 701 if (!a->mult_work) { 702 k = PetscMax(A->rmap->n,A->cmap->n); 703 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 704 } 705 706 work = a->mult_work; 707 for (i=0; i<mbs; i++) { 708 n = ii[1] - ii[0]; ii++; 709 workt = work; 710 for (j=0; j<n; j++) { 711 xb = x + bs*(*idx++); 712 for (k=0; k<bs; k++) workt[k] = xb[k]; 713 workt += bs; 714 } 715 if (usecprow) z = zarray + bs*ridx[i]; 716 717 z0 = _mm256_setzero_pd(); z1 = _mm256_setzero_pd(); z2 = _mm256_setzero_pd(); 718 719 for (j=0; j<n; j++) { 720 /* first column of a */ 721 w0 = _mm256_set1_pd(work[j*9 ]); 722 a0 = _mm256_loadu_pd(&v[j*81 ]); z0 = _mm256_fmadd_pd(a0,w0,z0); 723 a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1); 724 a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2); 725 726 /* second column of a */ 727 w1 = _mm256_set1_pd(work[j*9+ 1]); 728 a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0); 729 a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1); 730 a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2); 731 732 /* third column of a */ 733 w2 = _mm256_set1_pd(work[j*9 +2]); 734 a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0); 735 a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1); 736 a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2); 737 738 /* fourth column of a */ 739 w3 = _mm256_set1_pd(work[j*9+ 3]); 740 a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0); 741 a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1); 742 a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2); 743 744 /* fifth column of a */ 745 w0 = _mm256_set1_pd(work[j*9+ 4]); 746 a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0); 747 a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1); 748 a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2); 749 750 /* sixth column of a */ 751 w1 = _mm256_set1_pd(work[j*9+ 5]); 752 a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0); 753 a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1); 754 a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2); 755 756 /* seventh column of a */ 757 w2 = _mm256_set1_pd(work[j*9+ 6]); 758 a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0); 759 a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1); 760 a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2); 761 762 /* eigth column of a */ 763 w3 = _mm256_set1_pd(work[j*9+ 7]); 764 a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0); 765 a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1); 766 a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2); 767 768 /* ninth column of a */ 769 w0 = _mm256_set1_pd(work[j*9+ 8]); 770 a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0); 771 a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1); 772 a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2); 773 } 774 775 _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2); 776 777 v += n*bs2; 778 if (!usecprow) z += bs; 779 } 780 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 781 ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 782 ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr); 783 PetscFunctionReturn(0); 784 } 785 #endif 786 787 PetscErrorCode MatMult_SeqBAIJ_11(Mat A,Vec xx,Vec zz) 788 { 789 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 790 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11; 791 const PetscScalar *x,*xb; 792 PetscScalar *zarray,xv; 793 const MatScalar *v; 794 PetscErrorCode ierr; 795 const PetscInt *ii,*ij=a->j,*idx; 796 PetscInt mbs,i,j,k,n,*ridx=NULL; 797 PetscBool usecprow=a->compressedrow.use; 798 799 PetscFunctionBegin; 800 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 801 ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 802 803 v = a->a; 804 if (usecprow) { 805 mbs = a->compressedrow.nrows; 806 ii = a->compressedrow.i; 807 ridx = a->compressedrow.rindex; 808 ierr = PetscArrayzero(zarray,11*a->mbs);CHKERRQ(ierr); 809 } else { 810 mbs = a->mbs; 811 ii = a->i; 812 z = zarray; 813 } 814 815 for (i=0; i<mbs; i++) { 816 n = ii[i+1] - ii[i]; 817 idx = ij + ii[i]; 818 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 819 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; 820 821 for (j=0; j<n; j++) { 822 xb = x + 11*(idx[j]); 823 824 for (k=0; k<11; k++) { 825 xv = xb[k]; 826 sum1 += v[0]*xv; 827 sum2 += v[1]*xv; 828 sum3 += v[2]*xv; 829 sum4 += v[3]*xv; 830 sum5 += v[4]*xv; 831 sum6 += v[5]*xv; 832 sum7 += v[6]*xv; 833 sum8 += v[7]*xv; 834 sum9 += v[8]*xv; 835 sum10 += v[9]*xv; 836 sum11 += v[10]*xv; 837 v += 11; 838 } 839 } 840 if (usecprow) z = zarray + 11*ridx[i]; 841 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 842 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; 843 844 if (!usecprow) z += 11; 845 } 846 847 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 848 ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 849 ierr = PetscLogFlops(242.0*a->nz - 11.0*a->nonzerorowcnt);CHKERRQ(ierr); 850 PetscFunctionReturn(0); 851 } 852 853 /* MatMult_SeqBAIJ_12 version 1: Columns in the block are accessed one at a time */ 854 PetscErrorCode MatMult_SeqBAIJ_12_ver1(Mat A,Vec xx,Vec zz) 855 { 856 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 857 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12; 858 const PetscScalar *x,*xb; 859 PetscScalar *zarray,xv; 860 const MatScalar *v; 861 PetscErrorCode ierr; 862 const PetscInt *ii,*ij=a->j,*idx; 863 PetscInt mbs,i,j,k,n,*ridx=NULL; 864 PetscBool usecprow=a->compressedrow.use; 865 866 PetscFunctionBegin; 867 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 868 ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 869 870 v = a->a; 871 if (usecprow) { 872 mbs = a->compressedrow.nrows; 873 ii = a->compressedrow.i; 874 ridx = a->compressedrow.rindex; 875 ierr = PetscArrayzero(zarray,12*a->mbs);CHKERRQ(ierr); 876 } else { 877 mbs = a->mbs; 878 ii = a->i; 879 z = zarray; 880 } 881 882 for (i=0; i<mbs; i++) { 883 n = ii[i+1] - ii[i]; 884 idx = ij + ii[i]; 885 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 886 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; 887 888 for (j=0; j<n; j++) { 889 xb = x + 12*(idx[j]); 890 891 for (k=0; k<12; k++) { 892 xv = xb[k]; 893 sum1 += v[0]*xv; 894 sum2 += v[1]*xv; 895 sum3 += v[2]*xv; 896 sum4 += v[3]*xv; 897 sum5 += v[4]*xv; 898 sum6 += v[5]*xv; 899 sum7 += v[6]*xv; 900 sum8 += v[7]*xv; 901 sum9 += v[8]*xv; 902 sum10 += v[9]*xv; 903 sum11 += v[10]*xv; 904 sum12 += v[11]*xv; 905 v += 12; 906 } 907 } 908 if (usecprow) z = zarray + 12*ridx[i]; 909 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 910 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; 911 if (!usecprow) z += 12; 912 } 913 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 914 ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 915 ierr = PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt);CHKERRQ(ierr); 916 PetscFunctionReturn(0); 917 } 918 919 PetscErrorCode MatMultAdd_SeqBAIJ_12_ver1(Mat A,Vec xx,Vec yy,Vec zz) 920 { 921 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 922 PetscScalar *z = NULL,*y = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12; 923 const PetscScalar *x,*xb; 924 PetscScalar *zarray,*yarray,xv; 925 const MatScalar *v; 926 PetscErrorCode ierr; 927 const PetscInt *ii,*ij=a->j,*idx; 928 PetscInt mbs = a->mbs,i,j,k,n,*ridx=NULL; 929 PetscBool usecprow=a->compressedrow.use; 930 931 PetscFunctionBegin; 932 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 933 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 934 935 v = a->a; 936 if (usecprow) { 937 if (zz != yy) { 938 ierr = PetscArraycpy(zarray,yarray,12*mbs);CHKERRQ(ierr); 939 } 940 mbs = a->compressedrow.nrows; 941 ii = a->compressedrow.i; 942 ridx = a->compressedrow.rindex; 943 } else { 944 ii = a->i; 945 y = yarray; 946 z = zarray; 947 } 948 949 for (i=0; i<mbs; i++) { 950 n = ii[i+1] - ii[i]; 951 idx = ij + ii[i]; 952 953 if (usecprow) { 954 y = yarray + 12*ridx[i]; 955 z = zarray + 12*ridx[i]; 956 } 957 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 958 sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10]; sum12 = y[11]; 959 960 for (j=0; j<n; j++) { 961 xb = x + 12*(idx[j]); 962 963 for (k=0; k<12; k++) { 964 xv = xb[k]; 965 sum1 += v[0]*xv; 966 sum2 += v[1]*xv; 967 sum3 += v[2]*xv; 968 sum4 += v[3]*xv; 969 sum5 += v[4]*xv; 970 sum6 += v[5]*xv; 971 sum7 += v[6]*xv; 972 sum8 += v[7]*xv; 973 sum9 += v[8]*xv; 974 sum10 += v[9]*xv; 975 sum11 += v[10]*xv; 976 sum12 += v[11]*xv; 977 v += 12; 978 } 979 } 980 981 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 982 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; 983 if (!usecprow) { 984 y += 12; 985 z += 12; 986 } 987 } 988 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 989 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 990 ierr = PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt);CHKERRQ(ierr); 991 PetscFunctionReturn(0); 992 } 993 994 /* MatMult_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */ 995 PetscErrorCode MatMult_SeqBAIJ_12_ver2(Mat A,Vec xx,Vec zz) 996 { 997 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 998 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12; 999 const PetscScalar *x,*xb; 1000 PetscScalar x1,x2,x3,x4,*zarray; 1001 const MatScalar *v; 1002 PetscErrorCode ierr; 1003 const PetscInt *ii,*ij=a->j,*idx,*ridx=NULL; 1004 PetscInt mbs,i,j,n; 1005 PetscBool usecprow=a->compressedrow.use; 1006 1007 PetscFunctionBegin; 1008 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1009 ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 1010 1011 v = a->a; 1012 if (usecprow) { 1013 mbs = a->compressedrow.nrows; 1014 ii = a->compressedrow.i; 1015 ridx = a->compressedrow.rindex; 1016 ierr = PetscArrayzero(zarray,12*a->mbs);CHKERRQ(ierr); 1017 } else { 1018 mbs = a->mbs; 1019 ii = a->i; 1020 z = zarray; 1021 } 1022 1023 for (i=0; i<mbs; i++) { 1024 n = ii[i+1] - ii[i]; 1025 idx = ij + ii[i]; 1026 1027 sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = sum8 = sum9 = sum10 = sum11 = sum12 = 0; 1028 for (j=0; j<n; j++) { 1029 xb = x + 12*(idx[j]); 1030 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1031 1032 sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4; 1033 sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4; 1034 sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4; 1035 sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4; 1036 sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4; 1037 sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4; 1038 sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4; 1039 sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4; 1040 sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4; 1041 sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4; 1042 sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4; 1043 sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4; 1044 v += 48; 1045 1046 x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7]; 1047 1048 sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4; 1049 sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4; 1050 sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4; 1051 sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4; 1052 sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4; 1053 sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4; 1054 sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4; 1055 sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4; 1056 sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4; 1057 sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4; 1058 sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4; 1059 sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4; 1060 v += 48; 1061 1062 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; 1063 sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4; 1064 sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4; 1065 sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4; 1066 sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4; 1067 sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4; 1068 sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4; 1069 sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4; 1070 sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4; 1071 sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4; 1072 sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4; 1073 sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4; 1074 sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4; 1075 v += 48; 1076 1077 } 1078 if (usecprow) z = zarray + 12*ridx[i]; 1079 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1080 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; 1081 if (!usecprow) z += 12; 1082 } 1083 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1084 ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 1085 ierr = PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt);CHKERRQ(ierr); 1086 PetscFunctionReturn(0); 1087 } 1088 1089 /* MatMultAdd_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */ 1090 PetscErrorCode MatMultAdd_SeqBAIJ_12_ver2(Mat A,Vec xx,Vec yy,Vec zz) 1091 { 1092 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1093 PetscScalar *z = NULL,*y = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12; 1094 const PetscScalar *x,*xb; 1095 PetscScalar x1,x2,x3,x4,*zarray,*yarray; 1096 const MatScalar *v; 1097 PetscErrorCode ierr; 1098 const PetscInt *ii,*ij=a->j,*idx,*ridx=NULL; 1099 PetscInt mbs = a->mbs,i,j,n; 1100 PetscBool usecprow=a->compressedrow.use; 1101 1102 PetscFunctionBegin; 1103 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1104 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1105 1106 v = a->a; 1107 if (usecprow) { 1108 if (zz != yy) { 1109 ierr = PetscArraycpy(zarray,yarray,12*mbs);CHKERRQ(ierr); 1110 } 1111 mbs = a->compressedrow.nrows; 1112 ii = a->compressedrow.i; 1113 ridx = a->compressedrow.rindex; 1114 } else { 1115 ii = a->i; 1116 y = yarray; 1117 z = zarray; 1118 } 1119 1120 for (i=0; i<mbs; i++) { 1121 n = ii[i+1] - ii[i]; 1122 idx = ij + ii[i]; 1123 1124 if (usecprow) { 1125 y = yarray + 12*ridx[i]; 1126 z = zarray + 12*ridx[i]; 1127 } 1128 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 1129 sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10]; sum12 = y[11]; 1130 1131 for (j=0; j<n; j++) { 1132 xb = x + 12*(idx[j]); 1133 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1134 1135 sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4; 1136 sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4; 1137 sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4; 1138 sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4; 1139 sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4; 1140 sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4; 1141 sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4; 1142 sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4; 1143 sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4; 1144 sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4; 1145 sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4; 1146 sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4; 1147 v += 48; 1148 1149 x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7]; 1150 1151 sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4; 1152 sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4; 1153 sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4; 1154 sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4; 1155 sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4; 1156 sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4; 1157 sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4; 1158 sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4; 1159 sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4; 1160 sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4; 1161 sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4; 1162 sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4; 1163 v += 48; 1164 1165 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; 1166 sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4; 1167 sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4; 1168 sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4; 1169 sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4; 1170 sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4; 1171 sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4; 1172 sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4; 1173 sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4; 1174 sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4; 1175 sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4; 1176 sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4; 1177 sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4; 1178 v += 48; 1179 1180 } 1181 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1182 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; 1183 if (!usecprow) { 1184 y += 12; 1185 z += 12; 1186 } 1187 } 1188 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1189 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1190 ierr = PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt);CHKERRQ(ierr); 1191 PetscFunctionReturn(0); 1192 } 1193 1194 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 1195 PetscErrorCode MatMult_SeqBAIJ_12_AVX2(Mat A,Vec xx,Vec zz) 1196 { 1197 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1198 PetscScalar *z = NULL,*zarray; 1199 const PetscScalar *x,*work; 1200 const MatScalar *v = a->a; 1201 PetscErrorCode ierr; 1202 PetscInt mbs,i,j,n; 1203 const PetscInt *idx = a->j,*ii,*ridx=NULL; 1204 PetscBool usecprow=a->compressedrow.use; 1205 const PetscInt bs = 12, bs2 = 144; 1206 1207 __m256d a0,a1,a2,a3,a4,a5; 1208 __m256d w0,w1,w2,w3; 1209 __m256d z0,z1,z2; 1210 1211 PetscFunctionBegin; 1212 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1213 ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 1214 1215 if (usecprow) { 1216 mbs = a->compressedrow.nrows; 1217 ii = a->compressedrow.i; 1218 ridx = a->compressedrow.rindex; 1219 ierr = PetscArrayzero(zarray,bs*a->mbs);CHKERRQ(ierr); 1220 } else { 1221 mbs = a->mbs; 1222 ii = a->i; 1223 z = zarray; 1224 } 1225 1226 for (i=0; i<mbs; i++) { 1227 z0 = _mm256_setzero_pd(); z1 = _mm256_setzero_pd(); z2 = _mm256_setzero_pd(); 1228 1229 n = ii[1] - ii[0]; ii++; 1230 for (j=0; j<n; j++) { 1231 work = x + bs*(*idx++); 1232 1233 /* first column of a */ 1234 w0 = _mm256_set1_pd(work[0]); 1235 a0 = _mm256_loadu_pd(v+0); z0 = _mm256_fmadd_pd(a0,w0,z0); 1236 a1 = _mm256_loadu_pd(v+4); z1 = _mm256_fmadd_pd(a1,w0,z1); 1237 a2 = _mm256_loadu_pd(v+8); z2 = _mm256_fmadd_pd(a2,w0,z2); 1238 1239 /* second column of a */ 1240 w1 = _mm256_set1_pd(work[1]); 1241 a3 = _mm256_loadu_pd(v+12); z0 = _mm256_fmadd_pd(a3,w1,z0); 1242 a4 = _mm256_loadu_pd(v+16); z1 = _mm256_fmadd_pd(a4,w1,z1); 1243 a5 = _mm256_loadu_pd(v+20); z2 = _mm256_fmadd_pd(a5,w1,z2); 1244 1245 /* third column of a */ 1246 w2 = _mm256_set1_pd(work[2]); 1247 a0 = _mm256_loadu_pd(v+24); z0 = _mm256_fmadd_pd(a0,w2,z0); 1248 a1 = _mm256_loadu_pd(v+28); z1 = _mm256_fmadd_pd(a1,w2,z1); 1249 a2 = _mm256_loadu_pd(v+32); z2 = _mm256_fmadd_pd(a2,w2,z2); 1250 1251 /* fourth column of a */ 1252 w3 = _mm256_set1_pd(work[3]); 1253 a3 = _mm256_loadu_pd(v+36); z0 = _mm256_fmadd_pd(a3,w3,z0); 1254 a4 = _mm256_loadu_pd(v+40); z1 = _mm256_fmadd_pd(a4,w3,z1); 1255 a5 = _mm256_loadu_pd(v+44); z2 = _mm256_fmadd_pd(a5,w3,z2); 1256 1257 /* fifth column of a */ 1258 w0 = _mm256_set1_pd(work[4]); 1259 a0 = _mm256_loadu_pd(v+48); z0 = _mm256_fmadd_pd(a0,w0,z0); 1260 a1 = _mm256_loadu_pd(v+52); z1 = _mm256_fmadd_pd(a1,w0,z1); 1261 a2 = _mm256_loadu_pd(v+56); z2 = _mm256_fmadd_pd(a2,w0,z2); 1262 1263 /* sixth column of a */ 1264 w1 = _mm256_set1_pd(work[5]); 1265 a3 = _mm256_loadu_pd(v+60); z0 = _mm256_fmadd_pd(a3,w1,z0); 1266 a4 = _mm256_loadu_pd(v+64); z1 = _mm256_fmadd_pd(a4,w1,z1); 1267 a5 = _mm256_loadu_pd(v+68); z2 = _mm256_fmadd_pd(a5,w1,z2); 1268 1269 /* seventh column of a */ 1270 w2 = _mm256_set1_pd(work[6]); 1271 a0 = _mm256_loadu_pd(v+72); z0 = _mm256_fmadd_pd(a0,w2,z0); 1272 a1 = _mm256_loadu_pd(v+76); z1 = _mm256_fmadd_pd(a1,w2,z1); 1273 a2 = _mm256_loadu_pd(v+80); z2 = _mm256_fmadd_pd(a2,w2,z2); 1274 1275 /* eigth column of a */ 1276 w3 = _mm256_set1_pd(work[7]); 1277 a3 = _mm256_loadu_pd(v+84); z0 = _mm256_fmadd_pd(a3,w3,z0); 1278 a4 = _mm256_loadu_pd(v+88); z1 = _mm256_fmadd_pd(a4,w3,z1); 1279 a5 = _mm256_loadu_pd(v+92); z2 = _mm256_fmadd_pd(a5,w3,z2); 1280 1281 /* ninth column of a */ 1282 w0 = _mm256_set1_pd(work[8]); 1283 a0 = _mm256_loadu_pd(v+96); z0 = _mm256_fmadd_pd(a0,w0,z0); 1284 a1 = _mm256_loadu_pd(v+100); z1 = _mm256_fmadd_pd(a1,w0,z1); 1285 a2 = _mm256_loadu_pd(v+104); z2 = _mm256_fmadd_pd(a2,w0,z2); 1286 1287 /* tenth column of a */ 1288 w1 = _mm256_set1_pd(work[9]); 1289 a3 = _mm256_loadu_pd(v+108); z0 = _mm256_fmadd_pd(a3,w1,z0); 1290 a4 = _mm256_loadu_pd(v+112); z1 = _mm256_fmadd_pd(a4,w1,z1); 1291 a5 = _mm256_loadu_pd(v+116); z2 = _mm256_fmadd_pd(a5,w1,z2); 1292 1293 /* eleventh column of a */ 1294 w2 = _mm256_set1_pd(work[10]); 1295 a0 = _mm256_loadu_pd(v+120); z0 = _mm256_fmadd_pd(a0,w2,z0); 1296 a1 = _mm256_loadu_pd(v+124); z1 = _mm256_fmadd_pd(a1,w2,z1); 1297 a2 = _mm256_loadu_pd(v+128); z2 = _mm256_fmadd_pd(a2,w2,z2); 1298 1299 /* twelveth column of a */ 1300 w3 = _mm256_set1_pd(work[11]); 1301 a3 = _mm256_loadu_pd(v+132); z0 = _mm256_fmadd_pd(a3,w3,z0); 1302 a4 = _mm256_loadu_pd(v+136); z1 = _mm256_fmadd_pd(a4,w3,z1); 1303 a5 = _mm256_loadu_pd(v+140); z2 = _mm256_fmadd_pd(a5,w3,z2); 1304 1305 v += bs2; 1306 } 1307 if (usecprow) z = zarray + bs*ridx[i]; 1308 _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_storeu_pd(&z[ 8], z2); 1309 if (!usecprow) z += bs; 1310 } 1311 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1312 ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 1313 ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr); 1314 PetscFunctionReturn(0); 1315 } 1316 #endif 1317 1318 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */ 1319 /* Default MatMult for block size 15 */ 1320 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz) 1321 { 1322 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1323 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1324 const PetscScalar *x,*xb; 1325 PetscScalar *zarray,xv; 1326 const MatScalar *v; 1327 PetscErrorCode ierr; 1328 const PetscInt *ii,*ij=a->j,*idx; 1329 PetscInt mbs,i,j,k,n,*ridx=NULL; 1330 PetscBool usecprow=a->compressedrow.use; 1331 1332 PetscFunctionBegin; 1333 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1334 ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 1335 1336 v = a->a; 1337 if (usecprow) { 1338 mbs = a->compressedrow.nrows; 1339 ii = a->compressedrow.i; 1340 ridx = a->compressedrow.rindex; 1341 ierr = PetscArrayzero(zarray,15*a->mbs);CHKERRQ(ierr); 1342 } else { 1343 mbs = a->mbs; 1344 ii = a->i; 1345 z = zarray; 1346 } 1347 1348 for (i=0; i<mbs; i++) { 1349 n = ii[i+1] - ii[i]; 1350 idx = ij + ii[i]; 1351 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1352 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1353 1354 for (j=0; j<n; j++) { 1355 xb = x + 15*(idx[j]); 1356 1357 for (k=0; k<15; k++) { 1358 xv = xb[k]; 1359 sum1 += v[0]*xv; 1360 sum2 += v[1]*xv; 1361 sum3 += v[2]*xv; 1362 sum4 += v[3]*xv; 1363 sum5 += v[4]*xv; 1364 sum6 += v[5]*xv; 1365 sum7 += v[6]*xv; 1366 sum8 += v[7]*xv; 1367 sum9 += v[8]*xv; 1368 sum10 += v[9]*xv; 1369 sum11 += v[10]*xv; 1370 sum12 += v[11]*xv; 1371 sum13 += v[12]*xv; 1372 sum14 += v[13]*xv; 1373 sum15 += v[14]*xv; 1374 v += 15; 1375 } 1376 } 1377 if (usecprow) z = zarray + 15*ridx[i]; 1378 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1379 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1380 1381 if (!usecprow) z += 15; 1382 } 1383 1384 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1385 ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 1386 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 1387 PetscFunctionReturn(0); 1388 } 1389 1390 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */ 1391 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz) 1392 { 1393 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1394 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1395 const PetscScalar *x,*xb; 1396 PetscScalar x1,x2,x3,x4,*zarray; 1397 const MatScalar *v; 1398 PetscErrorCode ierr; 1399 const PetscInt *ii,*ij=a->j,*idx; 1400 PetscInt mbs,i,j,n,*ridx=NULL; 1401 PetscBool usecprow=a->compressedrow.use; 1402 1403 PetscFunctionBegin; 1404 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1405 ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 1406 1407 v = a->a; 1408 if (usecprow) { 1409 mbs = a->compressedrow.nrows; 1410 ii = a->compressedrow.i; 1411 ridx = a->compressedrow.rindex; 1412 ierr = PetscArrayzero(zarray,15*a->mbs);CHKERRQ(ierr); 1413 } else { 1414 mbs = a->mbs; 1415 ii = a->i; 1416 z = zarray; 1417 } 1418 1419 for (i=0; i<mbs; i++) { 1420 n = ii[i+1] - ii[i]; 1421 idx = ij + ii[i]; 1422 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1423 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1424 1425 for (j=0; j<n; j++) { 1426 xb = x + 15*(idx[j]); 1427 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1428 1429 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 1430 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 1431 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 1432 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 1433 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 1434 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 1435 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 1436 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 1437 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 1438 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 1439 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 1440 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 1441 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 1442 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 1443 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 1444 1445 v += 60; 1446 1447 x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7]; 1448 1449 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 1450 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 1451 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 1452 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 1453 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 1454 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 1455 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 1456 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 1457 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 1458 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 1459 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 1460 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 1461 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 1462 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 1463 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 1464 v += 60; 1465 1466 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; 1467 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 1468 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 1469 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 1470 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 1471 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 1472 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 1473 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 1474 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 1475 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 1476 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 1477 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 1478 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 1479 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 1480 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 1481 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 1482 v += 60; 1483 1484 x1 = xb[12]; x2 = xb[13]; x3 = xb[14]; 1485 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3; 1486 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3; 1487 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3; 1488 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3; 1489 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3; 1490 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3; 1491 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3; 1492 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3; 1493 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3; 1494 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3; 1495 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3; 1496 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3; 1497 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3; 1498 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3; 1499 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3; 1500 v += 45; 1501 } 1502 if (usecprow) z = zarray + 15*ridx[i]; 1503 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1504 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1505 1506 if (!usecprow) z += 15; 1507 } 1508 1509 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1510 ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 1511 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 1512 PetscFunctionReturn(0); 1513 } 1514 1515 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */ 1516 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz) 1517 { 1518 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1519 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1520 const PetscScalar *x,*xb; 1521 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,*zarray; 1522 const MatScalar *v; 1523 PetscErrorCode ierr; 1524 const PetscInt *ii,*ij=a->j,*idx; 1525 PetscInt mbs,i,j,n,*ridx=NULL; 1526 PetscBool usecprow=a->compressedrow.use; 1527 1528 PetscFunctionBegin; 1529 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1530 ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 1531 1532 v = a->a; 1533 if (usecprow) { 1534 mbs = a->compressedrow.nrows; 1535 ii = a->compressedrow.i; 1536 ridx = a->compressedrow.rindex; 1537 ierr = PetscArrayzero(zarray,15*a->mbs);CHKERRQ(ierr); 1538 } else { 1539 mbs = a->mbs; 1540 ii = a->i; 1541 z = zarray; 1542 } 1543 1544 for (i=0; i<mbs; i++) { 1545 n = ii[i+1] - ii[i]; 1546 idx = ij + ii[i]; 1547 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1548 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1549 1550 for (j=0; j<n; j++) { 1551 xb = x + 15*(idx[j]); 1552 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1553 x8 = xb[7]; 1554 1555 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8; 1556 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8; 1557 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8; 1558 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8; 1559 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8; 1560 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8; 1561 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8; 1562 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8; 1563 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8; 1564 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8; 1565 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8; 1566 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8; 1567 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8; 1568 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8; 1569 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8; 1570 v += 120; 1571 1572 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14]; 1573 1574 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7; 1575 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7; 1576 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7; 1577 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7; 1578 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7; 1579 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7; 1580 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7; 1581 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7; 1582 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7; 1583 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7; 1584 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7; 1585 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7; 1586 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7; 1587 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7; 1588 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7; 1589 v += 105; 1590 } 1591 if (usecprow) z = zarray + 15*ridx[i]; 1592 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1593 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1594 1595 if (!usecprow) z += 15; 1596 } 1597 1598 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1599 ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 1600 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 1601 PetscFunctionReturn(0); 1602 } 1603 1604 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */ 1605 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz) 1606 { 1607 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1608 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1609 const PetscScalar *x,*xb; 1610 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray; 1611 const MatScalar *v; 1612 PetscErrorCode ierr; 1613 const PetscInt *ii,*ij=a->j,*idx; 1614 PetscInt mbs,i,j,n,*ridx=NULL; 1615 PetscBool usecprow=a->compressedrow.use; 1616 1617 PetscFunctionBegin; 1618 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1619 ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 1620 1621 v = a->a; 1622 if (usecprow) { 1623 mbs = a->compressedrow.nrows; 1624 ii = a->compressedrow.i; 1625 ridx = a->compressedrow.rindex; 1626 ierr = PetscArrayzero(zarray,15*a->mbs);CHKERRQ(ierr); 1627 } else { 1628 mbs = a->mbs; 1629 ii = a->i; 1630 z = zarray; 1631 } 1632 1633 for (i=0; i<mbs; i++) { 1634 n = ii[i+1] - ii[i]; 1635 idx = ij + ii[i]; 1636 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1637 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1638 1639 for (j=0; j<n; j++) { 1640 xb = x + 15*(idx[j]); 1641 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1642 x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14]; 1643 1644 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 1645 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 1646 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 1647 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 1648 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 1649 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 1650 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 1651 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 1652 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 1653 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 1654 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 1655 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 1656 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 1657 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 1658 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 1659 v += 225; 1660 } 1661 if (usecprow) z = zarray + 15*ridx[i]; 1662 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1663 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1664 1665 if (!usecprow) z += 15; 1666 } 1667 1668 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1669 ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 1670 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 1671 PetscFunctionReturn(0); 1672 } 1673 1674 /* 1675 This will not work with MatScalar == float because it calls the BLAS 1676 */ 1677 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz) 1678 { 1679 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1680 PetscScalar *z = NULL,*work,*workt,*zarray; 1681 const PetscScalar *x,*xb; 1682 const MatScalar *v; 1683 PetscErrorCode ierr; 1684 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 1685 const PetscInt *idx,*ii,*ridx=NULL; 1686 PetscInt ncols,k; 1687 PetscBool usecprow=a->compressedrow.use; 1688 1689 PetscFunctionBegin; 1690 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1691 ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 1692 1693 idx = a->j; 1694 v = a->a; 1695 if (usecprow) { 1696 mbs = a->compressedrow.nrows; 1697 ii = a->compressedrow.i; 1698 ridx = a->compressedrow.rindex; 1699 ierr = PetscArrayzero(zarray,bs*a->mbs);CHKERRQ(ierr); 1700 } else { 1701 mbs = a->mbs; 1702 ii = a->i; 1703 z = zarray; 1704 } 1705 1706 if (!a->mult_work) { 1707 k = PetscMax(A->rmap->n,A->cmap->n); 1708 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 1709 } 1710 work = a->mult_work; 1711 for (i=0; i<mbs; i++) { 1712 n = ii[1] - ii[0]; ii++; 1713 ncols = n*bs; 1714 workt = work; 1715 for (j=0; j<n; j++) { 1716 xb = x + bs*(*idx++); 1717 for (k=0; k<bs; k++) workt[k] = xb[k]; 1718 workt += bs; 1719 } 1720 if (usecprow) z = zarray + bs*ridx[i]; 1721 PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z); 1722 v += n*bs2; 1723 if (!usecprow) z += bs; 1724 } 1725 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1726 ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 1727 ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr); 1728 PetscFunctionReturn(0); 1729 } 1730 1731 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 1732 { 1733 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1734 const PetscScalar *x; 1735 PetscScalar *y,*z,sum; 1736 const MatScalar *v; 1737 PetscErrorCode ierr; 1738 PetscInt mbs=a->mbs,i,n,*ridx=NULL; 1739 const PetscInt *idx,*ii; 1740 PetscBool usecprow=a->compressedrow.use; 1741 1742 PetscFunctionBegin; 1743 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1744 ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr); 1745 1746 idx = a->j; 1747 v = a->a; 1748 if (usecprow) { 1749 if (zz != yy) { 1750 ierr = PetscArraycpy(z,y,mbs);CHKERRQ(ierr); 1751 } 1752 mbs = a->compressedrow.nrows; 1753 ii = a->compressedrow.i; 1754 ridx = a->compressedrow.rindex; 1755 } else { 1756 ii = a->i; 1757 } 1758 1759 for (i=0; i<mbs; i++) { 1760 n = ii[1] - ii[0]; 1761 ii++; 1762 if (!usecprow) { 1763 sum = y[i]; 1764 } else { 1765 sum = y[ridx[i]]; 1766 } 1767 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1768 PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1769 PetscSparseDensePlusDot(sum,x,v,idx,n); 1770 v += n; 1771 idx += n; 1772 if (usecprow) { 1773 z[ridx[i]] = sum; 1774 } else { 1775 z[i] = sum; 1776 } 1777 } 1778 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1779 ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr); 1780 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1781 PetscFunctionReturn(0); 1782 } 1783 1784 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 1785 { 1786 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1787 PetscScalar *y = NULL,*z = NULL,sum1,sum2; 1788 const PetscScalar *x,*xb; 1789 PetscScalar x1,x2,*yarray,*zarray; 1790 const MatScalar *v; 1791 PetscErrorCode ierr; 1792 PetscInt mbs = a->mbs,i,n,j; 1793 const PetscInt *idx,*ii,*ridx = NULL; 1794 PetscBool usecprow = a->compressedrow.use; 1795 1796 PetscFunctionBegin; 1797 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1798 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1799 1800 idx = a->j; 1801 v = a->a; 1802 if (usecprow) { 1803 if (zz != yy) { 1804 ierr = PetscArraycpy(zarray,yarray,2*mbs);CHKERRQ(ierr); 1805 } 1806 mbs = a->compressedrow.nrows; 1807 ii = a->compressedrow.i; 1808 ridx = a->compressedrow.rindex; 1809 } else { 1810 ii = a->i; 1811 y = yarray; 1812 z = zarray; 1813 } 1814 1815 for (i=0; i<mbs; i++) { 1816 n = ii[1] - ii[0]; ii++; 1817 if (usecprow) { 1818 z = zarray + 2*ridx[i]; 1819 y = yarray + 2*ridx[i]; 1820 } 1821 sum1 = y[0]; sum2 = y[1]; 1822 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1823 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1824 for (j=0; j<n; j++) { 1825 xb = x + 2*(*idx++); 1826 x1 = xb[0]; 1827 x2 = xb[1]; 1828 1829 sum1 += v[0]*x1 + v[2]*x2; 1830 sum2 += v[1]*x1 + v[3]*x2; 1831 v += 4; 1832 } 1833 z[0] = sum1; z[1] = sum2; 1834 if (!usecprow) { 1835 z += 2; y += 2; 1836 } 1837 } 1838 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1839 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1840 ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr); 1841 PetscFunctionReturn(0); 1842 } 1843 1844 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 1845 { 1846 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1847 PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray; 1848 const PetscScalar *x,*xb; 1849 const MatScalar *v; 1850 PetscErrorCode ierr; 1851 PetscInt mbs = a->mbs,i,j,n; 1852 const PetscInt *idx,*ii,*ridx = NULL; 1853 PetscBool usecprow = a->compressedrow.use; 1854 1855 PetscFunctionBegin; 1856 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1857 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1858 1859 idx = a->j; 1860 v = a->a; 1861 if (usecprow) { 1862 if (zz != yy) { 1863 ierr = PetscArraycpy(zarray,yarray,3*mbs);CHKERRQ(ierr); 1864 } 1865 mbs = a->compressedrow.nrows; 1866 ii = a->compressedrow.i; 1867 ridx = a->compressedrow.rindex; 1868 } else { 1869 ii = a->i; 1870 y = yarray; 1871 z = zarray; 1872 } 1873 1874 for (i=0; i<mbs; i++) { 1875 n = ii[1] - ii[0]; ii++; 1876 if (usecprow) { 1877 z = zarray + 3*ridx[i]; 1878 y = yarray + 3*ridx[i]; 1879 } 1880 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; 1881 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1882 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1883 for (j=0; j<n; j++) { 1884 xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1885 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 1886 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 1887 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 1888 v += 9; 1889 } 1890 z[0] = sum1; z[1] = sum2; z[2] = sum3; 1891 if (!usecprow) { 1892 z += 3; y += 3; 1893 } 1894 } 1895 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1896 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1897 ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr); 1898 PetscFunctionReturn(0); 1899 } 1900 1901 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 1902 { 1903 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1904 PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray; 1905 const PetscScalar *x,*xb; 1906 const MatScalar *v; 1907 PetscErrorCode ierr; 1908 PetscInt mbs = a->mbs,i,j,n; 1909 const PetscInt *idx,*ii,*ridx=NULL; 1910 PetscBool usecprow=a->compressedrow.use; 1911 1912 PetscFunctionBegin; 1913 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1914 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1915 1916 idx = a->j; 1917 v = a->a; 1918 if (usecprow) { 1919 if (zz != yy) { 1920 ierr = PetscArraycpy(zarray,yarray,4*mbs);CHKERRQ(ierr); 1921 } 1922 mbs = a->compressedrow.nrows; 1923 ii = a->compressedrow.i; 1924 ridx = a->compressedrow.rindex; 1925 } else { 1926 ii = a->i; 1927 y = yarray; 1928 z = zarray; 1929 } 1930 1931 for (i=0; i<mbs; i++) { 1932 n = ii[1] - ii[0]; ii++; 1933 if (usecprow) { 1934 z = zarray + 4*ridx[i]; 1935 y = yarray + 4*ridx[i]; 1936 } 1937 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; 1938 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1939 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1940 for (j=0; j<n; j++) { 1941 xb = x + 4*(*idx++); 1942 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1943 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1944 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1945 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1946 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1947 v += 16; 1948 } 1949 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 1950 if (!usecprow) { 1951 z += 4; y += 4; 1952 } 1953 } 1954 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1955 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1956 ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr); 1957 PetscFunctionReturn(0); 1958 } 1959 1960 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 1961 { 1962 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1963 PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5; 1964 const PetscScalar *x,*xb; 1965 PetscScalar *yarray,*zarray; 1966 const MatScalar *v; 1967 PetscErrorCode ierr; 1968 PetscInt mbs = a->mbs,i,j,n; 1969 const PetscInt *idx,*ii,*ridx = NULL; 1970 PetscBool usecprow=a->compressedrow.use; 1971 1972 PetscFunctionBegin; 1973 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1974 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1975 1976 idx = a->j; 1977 v = a->a; 1978 if (usecprow) { 1979 if (zz != yy) { 1980 ierr = PetscArraycpy(zarray,yarray,5*mbs);CHKERRQ(ierr); 1981 } 1982 mbs = a->compressedrow.nrows; 1983 ii = a->compressedrow.i; 1984 ridx = a->compressedrow.rindex; 1985 } else { 1986 ii = a->i; 1987 y = yarray; 1988 z = zarray; 1989 } 1990 1991 for (i=0; i<mbs; i++) { 1992 n = ii[1] - ii[0]; ii++; 1993 if (usecprow) { 1994 z = zarray + 5*ridx[i]; 1995 y = yarray + 5*ridx[i]; 1996 } 1997 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; 1998 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1999 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2000 for (j=0; j<n; j++) { 2001 xb = x + 5*(*idx++); 2002 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 2003 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2004 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2005 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2006 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2007 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2008 v += 25; 2009 } 2010 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 2011 if (!usecprow) { 2012 z += 5; y += 5; 2013 } 2014 } 2015 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 2016 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 2017 ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr); 2018 PetscFunctionReturn(0); 2019 } 2020 2021 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 2022 { 2023 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2024 PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6; 2025 const PetscScalar *x,*xb; 2026 PetscScalar x1,x2,x3,x4,x5,x6,*yarray,*zarray; 2027 const MatScalar *v; 2028 PetscErrorCode ierr; 2029 PetscInt mbs = a->mbs,i,j,n; 2030 const PetscInt *idx,*ii,*ridx=NULL; 2031 PetscBool usecprow=a->compressedrow.use; 2032 2033 PetscFunctionBegin; 2034 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 2035 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 2036 2037 idx = a->j; 2038 v = a->a; 2039 if (usecprow) { 2040 if (zz != yy) { 2041 ierr = PetscArraycpy(zarray,yarray,6*mbs);CHKERRQ(ierr); 2042 } 2043 mbs = a->compressedrow.nrows; 2044 ii = a->compressedrow.i; 2045 ridx = a->compressedrow.rindex; 2046 } else { 2047 ii = a->i; 2048 y = yarray; 2049 z = zarray; 2050 } 2051 2052 for (i=0; i<mbs; i++) { 2053 n = ii[1] - ii[0]; ii++; 2054 if (usecprow) { 2055 z = zarray + 6*ridx[i]; 2056 y = yarray + 6*ridx[i]; 2057 } 2058 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; 2059 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2060 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2061 for (j=0; j<n; j++) { 2062 xb = x + 6*(*idx++); 2063 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 2064 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2065 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2066 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2067 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2068 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2069 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2070 v += 36; 2071 } 2072 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 2073 if (!usecprow) { 2074 z += 6; y += 6; 2075 } 2076 } 2077 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 2078 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 2079 ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr); 2080 PetscFunctionReturn(0); 2081 } 2082 2083 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 2084 { 2085 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2086 PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 2087 const PetscScalar *x,*xb; 2088 PetscScalar x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray; 2089 const MatScalar *v; 2090 PetscErrorCode ierr; 2091 PetscInt mbs = a->mbs,i,j,n; 2092 const PetscInt *idx,*ii,*ridx = NULL; 2093 PetscBool usecprow=a->compressedrow.use; 2094 2095 PetscFunctionBegin; 2096 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 2097 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 2098 2099 idx = a->j; 2100 v = a->a; 2101 if (usecprow) { 2102 if (zz != yy) { 2103 ierr = PetscArraycpy(zarray,yarray,7*mbs);CHKERRQ(ierr); 2104 } 2105 mbs = a->compressedrow.nrows; 2106 ii = a->compressedrow.i; 2107 ridx = a->compressedrow.rindex; 2108 } else { 2109 ii = a->i; 2110 y = yarray; 2111 z = zarray; 2112 } 2113 2114 for (i=0; i<mbs; i++) { 2115 n = ii[1] - ii[0]; ii++; 2116 if (usecprow) { 2117 z = zarray + 7*ridx[i]; 2118 y = yarray + 7*ridx[i]; 2119 } 2120 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 2121 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2122 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2123 for (j=0; j<n; j++) { 2124 xb = x + 7*(*idx++); 2125 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 2126 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2127 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2128 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2129 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2130 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2131 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2132 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2133 v += 49; 2134 } 2135 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 2136 if (!usecprow) { 2137 z += 7; y += 7; 2138 } 2139 } 2140 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 2141 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 2142 ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr); 2143 PetscFunctionReturn(0); 2144 } 2145 2146 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 2147 PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec yy,Vec zz) 2148 { 2149 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2150 PetscScalar *z = NULL,*work,*workt,*zarray; 2151 const PetscScalar *x,*xb; 2152 const MatScalar *v; 2153 PetscErrorCode ierr; 2154 PetscInt mbs,i,j,n; 2155 PetscInt k; 2156 PetscBool usecprow=a->compressedrow.use; 2157 const PetscInt *idx,*ii,*ridx=NULL,bs = 9, bs2 = 81; 2158 2159 __m256d a0,a1,a2,a3,a4,a5; 2160 __m256d w0,w1,w2,w3; 2161 __m256d z0,z1,z2; 2162 __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63); 2163 2164 PetscFunctionBegin; 2165 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 2166 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 2167 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 2168 2169 idx = a->j; 2170 v = a->a; 2171 if (usecprow) { 2172 mbs = a->compressedrow.nrows; 2173 ii = a->compressedrow.i; 2174 ridx = a->compressedrow.rindex; 2175 } else { 2176 mbs = a->mbs; 2177 ii = a->i; 2178 z = zarray; 2179 } 2180 2181 if (!a->mult_work) { 2182 k = PetscMax(A->rmap->n,A->cmap->n); 2183 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 2184 } 2185 2186 work = a->mult_work; 2187 for (i=0; i<mbs; i++) { 2188 n = ii[1] - ii[0]; ii++; 2189 workt = work; 2190 for (j=0; j<n; j++) { 2191 xb = x + bs*(*idx++); 2192 for (k=0; k<bs; k++) workt[k] = xb[k]; 2193 workt += bs; 2194 } 2195 if (usecprow) z = zarray + bs*ridx[i]; 2196 2197 z0 = _mm256_loadu_pd(&z[ 0]); z1 = _mm256_loadu_pd(&z[ 4]); z2 = _mm256_set1_pd(z[ 8]); 2198 2199 for (j=0; j<n; j++) { 2200 /* first column of a */ 2201 w0 = _mm256_set1_pd(work[j*9 ]); 2202 a0 = _mm256_loadu_pd(&v[j*81 ]); z0 = _mm256_fmadd_pd(a0,w0,z0); 2203 a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1); 2204 a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2); 2205 2206 /* second column of a */ 2207 w1 = _mm256_set1_pd(work[j*9+ 1]); 2208 a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0); 2209 a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1); 2210 a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2); 2211 2212 /* third column of a */ 2213 w2 = _mm256_set1_pd(work[j*9+ 2]); 2214 a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0); 2215 a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1); 2216 a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2); 2217 2218 /* fourth column of a */ 2219 w3 = _mm256_set1_pd(work[j*9+ 3]); 2220 a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0); 2221 a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1); 2222 a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2); 2223 2224 /* fifth column of a */ 2225 w0 = _mm256_set1_pd(work[j*9+ 4]); 2226 a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0); 2227 a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1); 2228 a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2); 2229 2230 /* sixth column of a */ 2231 w1 = _mm256_set1_pd(work[j*9+ 5]); 2232 a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0); 2233 a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1); 2234 a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2); 2235 2236 /* seventh column of a */ 2237 w2 = _mm256_set1_pd(work[j*9+ 6]); 2238 a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0); 2239 a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1); 2240 a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2); 2241 2242 /* eigth column of a */ 2243 w3 = _mm256_set1_pd(work[j*9+ 7]); 2244 a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0); 2245 a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1); 2246 a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2); 2247 2248 /* ninth column of a */ 2249 w0 = _mm256_set1_pd(work[j*9+ 8]); 2250 a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0); 2251 a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1); 2252 a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2); 2253 } 2254 2255 _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2); 2256 2257 v += n*bs2; 2258 if (!usecprow) z += bs; 2259 } 2260 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 2261 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 2262 ierr = PetscLogFlops(162.0*a->nz);CHKERRQ(ierr); 2263 PetscFunctionReturn(0); 2264 } 2265 #endif 2266 2267 PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A,Vec xx,Vec yy,Vec zz) 2268 { 2269 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2270 PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11; 2271 const PetscScalar *x,*xb; 2272 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,*yarray,*zarray; 2273 const MatScalar *v; 2274 PetscErrorCode ierr; 2275 PetscInt mbs = a->mbs,i,j,n; 2276 const PetscInt *idx,*ii,*ridx = NULL; 2277 PetscBool usecprow=a->compressedrow.use; 2278 2279 PetscFunctionBegin; 2280 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 2281 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 2282 2283 idx = a->j; 2284 v = a->a; 2285 if (usecprow) { 2286 if (zz != yy) { 2287 ierr = PetscArraycpy(zarray,yarray,7*mbs);CHKERRQ(ierr); 2288 } 2289 mbs = a->compressedrow.nrows; 2290 ii = a->compressedrow.i; 2291 ridx = a->compressedrow.rindex; 2292 } else { 2293 ii = a->i; 2294 y = yarray; 2295 z = zarray; 2296 } 2297 2298 for (i=0; i<mbs; i++) { 2299 n = ii[1] - ii[0]; ii++; 2300 if (usecprow) { 2301 z = zarray + 11*ridx[i]; 2302 y = yarray + 11*ridx[i]; 2303 } 2304 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 2305 sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10]; 2306 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2307 PetscPrefetchBlock(v+121*n,121*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2308 for (j=0; j<n; j++) { 2309 xb = x + 11*(*idx++); 2310 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; 2311 sum1 += v[ 0]*x1 + v[ 11]*x2 + v[2*11]*x3 + v[3*11]*x4 + v[4*11]*x5 + v[5*11]*x6 + v[6*11]*x7+ v[7*11]*x8 + v[8*11]*x9 + v[9*11]*x10 + v[10*11]*x11; 2312 sum2 += v[1+0]*x1 + v[1+11]*x2 + v[1+2*11]*x3 + v[1+3*11]*x4 + v[1+4*11]*x5 + v[1+5*11]*x6 + v[1+6*11]*x7+ v[1+7*11]*x8 + v[1+8*11]*x9 + v[1+9*11]*x10 + v[1+10*11]*x11; 2313 sum3 += v[2+0]*x1 + v[2+11]*x2 + v[2+2*11]*x3 + v[2+3*11]*x4 + v[2+4*11]*x5 + v[2+5*11]*x6 + v[2+6*11]*x7+ v[2+7*11]*x8 + v[2+8*11]*x9 + v[2+9*11]*x10 + v[2+10*11]*x11; 2314 sum4 += v[3+0]*x1 + v[3+11]*x2 + v[3+2*11]*x3 + v[3+3*11]*x4 + v[3+4*11]*x5 + v[3+5*11]*x6 + v[3+6*11]*x7+ v[3+7*11]*x8 + v[3+8*11]*x9 + v[3+9*11]*x10 + v[3+10*11]*x11; 2315 sum5 += v[4+0]*x1 + v[4+11]*x2 + v[4+2*11]*x3 + v[4+3*11]*x4 + v[4+4*11]*x5 + v[4+5*11]*x6 + v[4+6*11]*x7+ v[4+7*11]*x8 + v[4+8*11]*x9 + v[4+9*11]*x10 + v[4+10*11]*x11; 2316 sum6 += v[5+0]*x1 + v[5+11]*x2 + v[5+2*11]*x3 + v[5+3*11]*x4 + v[5+4*11]*x5 + v[5+5*11]*x6 + v[5+6*11]*x7+ v[5+7*11]*x8 + v[5+8*11]*x9 + v[5+9*11]*x10 + v[5+10*11]*x11; 2317 sum7 += v[6+0]*x1 + v[6+11]*x2 + v[6+2*11]*x3 + v[6+3*11]*x4 + v[6+4*11]*x5 + v[6+5*11]*x6 + v[6+6*11]*x7+ v[6+7*11]*x8 + v[6+8*11]*x9 + v[6+9*11]*x10 + v[6+10*11]*x11; 2318 sum8 += v[7+0]*x1 + v[7+11]*x2 + v[7+2*11]*x3 + v[7+3*11]*x4 + v[7+4*11]*x5 + v[7+5*11]*x6 + v[7+6*11]*x7+ v[7+7*11]*x8 + v[7+8*11]*x9 + v[7+9*11]*x10 + v[7+10*11]*x11; 2319 sum9 += v[8+0]*x1 + v[8+11]*x2 + v[8+2*11]*x3 + v[8+3*11]*x4 + v[8+4*11]*x5 + v[8+5*11]*x6 + v[8+6*11]*x7+ v[8+7*11]*x8 + v[8+8*11]*x9 + v[8+9*11]*x10 + v[8+10*11]*x11; 2320 sum10 += v[9+0]*x1 + v[9+11]*x2 + v[9+2*11]*x3 + v[9+3*11]*x4 + v[9+4*11]*x5 + v[9+5*11]*x6 + v[9+6*11]*x7+ v[9+7*11]*x8 + v[9+8*11]*x9 + v[9+9*11]*x10 + v[9+10*11]*x11; 2321 sum11 += v[10+0]*x1 + v[10+11]*x2 + v[10+2*11]*x3 + v[10+3*11]*x4 + v[10+4*11]*x5 + v[10+5*11]*x6 + v[10+6*11]*x7+ v[10+7*11]*x8 + v[10+8*11]*x9 + v[10+9*11]*x10 + v[10+10*11]*x11; 2322 v += 121; 2323 } 2324 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 2325 z[7] = sum6; z[8] = sum7; z[9] = sum8; z[10] = sum9; z[11] = sum10; 2326 if (!usecprow) { 2327 z += 11; y += 11; 2328 } 2329 } 2330 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 2331 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 2332 ierr = PetscLogFlops(242.0*a->nz);CHKERRQ(ierr); 2333 PetscFunctionReturn(0); 2334 } 2335 2336 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 2337 { 2338 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2339 PetscScalar *z = NULL,*work,*workt,*zarray; 2340 const PetscScalar *x,*xb; 2341 const MatScalar *v; 2342 PetscErrorCode ierr; 2343 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 2344 PetscInt ncols,k; 2345 const PetscInt *ridx = NULL,*idx,*ii; 2346 PetscBool usecprow = a->compressedrow.use; 2347 2348 PetscFunctionBegin; 2349 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 2350 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 2351 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 2352 2353 idx = a->j; 2354 v = a->a; 2355 if (usecprow) { 2356 mbs = a->compressedrow.nrows; 2357 ii = a->compressedrow.i; 2358 ridx = a->compressedrow.rindex; 2359 } else { 2360 mbs = a->mbs; 2361 ii = a->i; 2362 z = zarray; 2363 } 2364 2365 if (!a->mult_work) { 2366 k = PetscMax(A->rmap->n,A->cmap->n); 2367 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 2368 } 2369 work = a->mult_work; 2370 for (i=0; i<mbs; i++) { 2371 n = ii[1] - ii[0]; ii++; 2372 ncols = n*bs; 2373 workt = work; 2374 for (j=0; j<n; j++) { 2375 xb = x + bs*(*idx++); 2376 for (k=0; k<bs; k++) workt[k] = xb[k]; 2377 workt += bs; 2378 } 2379 if (usecprow) z = zarray + bs*ridx[i]; 2380 PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 2381 v += n*bs2; 2382 if (!usecprow) z += bs; 2383 } 2384 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 2385 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 2386 ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr); 2387 PetscFunctionReturn(0); 2388 } 2389 2390 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 2391 { 2392 PetscScalar zero = 0.0; 2393 PetscErrorCode ierr; 2394 2395 PetscFunctionBegin; 2396 ierr = VecSet(zz,zero);CHKERRQ(ierr); 2397 ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 2398 PetscFunctionReturn(0); 2399 } 2400 2401 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 2402 { 2403 PetscScalar zero = 0.0; 2404 PetscErrorCode ierr; 2405 2406 PetscFunctionBegin; 2407 ierr = VecSet(zz,zero);CHKERRQ(ierr); 2408 ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 2409 PetscFunctionReturn(0); 2410 } 2411 2412 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 2413 { 2414 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2415 PetscScalar *z,x1,x2,x3,x4,x5; 2416 const PetscScalar *x,*xb = NULL; 2417 const MatScalar *v; 2418 PetscErrorCode ierr; 2419 PetscInt mbs,i,rval,bs=A->rmap->bs,j,n; 2420 const PetscInt *idx,*ii,*ib,*ridx = NULL; 2421 Mat_CompressedRow cprow = a->compressedrow; 2422 PetscBool usecprow = cprow.use; 2423 2424 PetscFunctionBegin; 2425 if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); } 2426 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 2427 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 2428 2429 idx = a->j; 2430 v = a->a; 2431 if (usecprow) { 2432 mbs = cprow.nrows; 2433 ii = cprow.i; 2434 ridx = cprow.rindex; 2435 } else { 2436 mbs=a->mbs; 2437 ii = a->i; 2438 xb = x; 2439 } 2440 2441 switch (bs) { 2442 case 1: 2443 for (i=0; i<mbs; i++) { 2444 if (usecprow) xb = x + ridx[i]; 2445 x1 = xb[0]; 2446 ib = idx + ii[0]; 2447 n = ii[1] - ii[0]; ii++; 2448 for (j=0; j<n; j++) { 2449 rval = ib[j]; 2450 z[rval] += PetscConj(*v) * x1; 2451 v++; 2452 } 2453 if (!usecprow) xb++; 2454 } 2455 break; 2456 case 2: 2457 for (i=0; i<mbs; i++) { 2458 if (usecprow) xb = x + 2*ridx[i]; 2459 x1 = xb[0]; x2 = xb[1]; 2460 ib = idx + ii[0]; 2461 n = ii[1] - ii[0]; ii++; 2462 for (j=0; j<n; j++) { 2463 rval = ib[j]*2; 2464 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2; 2465 z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2; 2466 v += 4; 2467 } 2468 if (!usecprow) xb += 2; 2469 } 2470 break; 2471 case 3: 2472 for (i=0; i<mbs; i++) { 2473 if (usecprow) xb = x + 3*ridx[i]; 2474 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2475 ib = idx + ii[0]; 2476 n = ii[1] - ii[0]; ii++; 2477 for (j=0; j<n; j++) { 2478 rval = ib[j]*3; 2479 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3; 2480 z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3; 2481 z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3; 2482 v += 9; 2483 } 2484 if (!usecprow) xb += 3; 2485 } 2486 break; 2487 case 4: 2488 for (i=0; i<mbs; i++) { 2489 if (usecprow) xb = x + 4*ridx[i]; 2490 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 2491 ib = idx + ii[0]; 2492 n = ii[1] - ii[0]; ii++; 2493 for (j=0; j<n; j++) { 2494 rval = ib[j]*4; 2495 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4; 2496 z[rval++] += PetscConj(v[4])*x1 + PetscConj(v[5])*x2 + PetscConj(v[6])*x3 + PetscConj(v[7])*x4; 2497 z[rval++] += PetscConj(v[8])*x1 + PetscConj(v[9])*x2 + PetscConj(v[10])*x3 + PetscConj(v[11])*x4; 2498 z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4; 2499 v += 16; 2500 } 2501 if (!usecprow) xb += 4; 2502 } 2503 break; 2504 case 5: 2505 for (i=0; i<mbs; i++) { 2506 if (usecprow) xb = x + 5*ridx[i]; 2507 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2508 x4 = xb[3]; x5 = xb[4]; 2509 ib = idx + ii[0]; 2510 n = ii[1] - ii[0]; ii++; 2511 for (j=0; j<n; j++) { 2512 rval = ib[j]*5; 2513 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4 + PetscConj(v[4])*x5; 2514 z[rval++] += PetscConj(v[5])*x1 + PetscConj(v[6])*x2 + PetscConj(v[7])*x3 + PetscConj(v[8])*x4 + PetscConj(v[9])*x5; 2515 z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5; 2516 z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5; 2517 z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5; 2518 v += 25; 2519 } 2520 if (!usecprow) xb += 5; 2521 } 2522 break; 2523 default: /* block sizes larger than 5 by 5 are handled by BLAS */ 2524 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet"); 2525 #if 0 2526 { 2527 PetscInt ncols,k,bs2=a->bs2; 2528 PetscScalar *work,*workt,zb; 2529 const PetscScalar *xtmp; 2530 if (!a->mult_work) { 2531 k = PetscMax(A->rmap->n,A->cmap->n); 2532 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 2533 } 2534 work = a->mult_work; 2535 xtmp = x; 2536 for (i=0; i<mbs; i++) { 2537 n = ii[1] - ii[0]; ii++; 2538 ncols = n*bs; 2539 ierr = PetscArrayzero(work,ncols);CHKERRQ(ierr); 2540 if (usecprow) xtmp = x + bs*ridx[i]; 2541 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 2542 v += n*bs2; 2543 if (!usecprow) xtmp += bs; 2544 workt = work; 2545 for (j=0; j<n; j++) { 2546 zb = z + bs*(*idx++); 2547 for (k=0; k<bs; k++) zb[k] += workt[k] ; 2548 workt += bs; 2549 } 2550 } 2551 } 2552 #endif 2553 } 2554 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 2555 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 2556 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 2557 PetscFunctionReturn(0); 2558 } 2559 2560 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 2561 { 2562 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2563 PetscScalar *zb,*z,x1,x2,x3,x4,x5; 2564 const PetscScalar *x,*xb = NULL; 2565 const MatScalar *v; 2566 PetscErrorCode ierr; 2567 PetscInt mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2; 2568 const PetscInt *idx,*ii,*ib,*ridx = NULL; 2569 Mat_CompressedRow cprow = a->compressedrow; 2570 PetscBool usecprow=cprow.use; 2571 2572 PetscFunctionBegin; 2573 if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); } 2574 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 2575 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 2576 2577 idx = a->j; 2578 v = a->a; 2579 if (usecprow) { 2580 mbs = cprow.nrows; 2581 ii = cprow.i; 2582 ridx = cprow.rindex; 2583 } else { 2584 mbs=a->mbs; 2585 ii = a->i; 2586 xb = x; 2587 } 2588 2589 switch (bs) { 2590 case 1: 2591 for (i=0; i<mbs; i++) { 2592 if (usecprow) xb = x + ridx[i]; 2593 x1 = xb[0]; 2594 ib = idx + ii[0]; 2595 n = ii[1] - ii[0]; ii++; 2596 for (j=0; j<n; j++) { 2597 rval = ib[j]; 2598 z[rval] += *v * x1; 2599 v++; 2600 } 2601 if (!usecprow) xb++; 2602 } 2603 break; 2604 case 2: 2605 for (i=0; i<mbs; i++) { 2606 if (usecprow) xb = x + 2*ridx[i]; 2607 x1 = xb[0]; x2 = xb[1]; 2608 ib = idx + ii[0]; 2609 n = ii[1] - ii[0]; ii++; 2610 for (j=0; j<n; j++) { 2611 rval = ib[j]*2; 2612 z[rval++] += v[0]*x1 + v[1]*x2; 2613 z[rval++] += v[2]*x1 + v[3]*x2; 2614 v += 4; 2615 } 2616 if (!usecprow) xb += 2; 2617 } 2618 break; 2619 case 3: 2620 for (i=0; i<mbs; i++) { 2621 if (usecprow) xb = x + 3*ridx[i]; 2622 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2623 ib = idx + ii[0]; 2624 n = ii[1] - ii[0]; ii++; 2625 for (j=0; j<n; j++) { 2626 rval = ib[j]*3; 2627 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3; 2628 z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3; 2629 z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3; 2630 v += 9; 2631 } 2632 if (!usecprow) xb += 3; 2633 } 2634 break; 2635 case 4: 2636 for (i=0; i<mbs; i++) { 2637 if (usecprow) xb = x + 4*ridx[i]; 2638 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 2639 ib = idx + ii[0]; 2640 n = ii[1] - ii[0]; ii++; 2641 for (j=0; j<n; j++) { 2642 rval = ib[j]*4; 2643 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 2644 z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 2645 z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 2646 z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 2647 v += 16; 2648 } 2649 if (!usecprow) xb += 4; 2650 } 2651 break; 2652 case 5: 2653 for (i=0; i<mbs; i++) { 2654 if (usecprow) xb = x + 5*ridx[i]; 2655 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2656 x4 = xb[3]; x5 = xb[4]; 2657 ib = idx + ii[0]; 2658 n = ii[1] - ii[0]; ii++; 2659 for (j=0; j<n; j++) { 2660 rval = ib[j]*5; 2661 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 2662 z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 2663 z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 2664 z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 2665 z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 2666 v += 25; 2667 } 2668 if (!usecprow) xb += 5; 2669 } 2670 break; 2671 default: { /* block sizes larger then 5 by 5 are handled by BLAS */ 2672 PetscInt ncols,k; 2673 PetscScalar *work,*workt; 2674 const PetscScalar *xtmp; 2675 if (!a->mult_work) { 2676 k = PetscMax(A->rmap->n,A->cmap->n); 2677 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 2678 } 2679 work = a->mult_work; 2680 xtmp = x; 2681 for (i=0; i<mbs; i++) { 2682 n = ii[1] - ii[0]; ii++; 2683 ncols = n*bs; 2684 ierr = PetscArrayzero(work,ncols);CHKERRQ(ierr); 2685 if (usecprow) xtmp = x + bs*ridx[i]; 2686 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 2687 v += n*bs2; 2688 if (!usecprow) xtmp += bs; 2689 workt = work; 2690 for (j=0; j<n; j++) { 2691 zb = z + bs*(*idx++); 2692 for (k=0; k<bs; k++) zb[k] += workt[k]; 2693 workt += bs; 2694 } 2695 } 2696 } 2697 } 2698 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 2699 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 2700 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 2701 PetscFunctionReturn(0); 2702 } 2703 2704 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha) 2705 { 2706 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data; 2707 PetscInt totalnz = a->bs2*a->nz; 2708 PetscScalar oalpha = alpha; 2709 PetscErrorCode ierr; 2710 PetscBLASInt one = 1,tnz; 2711 2712 PetscFunctionBegin; 2713 ierr = PetscBLASIntCast(totalnz,&tnz);CHKERRQ(ierr); 2714 PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one)); 2715 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 2716 PetscFunctionReturn(0); 2717 } 2718 2719 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm) 2720 { 2721 PetscErrorCode ierr; 2722 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2723 MatScalar *v = a->a; 2724 PetscReal sum = 0.0; 2725 PetscInt i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1; 2726 2727 PetscFunctionBegin; 2728 if (type == NORM_FROBENIUS) { 2729 #if defined(PETSC_USE_REAL___FP16) 2730 PetscBLASInt one = 1,cnt = bs2*nz; 2731 PetscStackCallBLAS("BLASnrm2",*norm = BLASnrm2_(&cnt,v,&one)); 2732 #else 2733 for (i=0; i<bs2*nz; i++) { 2734 sum += PetscRealPart(PetscConj(*v)*(*v)); v++; 2735 } 2736 #endif 2737 *norm = PetscSqrtReal(sum); 2738 ierr = PetscLogFlops(2.0*bs2*nz);CHKERRQ(ierr); 2739 } else if (type == NORM_1) { /* maximum column sum */ 2740 PetscReal *tmp; 2741 PetscInt *bcol = a->j; 2742 ierr = PetscCalloc1(A->cmap->n+1,&tmp);CHKERRQ(ierr); 2743 for (i=0; i<nz; i++) { 2744 for (j=0; j<bs; j++) { 2745 k1 = bs*(*bcol) + j; /* column index */ 2746 for (k=0; k<bs; k++) { 2747 tmp[k1] += PetscAbsScalar(*v); v++; 2748 } 2749 } 2750 bcol++; 2751 } 2752 *norm = 0.0; 2753 for (j=0; j<A->cmap->n; j++) { 2754 if (tmp[j] > *norm) *norm = tmp[j]; 2755 } 2756 ierr = PetscFree(tmp);CHKERRQ(ierr); 2757 ierr = PetscLogFlops(PetscMax(bs2*nz-1,0));CHKERRQ(ierr); 2758 } else if (type == NORM_INFINITY) { /* maximum row sum */ 2759 *norm = 0.0; 2760 for (k=0; k<bs; k++) { 2761 for (j=0; j<a->mbs; j++) { 2762 v = a->a + bs2*a->i[j] + k; 2763 sum = 0.0; 2764 for (i=0; i<a->i[j+1]-a->i[j]; i++) { 2765 for (k1=0; k1<bs; k1++) { 2766 sum += PetscAbsScalar(*v); 2767 v += bs; 2768 } 2769 } 2770 if (sum > *norm) *norm = sum; 2771 } 2772 } 2773 ierr = PetscLogFlops(PetscMax(bs2*nz-1,0));CHKERRQ(ierr); 2774 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet"); 2775 PetscFunctionReturn(0); 2776 } 2777 2778 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg) 2779 { 2780 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data; 2781 PetscErrorCode ierr; 2782 2783 PetscFunctionBegin; 2784 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 2785 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 2786 *flg = PETSC_FALSE; 2787 PetscFunctionReturn(0); 2788 } 2789 2790 /* if the a->i are the same */ 2791 ierr = PetscArraycmp(a->i,b->i,a->mbs+1,flg);CHKERRQ(ierr); 2792 if (!*flg) PetscFunctionReturn(0); 2793 2794 /* if a->j are the same */ 2795 ierr = PetscArraycmp(a->j,b->j,a->nz,flg);CHKERRQ(ierr); 2796 if (!*flg) PetscFunctionReturn(0); 2797 2798 /* if a->a are the same */ 2799 ierr = PetscArraycmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs),flg);CHKERRQ(ierr); 2800 PetscFunctionReturn(0); 2801 2802 } 2803 2804 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v) 2805 { 2806 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2807 PetscErrorCode ierr; 2808 PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 2809 PetscScalar *x,zero = 0.0; 2810 MatScalar *aa,*aa_j; 2811 2812 PetscFunctionBegin; 2813 if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 2814 bs = A->rmap->bs; 2815 aa = a->a; 2816 ai = a->i; 2817 aj = a->j; 2818 ambs = a->mbs; 2819 bs2 = a->bs2; 2820 2821 ierr = VecSet(v,zero);CHKERRQ(ierr); 2822 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 2823 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 2824 if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 2825 for (i=0; i<ambs; i++) { 2826 for (j=ai[i]; j<ai[i+1]; j++) { 2827 if (aj[j] == i) { 2828 row = i*bs; 2829 aa_j = aa+j*bs2; 2830 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 2831 break; 2832 } 2833 } 2834 } 2835 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 2836 PetscFunctionReturn(0); 2837 } 2838 2839 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr) 2840 { 2841 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2842 const PetscScalar *l,*r,*li,*ri; 2843 PetscScalar x; 2844 MatScalar *aa, *v; 2845 PetscErrorCode ierr; 2846 PetscInt i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai; 2847 const PetscInt *ai,*aj; 2848 2849 PetscFunctionBegin; 2850 ai = a->i; 2851 aj = a->j; 2852 aa = a->a; 2853 m = A->rmap->n; 2854 n = A->cmap->n; 2855 bs = A->rmap->bs; 2856 mbs = a->mbs; 2857 bs2 = a->bs2; 2858 if (ll) { 2859 ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr); 2860 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 2861 if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 2862 for (i=0; i<mbs; i++) { /* for each block row */ 2863 M = ai[i+1] - ai[i]; 2864 li = l + i*bs; 2865 v = aa + bs2*ai[i]; 2866 for (j=0; j<M; j++) { /* for each block */ 2867 for (k=0; k<bs2; k++) { 2868 (*v++) *= li[k%bs]; 2869 } 2870 } 2871 } 2872 ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr); 2873 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2874 } 2875 2876 if (rr) { 2877 ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr); 2878 ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr); 2879 if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length"); 2880 for (i=0; i<mbs; i++) { /* for each block row */ 2881 iai = ai[i]; 2882 M = ai[i+1] - iai; 2883 v = aa + bs2*iai; 2884 for (j=0; j<M; j++) { /* for each block */ 2885 ri = r + bs*aj[iai+j]; 2886 for (k=0; k<bs; k++) { 2887 x = ri[k]; 2888 for (tmp=0; tmp<bs; tmp++) v[tmp] *= x; 2889 v += bs; 2890 } 2891 } 2892 } 2893 ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr); 2894 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2895 } 2896 PetscFunctionReturn(0); 2897 } 2898 2899 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info) 2900 { 2901 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2902 2903 PetscFunctionBegin; 2904 info->block_size = a->bs2; 2905 info->nz_allocated = a->bs2*a->maxnz; 2906 info->nz_used = a->bs2*a->nz; 2907 info->nz_unneeded = info->nz_allocated - info->nz_used; 2908 info->assemblies = A->num_ass; 2909 info->mallocs = A->info.mallocs; 2910 info->memory = ((PetscObject)A)->mem; 2911 if (A->factortype) { 2912 info->fill_ratio_given = A->info.fill_ratio_given; 2913 info->fill_ratio_needed = A->info.fill_ratio_needed; 2914 info->factor_mallocs = A->info.factor_mallocs; 2915 } else { 2916 info->fill_ratio_given = 0; 2917 info->fill_ratio_needed = 0; 2918 info->factor_mallocs = 0; 2919 } 2920 PetscFunctionReturn(0); 2921 } 2922 2923 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) 2924 { 2925 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2926 PetscErrorCode ierr; 2927 2928 PetscFunctionBegin; 2929 ierr = PetscArrayzero(a->a,a->bs2*a->i[a->mbs]);CHKERRQ(ierr); 2930 PetscFunctionReturn(0); 2931 } 2932 2933 PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C) 2934 { 2935 PetscErrorCode ierr; 2936 2937 PetscFunctionBegin; 2938 ierr = MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);CHKERRQ(ierr); 2939 C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense; 2940 PetscFunctionReturn(0); 2941 } 2942 2943 PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 2944 { 2945 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2946 PetscScalar *z = NULL,sum1; 2947 const PetscScalar *xb; 2948 PetscScalar x1; 2949 const MatScalar *v,*vv; 2950 PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL; 2951 PetscBool usecprow=a->compressedrow.use; 2952 2953 PetscFunctionBegin; 2954 idx = a->j; 2955 v = a->a; 2956 if (usecprow) { 2957 mbs = a->compressedrow.nrows; 2958 ii = a->compressedrow.i; 2959 ridx = a->compressedrow.rindex; 2960 } else { 2961 mbs = a->mbs; 2962 ii = a->i; 2963 z = c; 2964 } 2965 2966 for (i=0; i<mbs; i++) { 2967 n = ii[1] - ii[0]; ii++; 2968 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2969 PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2970 if (usecprow) z = c + ridx[i]; 2971 jj = idx; 2972 vv = v; 2973 for (k=0; k<cn; k++) { 2974 idx = jj; 2975 v = vv; 2976 sum1 = 0.0; 2977 for (j=0; j<n; j++) { 2978 xb = b + (*idx++); x1 = xb[0+k*bm]; 2979 sum1 += v[0]*x1; 2980 v += 1; 2981 } 2982 z[0+k*cm] = sum1; 2983 } 2984 if (!usecprow) z += 1; 2985 } 2986 PetscFunctionReturn(0); 2987 } 2988 2989 PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 2990 { 2991 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2992 PetscScalar *z = NULL,sum1,sum2; 2993 const PetscScalar *xb; 2994 PetscScalar x1,x2; 2995 const MatScalar *v,*vv; 2996 PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL; 2997 PetscBool usecprow=a->compressedrow.use; 2998 2999 PetscFunctionBegin; 3000 idx = a->j; 3001 v = a->a; 3002 if (usecprow) { 3003 mbs = a->compressedrow.nrows; 3004 ii = a->compressedrow.i; 3005 ridx = a->compressedrow.rindex; 3006 } else { 3007 mbs = a->mbs; 3008 ii = a->i; 3009 z = c; 3010 } 3011 3012 for (i=0; i<mbs; i++) { 3013 n = ii[1] - ii[0]; ii++; 3014 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 3015 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3016 if (usecprow) z = c + 2*ridx[i]; 3017 jj = idx; 3018 vv = v; 3019 for (k=0; k<cn; k++) { 3020 idx = jj; 3021 v = vv; 3022 sum1 = 0.0; sum2 = 0.0; 3023 for (j=0; j<n; j++) { 3024 xb = b + 2*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; 3025 sum1 += v[0]*x1 + v[2]*x2; 3026 sum2 += v[1]*x1 + v[3]*x2; 3027 v += 4; 3028 } 3029 z[0+k*cm] = sum1; z[1+k*cm] = sum2; 3030 } 3031 if (!usecprow) z += 2; 3032 } 3033 PetscFunctionReturn(0); 3034 } 3035 3036 PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 3037 { 3038 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 3039 PetscScalar *z = NULL,sum1,sum2,sum3; 3040 const PetscScalar *xb; 3041 PetscScalar x1,x2,x3; 3042 const MatScalar *v,*vv; 3043 PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL; 3044 PetscBool usecprow=a->compressedrow.use; 3045 3046 PetscFunctionBegin; 3047 idx = a->j; 3048 v = a->a; 3049 if (usecprow) { 3050 mbs = a->compressedrow.nrows; 3051 ii = a->compressedrow.i; 3052 ridx = a->compressedrow.rindex; 3053 } else { 3054 mbs = a->mbs; 3055 ii = a->i; 3056 z = c; 3057 } 3058 3059 for (i=0; i<mbs; i++) { 3060 n = ii[1] - ii[0]; ii++; 3061 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 3062 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3063 if (usecprow) z = c + 3*ridx[i]; 3064 jj = idx; 3065 vv = v; 3066 for (k=0; k<cn; k++) { 3067 idx = jj; 3068 v = vv; 3069 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 3070 for (j=0; j<n; j++) { 3071 xb = b + 3*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; 3072 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 3073 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 3074 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 3075 v += 9; 3076 } 3077 z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; 3078 } 3079 if (!usecprow) z += 3; 3080 } 3081 PetscFunctionReturn(0); 3082 } 3083 3084 PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 3085 { 3086 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 3087 PetscScalar *z = NULL,sum1,sum2,sum3,sum4; 3088 const PetscScalar *xb; 3089 PetscScalar x1,x2,x3,x4; 3090 const MatScalar *v,*vv; 3091 PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL; 3092 PetscBool usecprow=a->compressedrow.use; 3093 3094 PetscFunctionBegin; 3095 idx = a->j; 3096 v = a->a; 3097 if (usecprow) { 3098 mbs = a->compressedrow.nrows; 3099 ii = a->compressedrow.i; 3100 ridx = a->compressedrow.rindex; 3101 } else { 3102 mbs = a->mbs; 3103 ii = a->i; 3104 z = c; 3105 } 3106 3107 for (i=0; i<mbs; i++) { 3108 n = ii[1] - ii[0]; ii++; 3109 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 3110 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3111 if (usecprow) z = c + 4*ridx[i]; 3112 jj = idx; 3113 vv = v; 3114 for (k=0; k<cn; k++) { 3115 idx = jj; 3116 v = vv; 3117 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; 3118 for (j=0; j<n; j++) { 3119 xb = b + 4*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm]; 3120 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3121 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3122 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3123 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3124 v += 16; 3125 } 3126 z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; z[3+k*cm] = sum4; 3127 } 3128 if (!usecprow) z += 4; 3129 } 3130 PetscFunctionReturn(0); 3131 } 3132 3133 PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 3134 { 3135 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 3136 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5; 3137 const PetscScalar *xb; 3138 PetscScalar x1,x2,x3,x4,x5; 3139 const MatScalar *v,*vv; 3140 PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL; 3141 PetscBool usecprow=a->compressedrow.use; 3142 3143 PetscFunctionBegin; 3144 idx = a->j; 3145 v = a->a; 3146 if (usecprow) { 3147 mbs = a->compressedrow.nrows; 3148 ii = a->compressedrow.i; 3149 ridx = a->compressedrow.rindex; 3150 } else { 3151 mbs = a->mbs; 3152 ii = a->i; 3153 z = c; 3154 } 3155 3156 for (i=0; i<mbs; i++) { 3157 n = ii[1] - ii[0]; ii++; 3158 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 3159 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3160 if (usecprow) z = c + 5*ridx[i]; 3161 jj = idx; 3162 vv = v; 3163 for (k=0; k<cn; k++) { 3164 idx = jj; 3165 v = vv; 3166 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 3167 for (j=0; j<n; j++) { 3168 xb = b + 5*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm]; x5 = xb[4+k*bm]; 3169 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3170 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3171 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3172 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3173 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3174 v += 25; 3175 } 3176 z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; z[3+k*cm] = sum4; z[4+k*cm] = sum5; 3177 } 3178 if (!usecprow) z += 5; 3179 } 3180 PetscFunctionReturn(0); 3181 } 3182 3183 PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A,Mat B,Mat C) 3184 { 3185 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 3186 Mat_SeqDense *bd = (Mat_SeqDense*)B->data; 3187 Mat_SeqDense *cd = (Mat_SeqDense*)C->data; 3188 PetscInt cm=cd->lda,cn=B->cmap->n,bm=bd->lda; 3189 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 3190 PetscBLASInt bbs,bcn,bbm,bcm; 3191 PetscScalar *z = NULL; 3192 PetscScalar *c,*b; 3193 const MatScalar *v; 3194 const PetscInt *idx,*ii,*ridx=NULL; 3195 PetscScalar _DZero=0.0,_DOne=1.0; 3196 PetscBool usecprow=a->compressedrow.use; 3197 PetscErrorCode ierr; 3198 3199 PetscFunctionBegin; 3200 if (!cm || !cn) PetscFunctionReturn(0); 3201 if (B->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,B->rmap->n); 3202 if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n); 3203 if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n); 3204 b = bd->v; 3205 if (a->nonzerorowcnt != A->rmap->n) { 3206 ierr = MatZeroEntries(C);CHKERRQ(ierr); 3207 } 3208 ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr); 3209 switch (bs) { 3210 case 1: 3211 ierr = MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn);CHKERRQ(ierr); 3212 break; 3213 case 2: 3214 ierr = MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn);CHKERRQ(ierr); 3215 break; 3216 case 3: 3217 ierr = MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn);CHKERRQ(ierr); 3218 break; 3219 case 4: 3220 ierr = MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn);CHKERRQ(ierr); 3221 break; 3222 case 5: 3223 ierr = MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn);CHKERRQ(ierr); 3224 break; 3225 default: /* block sizes larger than 5 by 5 are handled by BLAS */ 3226 ierr = PetscBLASIntCast(bs,&bbs);CHKERRQ(ierr); 3227 ierr = PetscBLASIntCast(cn,&bcn);CHKERRQ(ierr); 3228 ierr = PetscBLASIntCast(bm,&bbm);CHKERRQ(ierr); 3229 ierr = PetscBLASIntCast(cm,&bcm);CHKERRQ(ierr); 3230 idx = a->j; 3231 v = a->a; 3232 if (usecprow) { 3233 mbs = a->compressedrow.nrows; 3234 ii = a->compressedrow.i; 3235 ridx = a->compressedrow.rindex; 3236 } else { 3237 mbs = a->mbs; 3238 ii = a->i; 3239 z = c; 3240 } 3241 for (i=0; i<mbs; i++) { 3242 n = ii[1] - ii[0]; ii++; 3243 if (usecprow) z = c + bs*ridx[i]; 3244 if (n) { 3245 PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DZero,z,&bcm)); 3246 v += bs2; 3247 } 3248 for (j=1; j<n; j++) { 3249 PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DOne,z,&bcm)); 3250 v += bs2; 3251 } 3252 if (!usecprow) z += bs; 3253 } 3254 } 3255 ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr); 3256 ierr = PetscLogFlops((2.0*a->nz*bs2 - bs*a->nonzerorowcnt)*cn);CHKERRQ(ierr); 3257 PetscFunctionReturn(0); 3258 } 3259