1 2 #include <../src/mat/impls/baij/seq/baij.h> 3 #include <../src/mat/impls/dense/seq/dense.h> 4 #include <petsc/private/kernels/blockinvert.h> 5 #include <petscbt.h> 6 #include <petscblaslapack.h> 7 8 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 9 #include <immintrin.h> 10 #endif 11 12 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 13 { 14 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 15 PetscErrorCode ierr; 16 PetscInt row,i,j,k,l,m,n,*nidx,isz,val,ival; 17 const PetscInt *idx; 18 PetscInt start,end,*ai,*aj,bs,*nidx2; 19 PetscBT table; 20 21 PetscFunctionBegin; 22 m = a->mbs; 23 ai = a->i; 24 aj = a->j; 25 bs = A->rmap->bs; 26 27 if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 28 29 ierr = PetscBTCreate(m,&table);CHKERRQ(ierr); 30 ierr = PetscMalloc1(m+1,&nidx);CHKERRQ(ierr); 31 ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr); 32 33 for (i=0; i<is_max; i++) { 34 /* Initialise the two local arrays */ 35 isz = 0; 36 ierr = PetscBTMemzero(m,table);CHKERRQ(ierr); 37 38 /* Extract the indices, assume there can be duplicate entries */ 39 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 40 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 41 42 /* Enter these into the temp arrays i.e mark table[row], enter row into new index */ 43 for (j=0; j<n; ++j) { 44 ival = idx[j]/bs; /* convert the indices into block indices */ 45 if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 46 if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival; 47 } 48 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 49 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 50 51 k = 0; 52 for (j=0; j<ov; j++) { /* for each overlap*/ 53 n = isz; 54 for (; k<n; k++) { /* do only those rows in nidx[k], which are not done yet */ 55 row = nidx[k]; 56 start = ai[row]; 57 end = ai[row+1]; 58 for (l = start; l<end; l++) { 59 val = aj[l]; 60 if (!PetscBTLookupSet(table,val)) nidx[isz++] = val; 61 } 62 } 63 } 64 /* expand the Index Set */ 65 for (j=0; j<isz; j++) { 66 for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k; 67 } 68 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 69 } 70 ierr = PetscBTDestroy(&table);CHKERRQ(ierr); 71 ierr = PetscFree(nidx);CHKERRQ(ierr); 72 ierr = PetscFree(nidx2);CHKERRQ(ierr); 73 PetscFunctionReturn(0); 74 } 75 76 PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 77 { 78 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c; 79 PetscErrorCode ierr; 80 PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens; 81 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 82 const PetscInt *irow,*icol; 83 PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 84 PetscInt *aj = a->j,*ai = a->i; 85 MatScalar *mat_a; 86 Mat C; 87 PetscBool flag; 88 89 PetscFunctionBegin; 90 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 91 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 92 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 93 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 94 95 ierr = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr); 96 ssmap = smap; 97 ierr = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr); 98 for (i=0; i<ncols; i++) smap[icol[i]] = i+1; 99 /* determine lens of each row */ 100 for (i=0; i<nrows; i++) { 101 kstart = ai[irow[i]]; 102 kend = kstart + a->ilen[irow[i]]; 103 lens[i] = 0; 104 for (k=kstart; k<kend; k++) { 105 if (ssmap[aj[k]]) lens[i]++; 106 } 107 } 108 /* Create and fill new matrix */ 109 if (scall == MAT_REUSE_MATRIX) { 110 c = (Mat_SeqBAIJ*)((*B)->data); 111 112 if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 113 ierr = PetscArraycmp(c->ilen,lens,c->mbs,&flag);CHKERRQ(ierr); 114 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 115 ierr = PetscArrayzero(c->ilen,c->mbs);CHKERRQ(ierr); 116 C = *B; 117 } else { 118 ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr); 119 ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 120 ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 121 ierr = MatSeqBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr); 122 } 123 c = (Mat_SeqBAIJ*)(C->data); 124 for (i=0; i<nrows; i++) { 125 row = irow[i]; 126 kstart = ai[row]; 127 kend = kstart + a->ilen[row]; 128 mat_i = c->i[i]; 129 mat_j = c->j + mat_i; 130 mat_a = c->a + mat_i*bs2; 131 mat_ilen = c->ilen + i; 132 for (k=kstart; k<kend; k++) { 133 if ((tcol=ssmap[a->j[k]])) { 134 *mat_j++ = tcol - 1; 135 ierr = PetscArraycpy(mat_a,a->a+k*bs2,bs2);CHKERRQ(ierr); 136 mat_a += bs2; 137 (*mat_ilen)++; 138 } 139 } 140 } 141 /* sort */ 142 { 143 MatScalar *work; 144 ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr); 145 for (i=0; i<nrows; i++) { 146 PetscInt ilen; 147 mat_i = c->i[i]; 148 mat_j = c->j + mat_i; 149 mat_a = c->a + mat_i*bs2; 150 ilen = c->ilen[i]; 151 ierr = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr); 152 } 153 ierr = PetscFree(work);CHKERRQ(ierr); 154 } 155 156 /* Free work space */ 157 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 158 ierr = PetscFree(smap);CHKERRQ(ierr); 159 ierr = PetscFree(lens);CHKERRQ(ierr); 160 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 161 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 162 163 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 164 *B = C; 165 PetscFunctionReturn(0); 166 } 167 168 PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 169 { 170 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 171 IS is1,is2; 172 PetscErrorCode ierr; 173 PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs,j; 174 const PetscInt *irow,*icol; 175 176 PetscFunctionBegin; 177 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 178 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 179 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 180 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 181 182 /* Verify if the indices corespond to each element in a block 183 and form the IS with compressed IS */ 184 maxmnbs = PetscMax(a->mbs,a->nbs); 185 ierr = PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);CHKERRQ(ierr); 186 ierr = PetscArrayzero(vary,a->mbs);CHKERRQ(ierr); 187 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 188 for (i=0; i<a->mbs; i++) { 189 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks"); 190 } 191 count = 0; 192 for (i=0; i<nrows; i++) { 193 j = irow[i] / bs; 194 if ((vary[j]--)==bs) iary[count++] = j; 195 } 196 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr); 197 198 ierr = PetscArrayzero(vary,a->nbs);CHKERRQ(ierr); 199 for (i=0; i<ncols; i++) vary[icol[i]/bs]++; 200 for (i=0; i<a->nbs; i++) { 201 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc"); 202 } 203 count = 0; 204 for (i=0; i<ncols; i++) { 205 j = icol[i] / bs; 206 if ((vary[j]--)==bs) iary[count++] = j; 207 } 208 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr); 209 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 210 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 211 ierr = PetscFree2(vary,iary);CHKERRQ(ierr); 212 213 ierr = MatCreateSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr); 214 ierr = ISDestroy(&is1);CHKERRQ(ierr); 215 ierr = ISDestroy(&is2);CHKERRQ(ierr); 216 PetscFunctionReturn(0); 217 } 218 219 PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C) 220 { 221 PetscErrorCode ierr; 222 Mat_SeqBAIJ *c = (Mat_SeqBAIJ*)C->data; 223 Mat_SubSppt *submatj = c->submatis1; 224 225 PetscFunctionBegin; 226 ierr = (*submatj->destroy)(C);CHKERRQ(ierr); 227 ierr = MatDestroySubMatrix_Private(submatj);CHKERRQ(ierr); 228 PetscFunctionReturn(0); 229 } 230 231 PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n,Mat *mat[]) 232 { 233 PetscErrorCode ierr; 234 PetscInt i; 235 Mat C; 236 Mat_SeqBAIJ *c; 237 Mat_SubSppt *submatj; 238 239 PetscFunctionBegin; 240 for (i=0; i<n; i++) { 241 C = (*mat)[i]; 242 c = (Mat_SeqBAIJ*)C->data; 243 submatj = c->submatis1; 244 if (submatj) { 245 ierr = (*submatj->destroy)(C);CHKERRQ(ierr); 246 ierr = MatDestroySubMatrix_Private(submatj);CHKERRQ(ierr); 247 ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr); 248 ierr = PetscLayoutDestroy(&C->rmap);CHKERRQ(ierr); 249 ierr = PetscLayoutDestroy(&C->cmap);CHKERRQ(ierr); 250 ierr = PetscHeaderDestroy(&C);CHKERRQ(ierr); 251 } else { 252 ierr = MatDestroy(&C);CHKERRQ(ierr); 253 } 254 } 255 ierr = PetscFree(*mat);CHKERRQ(ierr); 256 PetscFunctionReturn(0); 257 } 258 259 PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 260 { 261 PetscErrorCode ierr; 262 PetscInt i; 263 264 PetscFunctionBegin; 265 if (scall == MAT_INITIAL_MATRIX) { 266 ierr = PetscCalloc1(n+1,B);CHKERRQ(ierr); 267 } 268 269 for (i=0; i<n; i++) { 270 ierr = MatCreateSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr); 271 } 272 PetscFunctionReturn(0); 273 } 274 275 /* -------------------------------------------------------*/ 276 /* Should check that shapes of vectors and matrices match */ 277 /* -------------------------------------------------------*/ 278 279 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz) 280 { 281 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 282 PetscScalar *z,sum; 283 const PetscScalar *x; 284 const MatScalar *v; 285 PetscErrorCode ierr; 286 PetscInt mbs,i,n; 287 const PetscInt *idx,*ii,*ridx=NULL; 288 PetscBool usecprow=a->compressedrow.use; 289 290 PetscFunctionBegin; 291 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 292 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 293 294 if (usecprow) { 295 mbs = a->compressedrow.nrows; 296 ii = a->compressedrow.i; 297 ridx = a->compressedrow.rindex; 298 ierr = PetscArrayzero(z,a->mbs);CHKERRQ(ierr); 299 } else { 300 mbs = a->mbs; 301 ii = a->i; 302 } 303 304 for (i=0; i<mbs; i++) { 305 n = ii[1] - ii[0]; 306 v = a->a + ii[0]; 307 idx = a->j + ii[0]; 308 ii++; 309 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 310 PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 311 sum = 0.0; 312 PetscSparseDensePlusDot(sum,x,v,idx,n); 313 if (usecprow) { 314 z[ridx[i]] = sum; 315 } else { 316 z[i] = sum; 317 } 318 } 319 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 320 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 321 ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr); 322 PetscFunctionReturn(0); 323 } 324 325 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz) 326 { 327 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 328 PetscScalar *z = NULL,sum1,sum2,*zarray; 329 const PetscScalar *x,*xb; 330 PetscScalar x1,x2; 331 const MatScalar *v; 332 PetscErrorCode ierr; 333 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 334 PetscBool usecprow=a->compressedrow.use; 335 336 PetscFunctionBegin; 337 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 338 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 339 340 idx = a->j; 341 v = a->a; 342 if (usecprow) { 343 mbs = a->compressedrow.nrows; 344 ii = a->compressedrow.i; 345 ridx = a->compressedrow.rindex; 346 ierr = PetscArrayzero(zarray,2*a->mbs);CHKERRQ(ierr); 347 } else { 348 mbs = a->mbs; 349 ii = a->i; 350 z = zarray; 351 } 352 353 for (i=0; i<mbs; i++) { 354 n = ii[1] - ii[0]; ii++; 355 sum1 = 0.0; sum2 = 0.0; 356 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 357 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 358 for (j=0; j<n; j++) { 359 xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1]; 360 sum1 += v[0]*x1 + v[2]*x2; 361 sum2 += v[1]*x1 + v[3]*x2; 362 v += 4; 363 } 364 if (usecprow) z = zarray + 2*ridx[i]; 365 z[0] = sum1; z[1] = sum2; 366 if (!usecprow) z += 2; 367 } 368 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 369 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 370 ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr); 371 PetscFunctionReturn(0); 372 } 373 374 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz) 375 { 376 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 377 PetscScalar *z = NULL,sum1,sum2,sum3,x1,x2,x3,*zarray; 378 const PetscScalar *x,*xb; 379 const MatScalar *v; 380 PetscErrorCode ierr; 381 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 382 PetscBool usecprow=a->compressedrow.use; 383 384 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 385 #pragma disjoint(*v,*z,*xb) 386 #endif 387 388 PetscFunctionBegin; 389 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 390 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 391 392 idx = a->j; 393 v = a->a; 394 if (usecprow) { 395 mbs = a->compressedrow.nrows; 396 ii = a->compressedrow.i; 397 ridx = a->compressedrow.rindex; 398 ierr = PetscArrayzero(zarray,3*a->mbs);CHKERRQ(ierr); 399 } else { 400 mbs = a->mbs; 401 ii = a->i; 402 z = zarray; 403 } 404 405 for (i=0; i<mbs; i++) { 406 n = ii[1] - ii[0]; ii++; 407 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 408 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 409 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 410 for (j=0; j<n; j++) { 411 xb = x + 3*(*idx++); 412 x1 = xb[0]; 413 x2 = xb[1]; 414 x3 = xb[2]; 415 416 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 417 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 418 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 419 v += 9; 420 } 421 if (usecprow) z = zarray + 3*ridx[i]; 422 z[0] = sum1; z[1] = sum2; z[2] = sum3; 423 if (!usecprow) z += 3; 424 } 425 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 426 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 427 ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr); 428 PetscFunctionReturn(0); 429 } 430 431 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz) 432 { 433 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 434 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray; 435 const PetscScalar *x,*xb; 436 const MatScalar *v; 437 PetscErrorCode ierr; 438 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 439 PetscBool usecprow=a->compressedrow.use; 440 441 PetscFunctionBegin; 442 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 443 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 444 445 idx = a->j; 446 v = a->a; 447 if (usecprow) { 448 mbs = a->compressedrow.nrows; 449 ii = a->compressedrow.i; 450 ridx = a->compressedrow.rindex; 451 ierr = PetscArrayzero(zarray,4*a->mbs);CHKERRQ(ierr); 452 } else { 453 mbs = a->mbs; 454 ii = a->i; 455 z = zarray; 456 } 457 458 for (i=0; i<mbs; i++) { 459 n = ii[1] - ii[0]; 460 ii++; 461 sum1 = 0.0; 462 sum2 = 0.0; 463 sum3 = 0.0; 464 sum4 = 0.0; 465 466 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 467 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 468 for (j=0; j<n; j++) { 469 xb = x + 4*(*idx++); 470 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 471 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 472 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 473 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 474 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 475 v += 16; 476 } 477 if (usecprow) z = zarray + 4*ridx[i]; 478 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 479 if (!usecprow) z += 4; 480 } 481 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 482 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 483 ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr); 484 PetscFunctionReturn(0); 485 } 486 487 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz) 488 { 489 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 490 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*zarray; 491 const PetscScalar *xb,*x; 492 const MatScalar *v; 493 PetscErrorCode ierr; 494 const PetscInt *idx,*ii,*ridx=NULL; 495 PetscInt mbs,i,j,n; 496 PetscBool usecprow=a->compressedrow.use; 497 498 PetscFunctionBegin; 499 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 500 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 501 502 idx = a->j; 503 v = a->a; 504 if (usecprow) { 505 mbs = a->compressedrow.nrows; 506 ii = a->compressedrow.i; 507 ridx = a->compressedrow.rindex; 508 ierr = PetscArrayzero(zarray,5*a->mbs);CHKERRQ(ierr); 509 } else { 510 mbs = a->mbs; 511 ii = a->i; 512 z = zarray; 513 } 514 515 for (i=0; i<mbs; i++) { 516 n = ii[1] - ii[0]; ii++; 517 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 518 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 519 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 520 for (j=0; j<n; j++) { 521 xb = x + 5*(*idx++); 522 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 523 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 524 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 525 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 526 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 527 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 528 v += 25; 529 } 530 if (usecprow) z = zarray + 5*ridx[i]; 531 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 532 if (!usecprow) z += 5; 533 } 534 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 535 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 536 ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr); 537 PetscFunctionReturn(0); 538 } 539 540 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz) 541 { 542 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 543 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6; 544 const PetscScalar *x,*xb; 545 PetscScalar x1,x2,x3,x4,x5,x6,*zarray; 546 const MatScalar *v; 547 PetscErrorCode ierr; 548 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 549 PetscBool usecprow=a->compressedrow.use; 550 551 PetscFunctionBegin; 552 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 553 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 554 555 idx = a->j; 556 v = a->a; 557 if (usecprow) { 558 mbs = a->compressedrow.nrows; 559 ii = a->compressedrow.i; 560 ridx = a->compressedrow.rindex; 561 ierr = PetscArrayzero(zarray,6*a->mbs);CHKERRQ(ierr); 562 } else { 563 mbs = a->mbs; 564 ii = a->i; 565 z = zarray; 566 } 567 568 for (i=0; i<mbs; i++) { 569 n = ii[1] - ii[0]; 570 ii++; 571 sum1 = 0.0; 572 sum2 = 0.0; 573 sum3 = 0.0; 574 sum4 = 0.0; 575 sum5 = 0.0; 576 sum6 = 0.0; 577 578 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 579 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 580 for (j=0; j<n; j++) { 581 xb = x + 6*(*idx++); 582 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 583 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 584 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 585 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 586 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 587 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 588 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 589 v += 36; 590 } 591 if (usecprow) z = zarray + 6*ridx[i]; 592 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 593 if (!usecprow) z += 6; 594 } 595 596 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 597 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 598 ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr); 599 PetscFunctionReturn(0); 600 } 601 602 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz) 603 { 604 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 605 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 606 const PetscScalar *x,*xb; 607 PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray; 608 const MatScalar *v; 609 PetscErrorCode ierr; 610 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 611 PetscBool usecprow=a->compressedrow.use; 612 613 PetscFunctionBegin; 614 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 615 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 616 617 idx = a->j; 618 v = a->a; 619 if (usecprow) { 620 mbs = a->compressedrow.nrows; 621 ii = a->compressedrow.i; 622 ridx = a->compressedrow.rindex; 623 ierr = PetscArrayzero(zarray,7*a->mbs);CHKERRQ(ierr); 624 } else { 625 mbs = a->mbs; 626 ii = a->i; 627 z = zarray; 628 } 629 630 for (i=0; i<mbs; i++) { 631 n = ii[1] - ii[0]; 632 ii++; 633 sum1 = 0.0; 634 sum2 = 0.0; 635 sum3 = 0.0; 636 sum4 = 0.0; 637 sum5 = 0.0; 638 sum6 = 0.0; 639 sum7 = 0.0; 640 641 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 642 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 643 for (j=0; j<n; j++) { 644 xb = x + 7*(*idx++); 645 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 646 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 647 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 648 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 649 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 650 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 651 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 652 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 653 v += 49; 654 } 655 if (usecprow) z = zarray + 7*ridx[i]; 656 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 657 if (!usecprow) z += 7; 658 } 659 660 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 661 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 662 ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr); 663 PetscFunctionReturn(0); 664 } 665 666 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 667 PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec zz) 668 { 669 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 670 PetscScalar *z = NULL,*work,*workt,*zarray; 671 const PetscScalar *x,*xb; 672 const MatScalar *v; 673 PetscErrorCode ierr; 674 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 675 const PetscInt *idx,*ii,*ridx=NULL; 676 PetscInt k; 677 PetscBool usecprow=a->compressedrow.use; 678 679 __m256d a0,a1,a2,a3,a4,a5; 680 __m256d w0,w1,w2,w3; 681 __m256d z0,z1,z2; 682 __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63); 683 684 PetscFunctionBegin; 685 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 686 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 687 688 idx = a->j; 689 v = a->a; 690 if (usecprow) { 691 mbs = a->compressedrow.nrows; 692 ii = a->compressedrow.i; 693 ridx = a->compressedrow.rindex; 694 ierr = PetscArrayzero(zarray,bs*a->mbs);CHKERRQ(ierr); 695 } else { 696 mbs = a->mbs; 697 ii = a->i; 698 z = zarray; 699 } 700 701 if (!a->mult_work) { 702 k = PetscMax(A->rmap->n,A->cmap->n); 703 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 704 } 705 706 work = a->mult_work; 707 for (i=0; i<mbs; i++) { 708 n = ii[1] - ii[0]; ii++; 709 workt = work; 710 for (j=0; j<n; j++) { 711 xb = x + bs*(*idx++); 712 for (k=0; k<bs; k++) workt[k] = xb[k]; 713 workt += bs; 714 } 715 if (usecprow) z = zarray + bs*ridx[i]; 716 717 z0 = _mm256_setzero_pd(); z1 = _mm256_setzero_pd(); z2 = _mm256_setzero_pd(); 718 719 for (j=0; j<n; j++) { 720 /* first column of a */ 721 w0 = _mm256_set1_pd(work[j*9 ]); 722 a0 = _mm256_loadu_pd(&v[j*81 ]); z0 = _mm256_fmadd_pd(a0,w0,z0); 723 a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1); 724 a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2); 725 726 /* second column of a */ 727 w1 = _mm256_set1_pd(work[j*9+ 1]); 728 a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0); 729 a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1); 730 a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2); 731 732 /* third column of a */ 733 w2 = _mm256_set1_pd(work[j*9 +2]); 734 a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0); 735 a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1); 736 a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2); 737 738 /* fourth column of a */ 739 w3 = _mm256_set1_pd(work[j*9+ 3]); 740 a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0); 741 a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1); 742 a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2); 743 744 /* fifth column of a */ 745 w0 = _mm256_set1_pd(work[j*9+ 4]); 746 a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0); 747 a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1); 748 a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2); 749 750 /* sixth column of a */ 751 w1 = _mm256_set1_pd(work[j*9+ 5]); 752 a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0); 753 a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1); 754 a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2); 755 756 /* seventh column of a */ 757 w2 = _mm256_set1_pd(work[j*9+ 6]); 758 a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0); 759 a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1); 760 a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2); 761 762 /* eigth column of a */ 763 w3 = _mm256_set1_pd(work[j*9+ 7]); 764 a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0); 765 a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1); 766 a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2); 767 768 /* ninth column of a */ 769 w0 = _mm256_set1_pd(work[j*9+ 8]); 770 a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0); 771 a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1); 772 a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2); 773 } 774 775 _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2); 776 777 v += n*bs2; 778 if (!usecprow) z += bs; 779 } 780 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 781 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 782 ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr); 783 PetscFunctionReturn(0); 784 } 785 #endif 786 787 PetscErrorCode MatMult_SeqBAIJ_11(Mat A,Vec xx,Vec zz) 788 { 789 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 790 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11; 791 const PetscScalar *x,*xb; 792 PetscScalar *zarray,xv; 793 const MatScalar *v; 794 PetscErrorCode ierr; 795 const PetscInt *ii,*ij=a->j,*idx; 796 PetscInt mbs,i,j,k,n,*ridx=NULL; 797 PetscBool usecprow=a->compressedrow.use; 798 799 PetscFunctionBegin; 800 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 801 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 802 803 v = a->a; 804 if (usecprow) { 805 mbs = a->compressedrow.nrows; 806 ii = a->compressedrow.i; 807 ridx = a->compressedrow.rindex; 808 ierr = PetscArrayzero(zarray,11*a->mbs);CHKERRQ(ierr); 809 } else { 810 mbs = a->mbs; 811 ii = a->i; 812 z = zarray; 813 } 814 815 for (i=0; i<mbs; i++) { 816 n = ii[i+1] - ii[i]; 817 idx = ij + ii[i]; 818 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 819 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; 820 821 for (j=0; j<n; j++) { 822 xb = x + 11*(idx[j]); 823 824 for (k=0; k<11; k++) { 825 xv = xb[k]; 826 sum1 += v[0]*xv; 827 sum2 += v[1]*xv; 828 sum3 += v[2]*xv; 829 sum4 += v[3]*xv; 830 sum5 += v[4]*xv; 831 sum6 += v[5]*xv; 832 sum7 += v[6]*xv; 833 sum8 += v[7]*xv; 834 sum9 += v[8]*xv; 835 sum10 += v[9]*xv; 836 sum11 += v[10]*xv; 837 v += 11; 838 } 839 } 840 if (usecprow) z = zarray + 11*ridx[i]; 841 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 842 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; 843 844 if (!usecprow) z += 11; 845 } 846 847 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 848 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 849 ierr = PetscLogFlops(242.0*a->nz - 11.0*a->nonzerorowcnt);CHKERRQ(ierr); 850 PetscFunctionReturn(0); 851 } 852 853 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */ 854 /* Default MatMult for block size 15 */ 855 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz) 856 { 857 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 858 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 859 const PetscScalar *x,*xb; 860 PetscScalar *zarray,xv; 861 const MatScalar *v; 862 PetscErrorCode ierr; 863 const PetscInt *ii,*ij=a->j,*idx; 864 PetscInt mbs,i,j,k,n,*ridx=NULL; 865 PetscBool usecprow=a->compressedrow.use; 866 867 PetscFunctionBegin; 868 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 869 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 870 871 v = a->a; 872 if (usecprow) { 873 mbs = a->compressedrow.nrows; 874 ii = a->compressedrow.i; 875 ridx = a->compressedrow.rindex; 876 ierr = PetscArrayzero(zarray,15*a->mbs);CHKERRQ(ierr); 877 } else { 878 mbs = a->mbs; 879 ii = a->i; 880 z = zarray; 881 } 882 883 for (i=0; i<mbs; i++) { 884 n = ii[i+1] - ii[i]; 885 idx = ij + ii[i]; 886 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 887 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 888 889 for (j=0; j<n; j++) { 890 xb = x + 15*(idx[j]); 891 892 for (k=0; k<15; k++) { 893 xv = xb[k]; 894 sum1 += v[0]*xv; 895 sum2 += v[1]*xv; 896 sum3 += v[2]*xv; 897 sum4 += v[3]*xv; 898 sum5 += v[4]*xv; 899 sum6 += v[5]*xv; 900 sum7 += v[6]*xv; 901 sum8 += v[7]*xv; 902 sum9 += v[8]*xv; 903 sum10 += v[9]*xv; 904 sum11 += v[10]*xv; 905 sum12 += v[11]*xv; 906 sum13 += v[12]*xv; 907 sum14 += v[13]*xv; 908 sum15 += v[14]*xv; 909 v += 15; 910 } 911 } 912 if (usecprow) z = zarray + 15*ridx[i]; 913 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 914 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 915 916 if (!usecprow) z += 15; 917 } 918 919 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 920 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 921 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 922 PetscFunctionReturn(0); 923 } 924 925 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */ 926 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz) 927 { 928 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 929 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 930 const PetscScalar *x,*xb; 931 PetscScalar x1,x2,x3,x4,*zarray; 932 const MatScalar *v; 933 PetscErrorCode ierr; 934 const PetscInt *ii,*ij=a->j,*idx; 935 PetscInt mbs,i,j,n,*ridx=NULL; 936 PetscBool usecprow=a->compressedrow.use; 937 938 PetscFunctionBegin; 939 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 940 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 941 942 v = a->a; 943 if (usecprow) { 944 mbs = a->compressedrow.nrows; 945 ii = a->compressedrow.i; 946 ridx = a->compressedrow.rindex; 947 ierr = PetscArrayzero(zarray,15*a->mbs);CHKERRQ(ierr); 948 } else { 949 mbs = a->mbs; 950 ii = a->i; 951 z = zarray; 952 } 953 954 for (i=0; i<mbs; i++) { 955 n = ii[i+1] - ii[i]; 956 idx = ij + ii[i]; 957 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 958 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 959 960 for (j=0; j<n; j++) { 961 xb = x + 15*(idx[j]); 962 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 963 964 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 965 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 966 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 967 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 968 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 969 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 970 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 971 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 972 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 973 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 974 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 975 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 976 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 977 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 978 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 979 980 v += 60; 981 982 x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7]; 983 984 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 985 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 986 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 987 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 988 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 989 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 990 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 991 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 992 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 993 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 994 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 995 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 996 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 997 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 998 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 999 v += 60; 1000 1001 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; 1002 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 1003 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 1004 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 1005 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 1006 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 1007 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 1008 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 1009 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 1010 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 1011 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 1012 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 1013 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 1014 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 1015 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 1016 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 1017 v += 60; 1018 1019 x1 = xb[12]; x2 = xb[13]; x3 = xb[14]; 1020 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3; 1021 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3; 1022 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3; 1023 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3; 1024 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3; 1025 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3; 1026 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3; 1027 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3; 1028 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3; 1029 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3; 1030 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3; 1031 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3; 1032 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3; 1033 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3; 1034 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3; 1035 v += 45; 1036 } 1037 if (usecprow) z = zarray + 15*ridx[i]; 1038 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1039 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1040 1041 if (!usecprow) z += 15; 1042 } 1043 1044 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1045 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1046 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 1047 PetscFunctionReturn(0); 1048 } 1049 1050 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */ 1051 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz) 1052 { 1053 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1054 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1055 const PetscScalar *x,*xb; 1056 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,*zarray; 1057 const MatScalar *v; 1058 PetscErrorCode ierr; 1059 const PetscInt *ii,*ij=a->j,*idx; 1060 PetscInt mbs,i,j,n,*ridx=NULL; 1061 PetscBool usecprow=a->compressedrow.use; 1062 1063 PetscFunctionBegin; 1064 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1065 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1066 1067 v = a->a; 1068 if (usecprow) { 1069 mbs = a->compressedrow.nrows; 1070 ii = a->compressedrow.i; 1071 ridx = a->compressedrow.rindex; 1072 ierr = PetscArrayzero(zarray,15*a->mbs);CHKERRQ(ierr); 1073 } else { 1074 mbs = a->mbs; 1075 ii = a->i; 1076 z = zarray; 1077 } 1078 1079 for (i=0; i<mbs; i++) { 1080 n = ii[i+1] - ii[i]; 1081 idx = ij + ii[i]; 1082 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1083 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1084 1085 for (j=0; j<n; j++) { 1086 xb = x + 15*(idx[j]); 1087 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1088 x8 = xb[7]; 1089 1090 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8; 1091 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8; 1092 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8; 1093 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8; 1094 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8; 1095 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8; 1096 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8; 1097 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8; 1098 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8; 1099 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8; 1100 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8; 1101 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8; 1102 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8; 1103 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8; 1104 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8; 1105 v += 120; 1106 1107 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14]; 1108 1109 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7; 1110 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7; 1111 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7; 1112 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7; 1113 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7; 1114 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7; 1115 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7; 1116 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7; 1117 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7; 1118 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7; 1119 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7; 1120 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7; 1121 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7; 1122 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7; 1123 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7; 1124 v += 105; 1125 } 1126 if (usecprow) z = zarray + 15*ridx[i]; 1127 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1128 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1129 1130 if (!usecprow) z += 15; 1131 } 1132 1133 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1134 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1135 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 1136 PetscFunctionReturn(0); 1137 } 1138 1139 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */ 1140 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz) 1141 { 1142 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1143 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1144 const PetscScalar *x,*xb; 1145 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray; 1146 const MatScalar *v; 1147 PetscErrorCode ierr; 1148 const PetscInt *ii,*ij=a->j,*idx; 1149 PetscInt mbs,i,j,n,*ridx=NULL; 1150 PetscBool usecprow=a->compressedrow.use; 1151 1152 PetscFunctionBegin; 1153 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1154 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1155 1156 v = a->a; 1157 if (usecprow) { 1158 mbs = a->compressedrow.nrows; 1159 ii = a->compressedrow.i; 1160 ridx = a->compressedrow.rindex; 1161 ierr = PetscArrayzero(zarray,15*a->mbs);CHKERRQ(ierr); 1162 } else { 1163 mbs = a->mbs; 1164 ii = a->i; 1165 z = zarray; 1166 } 1167 1168 for (i=0; i<mbs; i++) { 1169 n = ii[i+1] - ii[i]; 1170 idx = ij + ii[i]; 1171 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1172 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1173 1174 for (j=0; j<n; j++) { 1175 xb = x + 15*(idx[j]); 1176 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1177 x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14]; 1178 1179 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 1180 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 1181 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 1182 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 1183 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 1184 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 1185 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 1186 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 1187 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 1188 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 1189 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 1190 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 1191 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 1192 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 1193 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 1194 v += 225; 1195 } 1196 if (usecprow) z = zarray + 15*ridx[i]; 1197 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1198 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1199 1200 if (!usecprow) z += 15; 1201 } 1202 1203 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1204 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1205 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 1206 PetscFunctionReturn(0); 1207 } 1208 1209 /* 1210 This will not work with MatScalar == float because it calls the BLAS 1211 */ 1212 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz) 1213 { 1214 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1215 PetscScalar *z = NULL,*work,*workt,*zarray; 1216 const PetscScalar *x,*xb; 1217 const MatScalar *v; 1218 PetscErrorCode ierr; 1219 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 1220 const PetscInt *idx,*ii,*ridx=NULL; 1221 PetscInt ncols,k; 1222 PetscBool usecprow=a->compressedrow.use; 1223 1224 PetscFunctionBegin; 1225 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1226 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1227 1228 idx = a->j; 1229 v = a->a; 1230 if (usecprow) { 1231 mbs = a->compressedrow.nrows; 1232 ii = a->compressedrow.i; 1233 ridx = a->compressedrow.rindex; 1234 ierr = PetscArrayzero(zarray,bs*a->mbs);CHKERRQ(ierr); 1235 } else { 1236 mbs = a->mbs; 1237 ii = a->i; 1238 z = zarray; 1239 } 1240 1241 if (!a->mult_work) { 1242 k = PetscMax(A->rmap->n,A->cmap->n); 1243 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 1244 } 1245 work = a->mult_work; 1246 for (i=0; i<mbs; i++) { 1247 n = ii[1] - ii[0]; ii++; 1248 ncols = n*bs; 1249 workt = work; 1250 for (j=0; j<n; j++) { 1251 xb = x + bs*(*idx++); 1252 for (k=0; k<bs; k++) workt[k] = xb[k]; 1253 workt += bs; 1254 } 1255 if (usecprow) z = zarray + bs*ridx[i]; 1256 PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z); 1257 v += n*bs2; 1258 if (!usecprow) z += bs; 1259 } 1260 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1261 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1262 ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr); 1263 PetscFunctionReturn(0); 1264 } 1265 1266 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 1267 { 1268 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1269 const PetscScalar *x; 1270 PetscScalar *y,*z,sum; 1271 const MatScalar *v; 1272 PetscErrorCode ierr; 1273 PetscInt mbs=a->mbs,i,n,*ridx=NULL; 1274 const PetscInt *idx,*ii; 1275 PetscBool usecprow=a->compressedrow.use; 1276 1277 PetscFunctionBegin; 1278 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1279 ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr); 1280 1281 idx = a->j; 1282 v = a->a; 1283 if (usecprow) { 1284 if (zz != yy) { 1285 ierr = PetscArraycpy(z,y,mbs);CHKERRQ(ierr); 1286 } 1287 mbs = a->compressedrow.nrows; 1288 ii = a->compressedrow.i; 1289 ridx = a->compressedrow.rindex; 1290 } else { 1291 ii = a->i; 1292 } 1293 1294 for (i=0; i<mbs; i++) { 1295 n = ii[1] - ii[0]; 1296 ii++; 1297 if (!usecprow) { 1298 sum = y[i]; 1299 } else { 1300 sum = y[ridx[i]]; 1301 } 1302 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1303 PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1304 PetscSparseDensePlusDot(sum,x,v,idx,n); 1305 v += n; 1306 idx += n; 1307 if (usecprow) { 1308 z[ridx[i]] = sum; 1309 } else { 1310 z[i] = sum; 1311 } 1312 } 1313 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1314 ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr); 1315 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1316 PetscFunctionReturn(0); 1317 } 1318 1319 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 1320 { 1321 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1322 PetscScalar *y = NULL,*z = NULL,sum1,sum2; 1323 const PetscScalar *x,*xb; 1324 PetscScalar x1,x2,*yarray,*zarray; 1325 const MatScalar *v; 1326 PetscErrorCode ierr; 1327 PetscInt mbs = a->mbs,i,n,j; 1328 const PetscInt *idx,*ii,*ridx = NULL; 1329 PetscBool usecprow = a->compressedrow.use; 1330 1331 PetscFunctionBegin; 1332 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1333 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1334 1335 idx = a->j; 1336 v = a->a; 1337 if (usecprow) { 1338 if (zz != yy) { 1339 ierr = PetscArraycpy(zarray,yarray,2*mbs);CHKERRQ(ierr); 1340 } 1341 mbs = a->compressedrow.nrows; 1342 ii = a->compressedrow.i; 1343 ridx = a->compressedrow.rindex; 1344 } else { 1345 ii = a->i; 1346 y = yarray; 1347 z = zarray; 1348 } 1349 1350 for (i=0; i<mbs; i++) { 1351 n = ii[1] - ii[0]; ii++; 1352 if (usecprow) { 1353 z = zarray + 2*ridx[i]; 1354 y = yarray + 2*ridx[i]; 1355 } 1356 sum1 = y[0]; sum2 = y[1]; 1357 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1358 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1359 for (j=0; j<n; j++) { 1360 xb = x + 2*(*idx++); 1361 x1 = xb[0]; 1362 x2 = xb[1]; 1363 1364 sum1 += v[0]*x1 + v[2]*x2; 1365 sum2 += v[1]*x1 + v[3]*x2; 1366 v += 4; 1367 } 1368 z[0] = sum1; z[1] = sum2; 1369 if (!usecprow) { 1370 z += 2; y += 2; 1371 } 1372 } 1373 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1374 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1375 ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr); 1376 PetscFunctionReturn(0); 1377 } 1378 1379 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 1380 { 1381 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1382 PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray; 1383 const PetscScalar *x,*xb; 1384 const MatScalar *v; 1385 PetscErrorCode ierr; 1386 PetscInt mbs = a->mbs,i,j,n; 1387 const PetscInt *idx,*ii,*ridx = NULL; 1388 PetscBool usecprow = a->compressedrow.use; 1389 1390 PetscFunctionBegin; 1391 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1392 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1393 1394 idx = a->j; 1395 v = a->a; 1396 if (usecprow) { 1397 if (zz != yy) { 1398 ierr = PetscArraycpy(zarray,yarray,3*mbs);CHKERRQ(ierr); 1399 } 1400 mbs = a->compressedrow.nrows; 1401 ii = a->compressedrow.i; 1402 ridx = a->compressedrow.rindex; 1403 } else { 1404 ii = a->i; 1405 y = yarray; 1406 z = zarray; 1407 } 1408 1409 for (i=0; i<mbs; i++) { 1410 n = ii[1] - ii[0]; ii++; 1411 if (usecprow) { 1412 z = zarray + 3*ridx[i]; 1413 y = yarray + 3*ridx[i]; 1414 } 1415 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; 1416 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1417 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1418 for (j=0; j<n; j++) { 1419 xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1420 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 1421 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 1422 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 1423 v += 9; 1424 } 1425 z[0] = sum1; z[1] = sum2; z[2] = sum3; 1426 if (!usecprow) { 1427 z += 3; y += 3; 1428 } 1429 } 1430 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1431 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1432 ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr); 1433 PetscFunctionReturn(0); 1434 } 1435 1436 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 1437 { 1438 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1439 PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray; 1440 const PetscScalar *x,*xb; 1441 const MatScalar *v; 1442 PetscErrorCode ierr; 1443 PetscInt mbs = a->mbs,i,j,n; 1444 const PetscInt *idx,*ii,*ridx=NULL; 1445 PetscBool usecprow=a->compressedrow.use; 1446 1447 PetscFunctionBegin; 1448 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1449 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1450 1451 idx = a->j; 1452 v = a->a; 1453 if (usecprow) { 1454 if (zz != yy) { 1455 ierr = PetscArraycpy(zarray,yarray,4*mbs);CHKERRQ(ierr); 1456 } 1457 mbs = a->compressedrow.nrows; 1458 ii = a->compressedrow.i; 1459 ridx = a->compressedrow.rindex; 1460 } else { 1461 ii = a->i; 1462 y = yarray; 1463 z = zarray; 1464 } 1465 1466 for (i=0; i<mbs; i++) { 1467 n = ii[1] - ii[0]; ii++; 1468 if (usecprow) { 1469 z = zarray + 4*ridx[i]; 1470 y = yarray + 4*ridx[i]; 1471 } 1472 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; 1473 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1474 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1475 for (j=0; j<n; j++) { 1476 xb = x + 4*(*idx++); 1477 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1478 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1479 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1480 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1481 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1482 v += 16; 1483 } 1484 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 1485 if (!usecprow) { 1486 z += 4; y += 4; 1487 } 1488 } 1489 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1490 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1491 ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr); 1492 PetscFunctionReturn(0); 1493 } 1494 1495 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 1496 { 1497 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1498 PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5; 1499 const PetscScalar *x,*xb; 1500 PetscScalar *yarray,*zarray; 1501 const MatScalar *v; 1502 PetscErrorCode ierr; 1503 PetscInt mbs = a->mbs,i,j,n; 1504 const PetscInt *idx,*ii,*ridx = NULL; 1505 PetscBool usecprow=a->compressedrow.use; 1506 1507 PetscFunctionBegin; 1508 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1509 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1510 1511 idx = a->j; 1512 v = a->a; 1513 if (usecprow) { 1514 if (zz != yy) { 1515 ierr = PetscArraycpy(zarray,yarray,5*mbs);CHKERRQ(ierr); 1516 } 1517 mbs = a->compressedrow.nrows; 1518 ii = a->compressedrow.i; 1519 ridx = a->compressedrow.rindex; 1520 } else { 1521 ii = a->i; 1522 y = yarray; 1523 z = zarray; 1524 } 1525 1526 for (i=0; i<mbs; i++) { 1527 n = ii[1] - ii[0]; ii++; 1528 if (usecprow) { 1529 z = zarray + 5*ridx[i]; 1530 y = yarray + 5*ridx[i]; 1531 } 1532 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; 1533 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1534 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1535 for (j=0; j<n; j++) { 1536 xb = x + 5*(*idx++); 1537 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 1538 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1539 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1540 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1541 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1542 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1543 v += 25; 1544 } 1545 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 1546 if (!usecprow) { 1547 z += 5; y += 5; 1548 } 1549 } 1550 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1551 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1552 ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr); 1553 PetscFunctionReturn(0); 1554 } 1555 1556 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 1557 { 1558 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1559 PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6; 1560 const PetscScalar *x,*xb; 1561 PetscScalar x1,x2,x3,x4,x5,x6,*yarray,*zarray; 1562 const MatScalar *v; 1563 PetscErrorCode ierr; 1564 PetscInt mbs = a->mbs,i,j,n; 1565 const PetscInt *idx,*ii,*ridx=NULL; 1566 PetscBool usecprow=a->compressedrow.use; 1567 1568 PetscFunctionBegin; 1569 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1570 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1571 1572 idx = a->j; 1573 v = a->a; 1574 if (usecprow) { 1575 if (zz != yy) { 1576 ierr = PetscArraycpy(zarray,yarray,6*mbs);CHKERRQ(ierr); 1577 } 1578 mbs = a->compressedrow.nrows; 1579 ii = a->compressedrow.i; 1580 ridx = a->compressedrow.rindex; 1581 } else { 1582 ii = a->i; 1583 y = yarray; 1584 z = zarray; 1585 } 1586 1587 for (i=0; i<mbs; i++) { 1588 n = ii[1] - ii[0]; ii++; 1589 if (usecprow) { 1590 z = zarray + 6*ridx[i]; 1591 y = yarray + 6*ridx[i]; 1592 } 1593 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; 1594 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1595 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1596 for (j=0; j<n; j++) { 1597 xb = x + 6*(*idx++); 1598 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 1599 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1600 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1601 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1602 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1603 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1604 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1605 v += 36; 1606 } 1607 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 1608 if (!usecprow) { 1609 z += 6; y += 6; 1610 } 1611 } 1612 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1613 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1614 ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr); 1615 PetscFunctionReturn(0); 1616 } 1617 1618 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 1619 { 1620 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1621 PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 1622 const PetscScalar *x,*xb; 1623 PetscScalar x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray; 1624 const MatScalar *v; 1625 PetscErrorCode ierr; 1626 PetscInt mbs = a->mbs,i,j,n; 1627 const PetscInt *idx,*ii,*ridx = NULL; 1628 PetscBool usecprow=a->compressedrow.use; 1629 1630 PetscFunctionBegin; 1631 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1632 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1633 1634 idx = a->j; 1635 v = a->a; 1636 if (usecprow) { 1637 if (zz != yy) { 1638 ierr = PetscArraycpy(zarray,yarray,7*mbs);CHKERRQ(ierr); 1639 } 1640 mbs = a->compressedrow.nrows; 1641 ii = a->compressedrow.i; 1642 ridx = a->compressedrow.rindex; 1643 } else { 1644 ii = a->i; 1645 y = yarray; 1646 z = zarray; 1647 } 1648 1649 for (i=0; i<mbs; i++) { 1650 n = ii[1] - ii[0]; ii++; 1651 if (usecprow) { 1652 z = zarray + 7*ridx[i]; 1653 y = yarray + 7*ridx[i]; 1654 } 1655 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 1656 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1657 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1658 for (j=0; j<n; j++) { 1659 xb = x + 7*(*idx++); 1660 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1661 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1662 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1663 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1664 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1665 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1666 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1667 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1668 v += 49; 1669 } 1670 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1671 if (!usecprow) { 1672 z += 7; y += 7; 1673 } 1674 } 1675 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1676 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1677 ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr); 1678 PetscFunctionReturn(0); 1679 } 1680 1681 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 1682 PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec yy,Vec zz) 1683 { 1684 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1685 PetscScalar *z = NULL,*work,*workt,*zarray; 1686 const PetscScalar *x,*xb; 1687 const MatScalar *v; 1688 PetscErrorCode ierr; 1689 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 1690 PetscInt k; 1691 PetscBool usecprow=a->compressedrow.use; 1692 const PetscInt *idx,*ii,*ridx=NULL; 1693 1694 __m256d a0,a1,a2,a3,a4,a5; 1695 __m256d w0,w1,w2,w3; 1696 __m256d z0,z1,z2; 1697 __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63); 1698 1699 PetscFunctionBegin; 1700 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 1701 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1702 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1703 1704 idx = a->j; 1705 v = a->a; 1706 if (usecprow) { 1707 mbs = a->compressedrow.nrows; 1708 ii = a->compressedrow.i; 1709 ridx = a->compressedrow.rindex; 1710 } else { 1711 mbs = a->mbs; 1712 ii = a->i; 1713 z = zarray; 1714 } 1715 1716 if (!a->mult_work) { 1717 k = PetscMax(A->rmap->n,A->cmap->n); 1718 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 1719 } 1720 1721 work = a->mult_work; 1722 for (i=0; i<mbs; i++) { 1723 n = ii[1] - ii[0]; ii++; 1724 workt = work; 1725 for (j=0; j<n; j++) { 1726 xb = x + bs*(*idx++); 1727 for (k=0; k<bs; k++) workt[k] = xb[k]; 1728 workt += bs; 1729 } 1730 if (usecprow) z = zarray + bs*ridx[i]; 1731 1732 z0 = _mm256_loadu_pd(&z[ 0]); z1 = _mm256_loadu_pd(&z[ 4]); z2 = _mm256_set1_pd(z[ 8]); 1733 1734 for (j=0; j<n; j++) { 1735 /* first column of a */ 1736 w0 = _mm256_set1_pd(work[j*9 ]); 1737 a0 = _mm256_loadu_pd(&v[j*81 ]); z0 = _mm256_fmadd_pd(a0,w0,z0); 1738 a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1); 1739 a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2); 1740 1741 /* second column of a */ 1742 w1 = _mm256_set1_pd(work[j*9+ 1]); 1743 a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0); 1744 a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1); 1745 a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2); 1746 1747 /* third column of a */ 1748 w2 = _mm256_set1_pd(work[j*9+ 2]); 1749 a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0); 1750 a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1); 1751 a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2); 1752 1753 /* fourth column of a */ 1754 w3 = _mm256_set1_pd(work[j*9+ 3]); 1755 a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0); 1756 a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1); 1757 a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2); 1758 1759 /* fifth column of a */ 1760 w0 = _mm256_set1_pd(work[j*9+ 4]); 1761 a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0); 1762 a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1); 1763 a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2); 1764 1765 /* sixth column of a */ 1766 w1 = _mm256_set1_pd(work[j*9+ 5]); 1767 a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0); 1768 a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1); 1769 a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2); 1770 1771 /* seventh column of a */ 1772 w2 = _mm256_set1_pd(work[j*9+ 6]); 1773 a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0); 1774 a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1); 1775 a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2); 1776 1777 /* eigth column of a */ 1778 w3 = _mm256_set1_pd(work[j*9+ 7]); 1779 a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0); 1780 a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1); 1781 a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2); 1782 1783 /* ninth column of a */ 1784 w0 = _mm256_set1_pd(work[j*9+ 8]); 1785 a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0); 1786 a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1); 1787 a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2); 1788 } 1789 1790 _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2); 1791 1792 v += n*bs2; 1793 if (!usecprow) z += bs; 1794 } 1795 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1796 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1797 ierr = PetscLogFlops(162.0*a->nz);CHKERRQ(ierr); 1798 PetscFunctionReturn(0); 1799 } 1800 #endif 1801 1802 PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A,Vec xx,Vec yy,Vec zz) 1803 { 1804 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1805 PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11; 1806 const PetscScalar *x,*xb; 1807 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,*yarray,*zarray; 1808 const MatScalar *v; 1809 PetscErrorCode ierr; 1810 PetscInt mbs = a->mbs,i,j,n; 1811 const PetscInt *idx,*ii,*ridx = NULL; 1812 PetscBool usecprow=a->compressedrow.use; 1813 1814 PetscFunctionBegin; 1815 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1816 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1817 1818 idx = a->j; 1819 v = a->a; 1820 if (usecprow) { 1821 if (zz != yy) { 1822 ierr = PetscArraycpy(zarray,yarray,7*mbs);CHKERRQ(ierr); 1823 } 1824 mbs = a->compressedrow.nrows; 1825 ii = a->compressedrow.i; 1826 ridx = a->compressedrow.rindex; 1827 } else { 1828 ii = a->i; 1829 y = yarray; 1830 z = zarray; 1831 } 1832 1833 for (i=0; i<mbs; i++) { 1834 n = ii[1] - ii[0]; ii++; 1835 if (usecprow) { 1836 z = zarray + 11*ridx[i]; 1837 y = yarray + 11*ridx[i]; 1838 } 1839 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 1840 sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10]; 1841 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1842 PetscPrefetchBlock(v+121*n,121*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1843 for (j=0; j<n; j++) { 1844 xb = x + 11*(*idx++); 1845 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; 1846 sum1 += v[ 0]*x1 + v[ 11]*x2 + v[2*11]*x3 + v[3*11]*x4 + v[4*11]*x5 + v[5*11]*x6 + v[6*11]*x7+ v[7*11]*x8 + v[8*11]*x9 + v[9*11]*x10 + v[10*11]*x11; 1847 sum2 += v[1+0]*x1 + v[1+11]*x2 + v[1+2*11]*x3 + v[1+3*11]*x4 + v[1+4*11]*x5 + v[1+5*11]*x6 + v[1+6*11]*x7+ v[1+7*11]*x8 + v[1+8*11]*x9 + v[1+9*11]*x10 + v[1+10*11]*x11; 1848 sum3 += v[2+0]*x1 + v[2+11]*x2 + v[2+2*11]*x3 + v[2+3*11]*x4 + v[2+4*11]*x5 + v[2+5*11]*x6 + v[2+6*11]*x7+ v[2+7*11]*x8 + v[2+8*11]*x9 + v[2+9*11]*x10 + v[2+10*11]*x11; 1849 sum4 += v[3+0]*x1 + v[3+11]*x2 + v[3+2*11]*x3 + v[3+3*11]*x4 + v[3+4*11]*x5 + v[3+5*11]*x6 + v[3+6*11]*x7+ v[3+7*11]*x8 + v[3+8*11]*x9 + v[3+9*11]*x10 + v[3+10*11]*x11; 1850 sum5 += v[4+0]*x1 + v[4+11]*x2 + v[4+2*11]*x3 + v[4+3*11]*x4 + v[4+4*11]*x5 + v[4+5*11]*x6 + v[4+6*11]*x7+ v[4+7*11]*x8 + v[4+8*11]*x9 + v[4+9*11]*x10 + v[4+10*11]*x11; 1851 sum6 += v[5+0]*x1 + v[5+11]*x2 + v[5+2*11]*x3 + v[5+3*11]*x4 + v[5+4*11]*x5 + v[5+5*11]*x6 + v[5+6*11]*x7+ v[5+7*11]*x8 + v[5+8*11]*x9 + v[5+9*11]*x10 + v[5+10*11]*x11; 1852 sum7 += v[6+0]*x1 + v[6+11]*x2 + v[6+2*11]*x3 + v[6+3*11]*x4 + v[6+4*11]*x5 + v[6+5*11]*x6 + v[6+6*11]*x7+ v[6+7*11]*x8 + v[6+8*11]*x9 + v[6+9*11]*x10 + v[6+10*11]*x11; 1853 sum8 += v[7+0]*x1 + v[7+11]*x2 + v[7+2*11]*x3 + v[7+3*11]*x4 + v[7+4*11]*x5 + v[7+5*11]*x6 + v[7+6*11]*x7+ v[7+7*11]*x8 + v[7+8*11]*x9 + v[7+9*11]*x10 + v[7+10*11]*x11; 1854 sum9 += v[8+0]*x1 + v[8+11]*x2 + v[8+2*11]*x3 + v[8+3*11]*x4 + v[8+4*11]*x5 + v[8+5*11]*x6 + v[8+6*11]*x7+ v[8+7*11]*x8 + v[8+8*11]*x9 + v[8+9*11]*x10 + v[8+10*11]*x11; 1855 sum10 += v[9+0]*x1 + v[9+11]*x2 + v[9+2*11]*x3 + v[9+3*11]*x4 + v[9+4*11]*x5 + v[9+5*11]*x6 + v[9+6*11]*x7+ v[9+7*11]*x8 + v[9+8*11]*x9 + v[9+9*11]*x10 + v[9+10*11]*x11; 1856 sum11 += v[10+0]*x1 + v[10+11]*x2 + v[10+2*11]*x3 + v[10+3*11]*x4 + v[10+4*11]*x5 + v[10+5*11]*x6 + v[10+6*11]*x7+ v[10+7*11]*x8 + v[10+8*11]*x9 + v[10+9*11]*x10 + v[10+10*11]*x11; 1857 v += 121; 1858 } 1859 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1860 z[7] = sum6; z[8] = sum7; z[9] = sum8; z[10] = sum9; z[11] = sum10; 1861 if (!usecprow) { 1862 z += 11; y += 11; 1863 } 1864 } 1865 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1866 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1867 ierr = PetscLogFlops(242.0*a->nz);CHKERRQ(ierr); 1868 PetscFunctionReturn(0); 1869 } 1870 1871 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1872 { 1873 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1874 PetscScalar *z = NULL,*work,*workt,*zarray; 1875 const PetscScalar *x,*xb; 1876 const MatScalar *v; 1877 PetscErrorCode ierr; 1878 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 1879 PetscInt ncols,k; 1880 const PetscInt *ridx = NULL,*idx,*ii; 1881 PetscBool usecprow = a->compressedrow.use; 1882 1883 PetscFunctionBegin; 1884 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 1885 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1886 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1887 1888 idx = a->j; 1889 v = a->a; 1890 if (usecprow) { 1891 mbs = a->compressedrow.nrows; 1892 ii = a->compressedrow.i; 1893 ridx = a->compressedrow.rindex; 1894 } else { 1895 mbs = a->mbs; 1896 ii = a->i; 1897 z = zarray; 1898 } 1899 1900 if (!a->mult_work) { 1901 k = PetscMax(A->rmap->n,A->cmap->n); 1902 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 1903 } 1904 work = a->mult_work; 1905 for (i=0; i<mbs; i++) { 1906 n = ii[1] - ii[0]; ii++; 1907 ncols = n*bs; 1908 workt = work; 1909 for (j=0; j<n; j++) { 1910 xb = x + bs*(*idx++); 1911 for (k=0; k<bs; k++) workt[k] = xb[k]; 1912 workt += bs; 1913 } 1914 if (usecprow) z = zarray + bs*ridx[i]; 1915 PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1916 v += n*bs2; 1917 if (!usecprow) z += bs; 1918 } 1919 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1920 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1921 ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr); 1922 PetscFunctionReturn(0); 1923 } 1924 1925 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 1926 { 1927 PetscScalar zero = 0.0; 1928 PetscErrorCode ierr; 1929 1930 PetscFunctionBegin; 1931 ierr = VecSet(zz,zero);CHKERRQ(ierr); 1932 ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 1933 PetscFunctionReturn(0); 1934 } 1935 1936 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 1937 { 1938 PetscScalar zero = 0.0; 1939 PetscErrorCode ierr; 1940 1941 PetscFunctionBegin; 1942 ierr = VecSet(zz,zero);CHKERRQ(ierr); 1943 ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 1944 PetscFunctionReturn(0); 1945 } 1946 1947 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 1948 { 1949 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1950 PetscScalar *z,x1,x2,x3,x4,x5; 1951 const PetscScalar *x,*xb = NULL; 1952 const MatScalar *v; 1953 PetscErrorCode ierr; 1954 PetscInt mbs,i,rval,bs=A->rmap->bs,j,n; 1955 const PetscInt *idx,*ii,*ib,*ridx = NULL; 1956 Mat_CompressedRow cprow = a->compressedrow; 1957 PetscBool usecprow = cprow.use; 1958 1959 PetscFunctionBegin; 1960 if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); } 1961 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1962 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1963 1964 idx = a->j; 1965 v = a->a; 1966 if (usecprow) { 1967 mbs = cprow.nrows; 1968 ii = cprow.i; 1969 ridx = cprow.rindex; 1970 } else { 1971 mbs=a->mbs; 1972 ii = a->i; 1973 xb = x; 1974 } 1975 1976 switch (bs) { 1977 case 1: 1978 for (i=0; i<mbs; i++) { 1979 if (usecprow) xb = x + ridx[i]; 1980 x1 = xb[0]; 1981 ib = idx + ii[0]; 1982 n = ii[1] - ii[0]; ii++; 1983 for (j=0; j<n; j++) { 1984 rval = ib[j]; 1985 z[rval] += PetscConj(*v) * x1; 1986 v++; 1987 } 1988 if (!usecprow) xb++; 1989 } 1990 break; 1991 case 2: 1992 for (i=0; i<mbs; i++) { 1993 if (usecprow) xb = x + 2*ridx[i]; 1994 x1 = xb[0]; x2 = xb[1]; 1995 ib = idx + ii[0]; 1996 n = ii[1] - ii[0]; ii++; 1997 for (j=0; j<n; j++) { 1998 rval = ib[j]*2; 1999 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2; 2000 z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2; 2001 v += 4; 2002 } 2003 if (!usecprow) xb += 2; 2004 } 2005 break; 2006 case 3: 2007 for (i=0; i<mbs; i++) { 2008 if (usecprow) xb = x + 3*ridx[i]; 2009 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2010 ib = idx + ii[0]; 2011 n = ii[1] - ii[0]; ii++; 2012 for (j=0; j<n; j++) { 2013 rval = ib[j]*3; 2014 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3; 2015 z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3; 2016 z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3; 2017 v += 9; 2018 } 2019 if (!usecprow) xb += 3; 2020 } 2021 break; 2022 case 4: 2023 for (i=0; i<mbs; i++) { 2024 if (usecprow) xb = x + 4*ridx[i]; 2025 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 2026 ib = idx + ii[0]; 2027 n = ii[1] - ii[0]; ii++; 2028 for (j=0; j<n; j++) { 2029 rval = ib[j]*4; 2030 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4; 2031 z[rval++] += PetscConj(v[4])*x1 + PetscConj(v[5])*x2 + PetscConj(v[6])*x3 + PetscConj(v[7])*x4; 2032 z[rval++] += PetscConj(v[8])*x1 + PetscConj(v[9])*x2 + PetscConj(v[10])*x3 + PetscConj(v[11])*x4; 2033 z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4; 2034 v += 16; 2035 } 2036 if (!usecprow) xb += 4; 2037 } 2038 break; 2039 case 5: 2040 for (i=0; i<mbs; i++) { 2041 if (usecprow) xb = x + 5*ridx[i]; 2042 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2043 x4 = xb[3]; x5 = xb[4]; 2044 ib = idx + ii[0]; 2045 n = ii[1] - ii[0]; ii++; 2046 for (j=0; j<n; j++) { 2047 rval = ib[j]*5; 2048 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4 + PetscConj(v[4])*x5; 2049 z[rval++] += PetscConj(v[5])*x1 + PetscConj(v[6])*x2 + PetscConj(v[7])*x3 + PetscConj(v[8])*x4 + PetscConj(v[9])*x5; 2050 z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5; 2051 z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5; 2052 z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5; 2053 v += 25; 2054 } 2055 if (!usecprow) xb += 5; 2056 } 2057 break; 2058 default: /* block sizes larger than 5 by 5 are handled by BLAS */ 2059 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet"); 2060 #if 0 2061 { 2062 PetscInt ncols,k,bs2=a->bs2; 2063 PetscScalar *work,*workt,zb; 2064 const PetscScalar *xtmp; 2065 if (!a->mult_work) { 2066 k = PetscMax(A->rmap->n,A->cmap->n); 2067 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 2068 } 2069 work = a->mult_work; 2070 xtmp = x; 2071 for (i=0; i<mbs; i++) { 2072 n = ii[1] - ii[0]; ii++; 2073 ncols = n*bs; 2074 ierr = PetscArrayzero(work,ncols);CHKERRQ(ierr); 2075 if (usecprow) xtmp = x + bs*ridx[i]; 2076 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 2077 v += n*bs2; 2078 if (!usecprow) xtmp += bs; 2079 workt = work; 2080 for (j=0; j<n; j++) { 2081 zb = z + bs*(*idx++); 2082 for (k=0; k<bs; k++) zb[k] += workt[k] ; 2083 workt += bs; 2084 } 2085 } 2086 } 2087 #endif 2088 } 2089 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 2090 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 2091 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 2092 PetscFunctionReturn(0); 2093 } 2094 2095 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 2096 { 2097 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2098 PetscScalar *zb,*z,x1,x2,x3,x4,x5; 2099 const PetscScalar *x,*xb = NULL; 2100 const MatScalar *v; 2101 PetscErrorCode ierr; 2102 PetscInt mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2; 2103 const PetscInt *idx,*ii,*ib,*ridx = NULL; 2104 Mat_CompressedRow cprow = a->compressedrow; 2105 PetscBool usecprow=cprow.use; 2106 2107 PetscFunctionBegin; 2108 if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); } 2109 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 2110 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 2111 2112 idx = a->j; 2113 v = a->a; 2114 if (usecprow) { 2115 mbs = cprow.nrows; 2116 ii = cprow.i; 2117 ridx = cprow.rindex; 2118 } else { 2119 mbs=a->mbs; 2120 ii = a->i; 2121 xb = x; 2122 } 2123 2124 switch (bs) { 2125 case 1: 2126 for (i=0; i<mbs; i++) { 2127 if (usecprow) xb = x + ridx[i]; 2128 x1 = xb[0]; 2129 ib = idx + ii[0]; 2130 n = ii[1] - ii[0]; ii++; 2131 for (j=0; j<n; j++) { 2132 rval = ib[j]; 2133 z[rval] += *v * x1; 2134 v++; 2135 } 2136 if (!usecprow) xb++; 2137 } 2138 break; 2139 case 2: 2140 for (i=0; i<mbs; i++) { 2141 if (usecprow) xb = x + 2*ridx[i]; 2142 x1 = xb[0]; x2 = xb[1]; 2143 ib = idx + ii[0]; 2144 n = ii[1] - ii[0]; ii++; 2145 for (j=0; j<n; j++) { 2146 rval = ib[j]*2; 2147 z[rval++] += v[0]*x1 + v[1]*x2; 2148 z[rval++] += v[2]*x1 + v[3]*x2; 2149 v += 4; 2150 } 2151 if (!usecprow) xb += 2; 2152 } 2153 break; 2154 case 3: 2155 for (i=0; i<mbs; i++) { 2156 if (usecprow) xb = x + 3*ridx[i]; 2157 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2158 ib = idx + ii[0]; 2159 n = ii[1] - ii[0]; ii++; 2160 for (j=0; j<n; j++) { 2161 rval = ib[j]*3; 2162 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3; 2163 z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3; 2164 z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3; 2165 v += 9; 2166 } 2167 if (!usecprow) xb += 3; 2168 } 2169 break; 2170 case 4: 2171 for (i=0; i<mbs; i++) { 2172 if (usecprow) xb = x + 4*ridx[i]; 2173 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 2174 ib = idx + ii[0]; 2175 n = ii[1] - ii[0]; ii++; 2176 for (j=0; j<n; j++) { 2177 rval = ib[j]*4; 2178 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 2179 z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 2180 z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 2181 z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 2182 v += 16; 2183 } 2184 if (!usecprow) xb += 4; 2185 } 2186 break; 2187 case 5: 2188 for (i=0; i<mbs; i++) { 2189 if (usecprow) xb = x + 5*ridx[i]; 2190 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2191 x4 = xb[3]; x5 = xb[4]; 2192 ib = idx + ii[0]; 2193 n = ii[1] - ii[0]; ii++; 2194 for (j=0; j<n; j++) { 2195 rval = ib[j]*5; 2196 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 2197 z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 2198 z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 2199 z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 2200 z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 2201 v += 25; 2202 } 2203 if (!usecprow) xb += 5; 2204 } 2205 break; 2206 default: { /* block sizes larger then 5 by 5 are handled by BLAS */ 2207 PetscInt ncols,k; 2208 PetscScalar *work,*workt; 2209 const PetscScalar *xtmp; 2210 if (!a->mult_work) { 2211 k = PetscMax(A->rmap->n,A->cmap->n); 2212 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 2213 } 2214 work = a->mult_work; 2215 xtmp = x; 2216 for (i=0; i<mbs; i++) { 2217 n = ii[1] - ii[0]; ii++; 2218 ncols = n*bs; 2219 ierr = PetscArrayzero(work,ncols);CHKERRQ(ierr); 2220 if (usecprow) xtmp = x + bs*ridx[i]; 2221 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 2222 v += n*bs2; 2223 if (!usecprow) xtmp += bs; 2224 workt = work; 2225 for (j=0; j<n; j++) { 2226 zb = z + bs*(*idx++); 2227 for (k=0; k<bs; k++) zb[k] += workt[k]; 2228 workt += bs; 2229 } 2230 } 2231 } 2232 } 2233 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 2234 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 2235 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 2236 PetscFunctionReturn(0); 2237 } 2238 2239 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha) 2240 { 2241 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data; 2242 PetscInt totalnz = a->bs2*a->nz; 2243 PetscScalar oalpha = alpha; 2244 PetscErrorCode ierr; 2245 PetscBLASInt one = 1,tnz; 2246 2247 PetscFunctionBegin; 2248 ierr = PetscBLASIntCast(totalnz,&tnz);CHKERRQ(ierr); 2249 PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one)); 2250 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 2251 PetscFunctionReturn(0); 2252 } 2253 2254 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm) 2255 { 2256 PetscErrorCode ierr; 2257 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2258 MatScalar *v = a->a; 2259 PetscReal sum = 0.0; 2260 PetscInt i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1; 2261 2262 PetscFunctionBegin; 2263 if (type == NORM_FROBENIUS) { 2264 #if defined(PETSC_USE_REAL___FP16) 2265 PetscBLASInt one = 1,cnt = bs2*nz; 2266 PetscStackCallBLAS("BLASnrm2",*norm = BLASnrm2_(&cnt,v,&one)); 2267 #else 2268 for (i=0; i<bs2*nz; i++) { 2269 sum += PetscRealPart(PetscConj(*v)*(*v)); v++; 2270 } 2271 #endif 2272 *norm = PetscSqrtReal(sum); 2273 ierr = PetscLogFlops(2.0*bs2*nz);CHKERRQ(ierr); 2274 } else if (type == NORM_1) { /* maximum column sum */ 2275 PetscReal *tmp; 2276 PetscInt *bcol = a->j; 2277 ierr = PetscCalloc1(A->cmap->n+1,&tmp);CHKERRQ(ierr); 2278 for (i=0; i<nz; i++) { 2279 for (j=0; j<bs; j++) { 2280 k1 = bs*(*bcol) + j; /* column index */ 2281 for (k=0; k<bs; k++) { 2282 tmp[k1] += PetscAbsScalar(*v); v++; 2283 } 2284 } 2285 bcol++; 2286 } 2287 *norm = 0.0; 2288 for (j=0; j<A->cmap->n; j++) { 2289 if (tmp[j] > *norm) *norm = tmp[j]; 2290 } 2291 ierr = PetscFree(tmp);CHKERRQ(ierr); 2292 ierr = PetscLogFlops(PetscMax(bs2*nz-1,0));CHKERRQ(ierr); 2293 } else if (type == NORM_INFINITY) { /* maximum row sum */ 2294 *norm = 0.0; 2295 for (k=0; k<bs; k++) { 2296 for (j=0; j<a->mbs; j++) { 2297 v = a->a + bs2*a->i[j] + k; 2298 sum = 0.0; 2299 for (i=0; i<a->i[j+1]-a->i[j]; i++) { 2300 for (k1=0; k1<bs; k1++) { 2301 sum += PetscAbsScalar(*v); 2302 v += bs; 2303 } 2304 } 2305 if (sum > *norm) *norm = sum; 2306 } 2307 } 2308 ierr = PetscLogFlops(PetscMax(bs2*nz-1,0));CHKERRQ(ierr); 2309 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet"); 2310 PetscFunctionReturn(0); 2311 } 2312 2313 2314 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg) 2315 { 2316 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data; 2317 PetscErrorCode ierr; 2318 2319 PetscFunctionBegin; 2320 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 2321 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 2322 *flg = PETSC_FALSE; 2323 PetscFunctionReturn(0); 2324 } 2325 2326 /* if the a->i are the same */ 2327 ierr = PetscArraycmp(a->i,b->i,a->mbs+1,flg);CHKERRQ(ierr); 2328 if (!*flg) PetscFunctionReturn(0); 2329 2330 /* if a->j are the same */ 2331 ierr = PetscArraycmp(a->j,b->j,a->nz,flg);CHKERRQ(ierr); 2332 if (!*flg) PetscFunctionReturn(0); 2333 2334 /* if a->a are the same */ 2335 ierr = PetscArraycmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs),flg);CHKERRQ(ierr); 2336 PetscFunctionReturn(0); 2337 2338 } 2339 2340 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v) 2341 { 2342 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2343 PetscErrorCode ierr; 2344 PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 2345 PetscScalar *x,zero = 0.0; 2346 MatScalar *aa,*aa_j; 2347 2348 PetscFunctionBegin; 2349 if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 2350 bs = A->rmap->bs; 2351 aa = a->a; 2352 ai = a->i; 2353 aj = a->j; 2354 ambs = a->mbs; 2355 bs2 = a->bs2; 2356 2357 ierr = VecSet(v,zero);CHKERRQ(ierr); 2358 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 2359 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 2360 if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 2361 for (i=0; i<ambs; i++) { 2362 for (j=ai[i]; j<ai[i+1]; j++) { 2363 if (aj[j] == i) { 2364 row = i*bs; 2365 aa_j = aa+j*bs2; 2366 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 2367 break; 2368 } 2369 } 2370 } 2371 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 2372 PetscFunctionReturn(0); 2373 } 2374 2375 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr) 2376 { 2377 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2378 const PetscScalar *l,*r,*li,*ri; 2379 PetscScalar x; 2380 MatScalar *aa, *v; 2381 PetscErrorCode ierr; 2382 PetscInt i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai; 2383 const PetscInt *ai,*aj; 2384 2385 PetscFunctionBegin; 2386 ai = a->i; 2387 aj = a->j; 2388 aa = a->a; 2389 m = A->rmap->n; 2390 n = A->cmap->n; 2391 bs = A->rmap->bs; 2392 mbs = a->mbs; 2393 bs2 = a->bs2; 2394 if (ll) { 2395 ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr); 2396 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 2397 if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 2398 for (i=0; i<mbs; i++) { /* for each block row */ 2399 M = ai[i+1] - ai[i]; 2400 li = l + i*bs; 2401 v = aa + bs2*ai[i]; 2402 for (j=0; j<M; j++) { /* for each block */ 2403 for (k=0; k<bs2; k++) { 2404 (*v++) *= li[k%bs]; 2405 } 2406 } 2407 } 2408 ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr); 2409 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2410 } 2411 2412 if (rr) { 2413 ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr); 2414 ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr); 2415 if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length"); 2416 for (i=0; i<mbs; i++) { /* for each block row */ 2417 iai = ai[i]; 2418 M = ai[i+1] - iai; 2419 v = aa + bs2*iai; 2420 for (j=0; j<M; j++) { /* for each block */ 2421 ri = r + bs*aj[iai+j]; 2422 for (k=0; k<bs; k++) { 2423 x = ri[k]; 2424 for (tmp=0; tmp<bs; tmp++) v[tmp] *= x; 2425 v += bs; 2426 } 2427 } 2428 } 2429 ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr); 2430 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2431 } 2432 PetscFunctionReturn(0); 2433 } 2434 2435 2436 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info) 2437 { 2438 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2439 2440 PetscFunctionBegin; 2441 info->block_size = a->bs2; 2442 info->nz_allocated = a->bs2*a->maxnz; 2443 info->nz_used = a->bs2*a->nz; 2444 info->nz_unneeded = info->nz_allocated - info->nz_used; 2445 info->assemblies = A->num_ass; 2446 info->mallocs = A->info.mallocs; 2447 info->memory = ((PetscObject)A)->mem; 2448 if (A->factortype) { 2449 info->fill_ratio_given = A->info.fill_ratio_given; 2450 info->fill_ratio_needed = A->info.fill_ratio_needed; 2451 info->factor_mallocs = A->info.factor_mallocs; 2452 } else { 2453 info->fill_ratio_given = 0; 2454 info->fill_ratio_needed = 0; 2455 info->factor_mallocs = 0; 2456 } 2457 PetscFunctionReturn(0); 2458 } 2459 2460 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) 2461 { 2462 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2463 PetscErrorCode ierr; 2464 2465 PetscFunctionBegin; 2466 ierr = PetscArrayzero(a->a,a->bs2*a->i[a->mbs]);CHKERRQ(ierr); 2467 PetscFunctionReturn(0); 2468 } 2469 2470 PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C) 2471 { 2472 PetscErrorCode ierr; 2473 2474 PetscFunctionBegin; 2475 ierr = MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);CHKERRQ(ierr); 2476 C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense; 2477 PetscFunctionReturn(0); 2478 } 2479 2480 PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 2481 { 2482 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2483 PetscScalar *z = NULL,sum1; 2484 const PetscScalar *xb; 2485 PetscScalar x1; 2486 const MatScalar *v,*vv; 2487 PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL; 2488 PetscBool usecprow=a->compressedrow.use; 2489 2490 PetscFunctionBegin; 2491 idx = a->j; 2492 v = a->a; 2493 if (usecprow) { 2494 mbs = a->compressedrow.nrows; 2495 ii = a->compressedrow.i; 2496 ridx = a->compressedrow.rindex; 2497 } else { 2498 mbs = a->mbs; 2499 ii = a->i; 2500 z = c; 2501 } 2502 2503 for (i=0; i<mbs; i++) { 2504 n = ii[1] - ii[0]; ii++; 2505 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2506 PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2507 if (usecprow) z = c + ridx[i]; 2508 jj = idx; 2509 vv = v; 2510 for (k=0; k<cn; k++) { 2511 idx = jj; 2512 v = vv; 2513 sum1 = 0.0; 2514 for (j=0; j<n; j++) { 2515 xb = b + (*idx++); x1 = xb[0+k*bm]; 2516 sum1 += v[0]*x1; 2517 v += 1; 2518 } 2519 z[0+k*cm] = sum1; 2520 } 2521 if (!usecprow) z += 1; 2522 } 2523 PetscFunctionReturn(0); 2524 } 2525 2526 PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 2527 { 2528 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2529 PetscScalar *z = NULL,sum1,sum2; 2530 const PetscScalar *xb; 2531 PetscScalar x1,x2; 2532 const MatScalar *v,*vv; 2533 PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL; 2534 PetscBool usecprow=a->compressedrow.use; 2535 2536 PetscFunctionBegin; 2537 idx = a->j; 2538 v = a->a; 2539 if (usecprow) { 2540 mbs = a->compressedrow.nrows; 2541 ii = a->compressedrow.i; 2542 ridx = a->compressedrow.rindex; 2543 } else { 2544 mbs = a->mbs; 2545 ii = a->i; 2546 z = c; 2547 } 2548 2549 for (i=0; i<mbs; i++) { 2550 n = ii[1] - ii[0]; ii++; 2551 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2552 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2553 if (usecprow) z = c + 2*ridx[i]; 2554 jj = idx; 2555 vv = v; 2556 for (k=0; k<cn; k++) { 2557 idx = jj; 2558 v = vv; 2559 sum1 = 0.0; sum2 = 0.0; 2560 for (j=0; j<n; j++) { 2561 xb = b + 2*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; 2562 sum1 += v[0]*x1 + v[2]*x2; 2563 sum2 += v[1]*x1 + v[3]*x2; 2564 v += 4; 2565 } 2566 z[0+k*cm] = sum1; z[1+k*cm] = sum2; 2567 } 2568 if (!usecprow) z += 2; 2569 } 2570 PetscFunctionReturn(0); 2571 } 2572 2573 PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 2574 { 2575 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2576 PetscScalar *z = NULL,sum1,sum2,sum3; 2577 const PetscScalar *xb; 2578 PetscScalar x1,x2,x3; 2579 const MatScalar *v,*vv; 2580 PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL; 2581 PetscBool usecprow=a->compressedrow.use; 2582 2583 PetscFunctionBegin; 2584 idx = a->j; 2585 v = a->a; 2586 if (usecprow) { 2587 mbs = a->compressedrow.nrows; 2588 ii = a->compressedrow.i; 2589 ridx = a->compressedrow.rindex; 2590 } else { 2591 mbs = a->mbs; 2592 ii = a->i; 2593 z = c; 2594 } 2595 2596 for (i=0; i<mbs; i++) { 2597 n = ii[1] - ii[0]; ii++; 2598 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2599 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2600 if (usecprow) z = c + 3*ridx[i]; 2601 jj = idx; 2602 vv = v; 2603 for (k=0; k<cn; k++) { 2604 idx = jj; 2605 v = vv; 2606 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 2607 for (j=0; j<n; j++) { 2608 xb = b + 3*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; 2609 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 2610 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 2611 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 2612 v += 9; 2613 } 2614 z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; 2615 } 2616 if (!usecprow) z += 3; 2617 } 2618 PetscFunctionReturn(0); 2619 } 2620 2621 PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 2622 { 2623 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2624 PetscScalar *z = NULL,sum1,sum2,sum3,sum4; 2625 const PetscScalar *xb; 2626 PetscScalar x1,x2,x3,x4; 2627 const MatScalar *v,*vv; 2628 PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL; 2629 PetscBool usecprow=a->compressedrow.use; 2630 2631 PetscFunctionBegin; 2632 idx = a->j; 2633 v = a->a; 2634 if (usecprow) { 2635 mbs = a->compressedrow.nrows; 2636 ii = a->compressedrow.i; 2637 ridx = a->compressedrow.rindex; 2638 } else { 2639 mbs = a->mbs; 2640 ii = a->i; 2641 z = c; 2642 } 2643 2644 for (i=0; i<mbs; i++) { 2645 n = ii[1] - ii[0]; ii++; 2646 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2647 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2648 if (usecprow) z = c + 4*ridx[i]; 2649 jj = idx; 2650 vv = v; 2651 for (k=0; k<cn; k++) { 2652 idx = jj; 2653 v = vv; 2654 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; 2655 for (j=0; j<n; j++) { 2656 xb = b + 4*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm]; 2657 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2658 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2659 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2660 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2661 v += 16; 2662 } 2663 z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; z[3+k*cm] = sum4; 2664 } 2665 if (!usecprow) z += 4; 2666 } 2667 PetscFunctionReturn(0); 2668 } 2669 2670 PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 2671 { 2672 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2673 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5; 2674 const PetscScalar *xb; 2675 PetscScalar x1,x2,x3,x4,x5; 2676 const MatScalar *v,*vv; 2677 PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL; 2678 PetscBool usecprow=a->compressedrow.use; 2679 2680 PetscFunctionBegin; 2681 idx = a->j; 2682 v = a->a; 2683 if (usecprow) { 2684 mbs = a->compressedrow.nrows; 2685 ii = a->compressedrow.i; 2686 ridx = a->compressedrow.rindex; 2687 } else { 2688 mbs = a->mbs; 2689 ii = a->i; 2690 z = c; 2691 } 2692 2693 for (i=0; i<mbs; i++) { 2694 n = ii[1] - ii[0]; ii++; 2695 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2696 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2697 if (usecprow) z = c + 5*ridx[i]; 2698 jj = idx; 2699 vv = v; 2700 for (k=0; k<cn; k++) { 2701 idx = jj; 2702 v = vv; 2703 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 2704 for (j=0; j<n; j++) { 2705 xb = b + 5*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm]; x5 = xb[4+k*bm]; 2706 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2707 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2708 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2709 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2710 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2711 v += 25; 2712 } 2713 z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; z[3+k*cm] = sum4; z[4+k*cm] = sum5; 2714 } 2715 if (!usecprow) z += 5; 2716 } 2717 PetscFunctionReturn(0); 2718 } 2719 2720 PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A,Mat B,Mat C) 2721 { 2722 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2723 Mat_SeqDense *bd = (Mat_SeqDense*)B->data; 2724 Mat_SeqDense *cd = (Mat_SeqDense*)C->data; 2725 PetscInt cm=cd->lda,cn=B->cmap->n,bm=bd->lda; 2726 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 2727 PetscBLASInt bbs,bcn,bbm,bcm; 2728 PetscScalar *z = NULL; 2729 PetscScalar *c,*b; 2730 const MatScalar *v; 2731 const PetscInt *idx,*ii,*ridx=NULL; 2732 PetscScalar _DZero=0.0,_DOne=1.0; 2733 PetscBool usecprow=a->compressedrow.use; 2734 PetscErrorCode ierr; 2735 2736 PetscFunctionBegin; 2737 if (!cm || !cn) PetscFunctionReturn(0); 2738 if (B->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,B->rmap->n); 2739 if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n); 2740 if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n); 2741 b = bd->v; 2742 if (a->nonzerorowcnt != A->rmap->n) { 2743 ierr = MatZeroEntries(C);CHKERRQ(ierr); 2744 } 2745 ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr); 2746 switch (bs) { 2747 case 1: 2748 ierr = MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn); 2749 break; 2750 case 2: 2751 ierr = MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn); 2752 break; 2753 case 3: 2754 ierr = MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn); 2755 break; 2756 case 4: 2757 ierr = MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn); 2758 break; 2759 case 5: 2760 ierr = MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn); 2761 break; 2762 default: /* block sizes larger than 5 by 5 are handled by BLAS */ 2763 ierr = PetscBLASIntCast(bs,&bbs);CHKERRQ(ierr); 2764 ierr = PetscBLASIntCast(cn,&bcn);CHKERRQ(ierr); 2765 ierr = PetscBLASIntCast(bm,&bbm);CHKERRQ(ierr); 2766 ierr = PetscBLASIntCast(cm,&bcm);CHKERRQ(ierr); 2767 idx = a->j; 2768 v = a->a; 2769 if (usecprow) { 2770 mbs = a->compressedrow.nrows; 2771 ii = a->compressedrow.i; 2772 ridx = a->compressedrow.rindex; 2773 } else { 2774 mbs = a->mbs; 2775 ii = a->i; 2776 z = c; 2777 } 2778 for (i=0; i<mbs; i++) { 2779 n = ii[1] - ii[0]; ii++; 2780 if (usecprow) z = c + bs*ridx[i]; 2781 if (n) { 2782 PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DZero,z,&bcm)); 2783 v += bs2; 2784 } 2785 for (j=1; j<n; j++) { 2786 PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DOne,z,&bcm)); 2787 v += bs2; 2788 } 2789 if (!usecprow) z += bs; 2790 } 2791 } 2792 ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr); 2793 ierr = PetscLogFlops((2.0*a->nz*bs2 - bs*a->nonzerorowcnt)*cn);CHKERRQ(ierr); 2794 PetscFunctionReturn(0); 2795 } 2796