1 2 #include <../src/mat/impls/baij/seq/baij.h> 3 #include <../src/mat/impls/dense/seq/dense.h> 4 #include <petsc/private/kernels/blockinvert.h> 5 #include <petscbt.h> 6 #include <petscblaslapack.h> 7 8 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 9 #include <immintrin.h> 10 #endif 11 12 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 13 { 14 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 15 PetscErrorCode ierr; 16 PetscInt row,i,j,k,l,m,n,*nidx,isz,val,ival; 17 const PetscInt *idx; 18 PetscInt start,end,*ai,*aj,bs,*nidx2; 19 PetscBT table; 20 21 PetscFunctionBegin; 22 m = a->mbs; 23 ai = a->i; 24 aj = a->j; 25 bs = A->rmap->bs; 26 27 if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 28 29 ierr = PetscBTCreate(m,&table);CHKERRQ(ierr); 30 ierr = PetscMalloc1(m+1,&nidx);CHKERRQ(ierr); 31 ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr); 32 33 for (i=0; i<is_max; i++) { 34 /* Initialise the two local arrays */ 35 isz = 0; 36 ierr = PetscBTMemzero(m,table);CHKERRQ(ierr); 37 38 /* Extract the indices, assume there can be duplicate entries */ 39 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 40 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 41 42 /* Enter these into the temp arrays i.e mark table[row], enter row into new index */ 43 for (j=0; j<n; ++j) { 44 ival = idx[j]/bs; /* convert the indices into block indices */ 45 if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 46 if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival; 47 } 48 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 49 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 50 51 k = 0; 52 for (j=0; j<ov; j++) { /* for each overlap*/ 53 n = isz; 54 for (; k<n; k++) { /* do only those rows in nidx[k], which are not done yet */ 55 row = nidx[k]; 56 start = ai[row]; 57 end = ai[row+1]; 58 for (l = start; l<end; l++) { 59 val = aj[l]; 60 if (!PetscBTLookupSet(table,val)) nidx[isz++] = val; 61 } 62 } 63 } 64 /* expand the Index Set */ 65 for (j=0; j<isz; j++) { 66 for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k; 67 } 68 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 69 } 70 ierr = PetscBTDestroy(&table);CHKERRQ(ierr); 71 ierr = PetscFree(nidx);CHKERRQ(ierr); 72 ierr = PetscFree(nidx2);CHKERRQ(ierr); 73 PetscFunctionReturn(0); 74 } 75 76 PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 77 { 78 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c; 79 PetscErrorCode ierr; 80 PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens; 81 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 82 const PetscInt *irow,*icol; 83 PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 84 PetscInt *aj = a->j,*ai = a->i; 85 MatScalar *mat_a; 86 Mat C; 87 PetscBool flag; 88 89 PetscFunctionBegin; 90 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 91 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 92 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 93 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 94 95 ierr = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr); 96 ssmap = smap; 97 ierr = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr); 98 for (i=0; i<ncols; i++) smap[icol[i]] = i+1; 99 /* determine lens of each row */ 100 for (i=0; i<nrows; i++) { 101 kstart = ai[irow[i]]; 102 kend = kstart + a->ilen[irow[i]]; 103 lens[i] = 0; 104 for (k=kstart; k<kend; k++) { 105 if (ssmap[aj[k]]) lens[i]++; 106 } 107 } 108 /* Create and fill new matrix */ 109 if (scall == MAT_REUSE_MATRIX) { 110 c = (Mat_SeqBAIJ*)((*B)->data); 111 112 if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 113 ierr = PetscArraycmp(c->ilen,lens,c->mbs,&flag);CHKERRQ(ierr); 114 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 115 ierr = PetscArrayzero(c->ilen,c->mbs);CHKERRQ(ierr); 116 C = *B; 117 } else { 118 ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr); 119 ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 120 ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 121 ierr = MatSeqBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr); 122 } 123 c = (Mat_SeqBAIJ*)(C->data); 124 for (i=0; i<nrows; i++) { 125 row = irow[i]; 126 kstart = ai[row]; 127 kend = kstart + a->ilen[row]; 128 mat_i = c->i[i]; 129 mat_j = c->j + mat_i; 130 mat_a = c->a + mat_i*bs2; 131 mat_ilen = c->ilen + i; 132 for (k=kstart; k<kend; k++) { 133 if ((tcol=ssmap[a->j[k]])) { 134 *mat_j++ = tcol - 1; 135 ierr = PetscArraycpy(mat_a,a->a+k*bs2,bs2);CHKERRQ(ierr); 136 mat_a += bs2; 137 (*mat_ilen)++; 138 } 139 } 140 } 141 /* sort */ 142 { 143 MatScalar *work; 144 ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr); 145 for (i=0; i<nrows; i++) { 146 PetscInt ilen; 147 mat_i = c->i[i]; 148 mat_j = c->j + mat_i; 149 mat_a = c->a + mat_i*bs2; 150 ilen = c->ilen[i]; 151 ierr = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr); 152 } 153 ierr = PetscFree(work);CHKERRQ(ierr); 154 } 155 156 /* Free work space */ 157 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 158 ierr = PetscFree(smap);CHKERRQ(ierr); 159 ierr = PetscFree(lens);CHKERRQ(ierr); 160 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 161 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 162 163 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 164 *B = C; 165 PetscFunctionReturn(0); 166 } 167 168 PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 169 { 170 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 171 IS is1,is2; 172 PetscErrorCode ierr; 173 PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs,j; 174 const PetscInt *irow,*icol; 175 176 PetscFunctionBegin; 177 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 178 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 179 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 180 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 181 182 /* Verify if the indices corespond to each element in a block 183 and form the IS with compressed IS */ 184 maxmnbs = PetscMax(a->mbs,a->nbs); 185 ierr = PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);CHKERRQ(ierr); 186 ierr = PetscArrayzero(vary,a->mbs);CHKERRQ(ierr); 187 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 188 for (i=0; i<a->mbs; i++) { 189 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks"); 190 } 191 count = 0; 192 for (i=0; i<nrows; i++) { 193 j = irow[i] / bs; 194 if ((vary[j]--)==bs) iary[count++] = j; 195 } 196 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr); 197 198 ierr = PetscArrayzero(vary,a->nbs);CHKERRQ(ierr); 199 for (i=0; i<ncols; i++) vary[icol[i]/bs]++; 200 for (i=0; i<a->nbs; i++) { 201 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc"); 202 } 203 count = 0; 204 for (i=0; i<ncols; i++) { 205 j = icol[i] / bs; 206 if ((vary[j]--)==bs) iary[count++] = j; 207 } 208 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr); 209 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 210 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 211 ierr = PetscFree2(vary,iary);CHKERRQ(ierr); 212 213 ierr = MatCreateSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr); 214 ierr = ISDestroy(&is1);CHKERRQ(ierr); 215 ierr = ISDestroy(&is2);CHKERRQ(ierr); 216 PetscFunctionReturn(0); 217 } 218 219 PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C) 220 { 221 PetscErrorCode ierr; 222 Mat_SeqBAIJ *c = (Mat_SeqBAIJ*)C->data; 223 Mat_SubSppt *submatj = c->submatis1; 224 225 PetscFunctionBegin; 226 ierr = (*submatj->destroy)(C);CHKERRQ(ierr); 227 ierr = MatDestroySubMatrix_Private(submatj);CHKERRQ(ierr); 228 PetscFunctionReturn(0); 229 } 230 231 PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n,Mat *mat[]) 232 { 233 PetscErrorCode ierr; 234 PetscInt i; 235 Mat C; 236 Mat_SeqBAIJ *c; 237 Mat_SubSppt *submatj; 238 239 PetscFunctionBegin; 240 for (i=0; i<n; i++) { 241 C = (*mat)[i]; 242 c = (Mat_SeqBAIJ*)C->data; 243 submatj = c->submatis1; 244 if (submatj) { 245 ierr = (*submatj->destroy)(C);CHKERRQ(ierr); 246 ierr = MatDestroySubMatrix_Private(submatj);CHKERRQ(ierr); 247 ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr); 248 ierr = PetscLayoutDestroy(&C->rmap);CHKERRQ(ierr); 249 ierr = PetscLayoutDestroy(&C->cmap);CHKERRQ(ierr); 250 ierr = PetscHeaderDestroy(&C);CHKERRQ(ierr); 251 } else { 252 ierr = MatDestroy(&C);CHKERRQ(ierr); 253 } 254 } 255 ierr = PetscFree(*mat);CHKERRQ(ierr); 256 PetscFunctionReturn(0); 257 } 258 259 PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 260 { 261 PetscErrorCode ierr; 262 PetscInt i; 263 264 PetscFunctionBegin; 265 if (scall == MAT_INITIAL_MATRIX) { 266 ierr = PetscCalloc1(n+1,B);CHKERRQ(ierr); 267 } 268 269 for (i=0; i<n; i++) { 270 ierr = MatCreateSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr); 271 } 272 PetscFunctionReturn(0); 273 } 274 275 /* -------------------------------------------------------*/ 276 /* Should check that shapes of vectors and matrices match */ 277 /* -------------------------------------------------------*/ 278 279 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz) 280 { 281 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 282 PetscScalar *z,sum; 283 const PetscScalar *x; 284 const MatScalar *v; 285 PetscErrorCode ierr; 286 PetscInt mbs,i,n; 287 const PetscInt *idx,*ii,*ridx=NULL; 288 PetscBool usecprow=a->compressedrow.use; 289 290 PetscFunctionBegin; 291 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 292 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 293 294 if (usecprow) { 295 mbs = a->compressedrow.nrows; 296 ii = a->compressedrow.i; 297 ridx = a->compressedrow.rindex; 298 ierr = PetscArrayzero(z,a->mbs);CHKERRQ(ierr); 299 } else { 300 mbs = a->mbs; 301 ii = a->i; 302 } 303 304 for (i=0; i<mbs; i++) { 305 n = ii[1] - ii[0]; 306 v = a->a + ii[0]; 307 idx = a->j + ii[0]; 308 ii++; 309 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 310 PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 311 sum = 0.0; 312 PetscSparseDensePlusDot(sum,x,v,idx,n); 313 if (usecprow) { 314 z[ridx[i]] = sum; 315 } else { 316 z[i] = sum; 317 } 318 } 319 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 320 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 321 ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr); 322 PetscFunctionReturn(0); 323 } 324 325 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz) 326 { 327 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 328 PetscScalar *z = 0,sum1,sum2,*zarray; 329 const PetscScalar *x,*xb; 330 PetscScalar x1,x2; 331 const MatScalar *v; 332 PetscErrorCode ierr; 333 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 334 PetscBool usecprow=a->compressedrow.use; 335 336 PetscFunctionBegin; 337 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 338 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 339 340 idx = a->j; 341 v = a->a; 342 if (usecprow) { 343 mbs = a->compressedrow.nrows; 344 ii = a->compressedrow.i; 345 ridx = a->compressedrow.rindex; 346 ierr = PetscArrayzero(zarray,2*a->mbs);CHKERRQ(ierr); 347 } else { 348 mbs = a->mbs; 349 ii = a->i; 350 z = zarray; 351 } 352 353 for (i=0; i<mbs; i++) { 354 n = ii[1] - ii[0]; ii++; 355 sum1 = 0.0; sum2 = 0.0; 356 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 357 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 358 for (j=0; j<n; j++) { 359 xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1]; 360 sum1 += v[0]*x1 + v[2]*x2; 361 sum2 += v[1]*x1 + v[3]*x2; 362 v += 4; 363 } 364 if (usecprow) z = zarray + 2*ridx[i]; 365 z[0] = sum1; z[1] = sum2; 366 if (!usecprow) z += 2; 367 } 368 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 369 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 370 ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr); 371 PetscFunctionReturn(0); 372 } 373 374 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz) 375 { 376 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 377 PetscScalar *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray; 378 const PetscScalar *x,*xb; 379 const MatScalar *v; 380 PetscErrorCode ierr; 381 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 382 PetscBool usecprow=a->compressedrow.use; 383 384 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 385 #pragma disjoint(*v,*z,*xb) 386 #endif 387 388 PetscFunctionBegin; 389 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 390 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 391 392 idx = a->j; 393 v = a->a; 394 if (usecprow) { 395 mbs = a->compressedrow.nrows; 396 ii = a->compressedrow.i; 397 ridx = a->compressedrow.rindex; 398 ierr = PetscArrayzero(zarray,3*a->mbs);CHKERRQ(ierr); 399 } else { 400 mbs = a->mbs; 401 ii = a->i; 402 z = zarray; 403 } 404 405 for (i=0; i<mbs; i++) { 406 n = ii[1] - ii[0]; ii++; 407 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 408 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 409 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 410 for (j=0; j<n; j++) { 411 xb = x + 3*(*idx++); 412 x1 = xb[0]; 413 x2 = xb[1]; 414 x3 = xb[2]; 415 416 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 417 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 418 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 419 v += 9; 420 } 421 if (usecprow) z = zarray + 3*ridx[i]; 422 z[0] = sum1; z[1] = sum2; z[2] = sum3; 423 if (!usecprow) z += 3; 424 } 425 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 426 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 427 ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr); 428 PetscFunctionReturn(0); 429 } 430 431 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz) 432 { 433 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 434 PetscScalar *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray; 435 const PetscScalar *x,*xb; 436 const MatScalar *v; 437 PetscErrorCode ierr; 438 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 439 PetscBool usecprow=a->compressedrow.use; 440 441 PetscFunctionBegin; 442 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 443 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 444 445 idx = a->j; 446 v = a->a; 447 if (usecprow) { 448 mbs = a->compressedrow.nrows; 449 ii = a->compressedrow.i; 450 ridx = a->compressedrow.rindex; 451 ierr = PetscArrayzero(zarray,4*a->mbs);CHKERRQ(ierr); 452 } else { 453 mbs = a->mbs; 454 ii = a->i; 455 z = zarray; 456 } 457 458 for (i=0; i<mbs; i++) { 459 n = ii[1] - ii[0]; 460 ii++; 461 sum1 = 0.0; 462 sum2 = 0.0; 463 sum3 = 0.0; 464 sum4 = 0.0; 465 466 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 467 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 468 for (j=0; j<n; j++) { 469 xb = x + 4*(*idx++); 470 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 471 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 472 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 473 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 474 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 475 v += 16; 476 } 477 if (usecprow) z = zarray + 4*ridx[i]; 478 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 479 if (!usecprow) z += 4; 480 } 481 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 482 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 483 ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr); 484 PetscFunctionReturn(0); 485 } 486 487 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz) 488 { 489 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 490 PetscScalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray; 491 const PetscScalar *xb,*x; 492 const MatScalar *v; 493 PetscErrorCode ierr; 494 const PetscInt *idx,*ii,*ridx=NULL; 495 PetscInt mbs,i,j,n; 496 PetscBool usecprow=a->compressedrow.use; 497 498 PetscFunctionBegin; 499 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 500 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 501 502 idx = a->j; 503 v = a->a; 504 if (usecprow) { 505 mbs = a->compressedrow.nrows; 506 ii = a->compressedrow.i; 507 ridx = a->compressedrow.rindex; 508 ierr = PetscArrayzero(zarray,5*a->mbs);CHKERRQ(ierr); 509 } else { 510 mbs = a->mbs; 511 ii = a->i; 512 z = zarray; 513 } 514 515 for (i=0; i<mbs; i++) { 516 n = ii[1] - ii[0]; ii++; 517 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 518 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 519 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 520 for (j=0; j<n; j++) { 521 xb = x + 5*(*idx++); 522 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 523 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 524 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 525 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 526 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 527 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 528 v += 25; 529 } 530 if (usecprow) z = zarray + 5*ridx[i]; 531 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 532 if (!usecprow) z += 5; 533 } 534 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 535 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 536 ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr); 537 PetscFunctionReturn(0); 538 } 539 540 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz) 541 { 542 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 543 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6; 544 const PetscScalar *x,*xb; 545 PetscScalar x1,x2,x3,x4,x5,x6,*zarray; 546 const MatScalar *v; 547 PetscErrorCode ierr; 548 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 549 PetscBool usecprow=a->compressedrow.use; 550 551 PetscFunctionBegin; 552 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 553 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 554 555 idx = a->j; 556 v = a->a; 557 if (usecprow) { 558 mbs = a->compressedrow.nrows; 559 ii = a->compressedrow.i; 560 ridx = a->compressedrow.rindex; 561 ierr = PetscArrayzero(zarray,6*a->mbs);CHKERRQ(ierr); 562 } else { 563 mbs = a->mbs; 564 ii = a->i; 565 z = zarray; 566 } 567 568 for (i=0; i<mbs; i++) { 569 n = ii[1] - ii[0]; 570 ii++; 571 sum1 = 0.0; 572 sum2 = 0.0; 573 sum3 = 0.0; 574 sum4 = 0.0; 575 sum5 = 0.0; 576 sum6 = 0.0; 577 578 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 579 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 580 for (j=0; j<n; j++) { 581 xb = x + 6*(*idx++); 582 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 583 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 584 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 585 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 586 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 587 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 588 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 589 v += 36; 590 } 591 if (usecprow) z = zarray + 6*ridx[i]; 592 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 593 if (!usecprow) z += 6; 594 } 595 596 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 597 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 598 ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr); 599 PetscFunctionReturn(0); 600 } 601 602 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz) 603 { 604 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 605 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 606 const PetscScalar *x,*xb; 607 PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray; 608 const MatScalar *v; 609 PetscErrorCode ierr; 610 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 611 PetscBool usecprow=a->compressedrow.use; 612 613 PetscFunctionBegin; 614 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 615 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 616 617 idx = a->j; 618 v = a->a; 619 if (usecprow) { 620 mbs = a->compressedrow.nrows; 621 ii = a->compressedrow.i; 622 ridx = a->compressedrow.rindex; 623 ierr = PetscArrayzero(zarray,7*a->mbs);CHKERRQ(ierr); 624 } else { 625 mbs = a->mbs; 626 ii = a->i; 627 z = zarray; 628 } 629 630 for (i=0; i<mbs; i++) { 631 n = ii[1] - ii[0]; 632 ii++; 633 sum1 = 0.0; 634 sum2 = 0.0; 635 sum3 = 0.0; 636 sum4 = 0.0; 637 sum5 = 0.0; 638 sum6 = 0.0; 639 sum7 = 0.0; 640 641 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 642 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 643 for (j=0; j<n; j++) { 644 xb = x + 7*(*idx++); 645 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 646 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 647 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 648 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 649 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 650 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 651 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 652 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 653 v += 49; 654 } 655 if (usecprow) z = zarray + 7*ridx[i]; 656 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 657 if (!usecprow) z += 7; 658 } 659 660 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 661 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 662 ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr); 663 PetscFunctionReturn(0); 664 } 665 666 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 667 PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec zz) 668 { 669 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 670 PetscScalar *z = 0,*work,*workt,*zarray; 671 const PetscScalar *x,*xb; 672 const MatScalar *v; 673 PetscErrorCode ierr; 674 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 675 const PetscInt *idx,*ii,*ridx=NULL; 676 PetscInt k; 677 PetscBool usecprow=a->compressedrow.use; 678 679 __m256d a0,a1,a2,a3,a4,a5; 680 __m256d w0,w1,w2,w3; 681 __m256d z0,z1,z2; 682 __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63); 683 684 PetscFunctionBegin; 685 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 686 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 687 688 idx = a->j; 689 v = a->a; 690 if (usecprow) { 691 mbs = a->compressedrow.nrows; 692 ii = a->compressedrow.i; 693 ridx = a->compressedrow.rindex; 694 ierr = PetscArrayzero(zarray,bs*a->mbs);CHKERRQ(ierr); 695 } else { 696 mbs = a->mbs; 697 ii = a->i; 698 z = zarray; 699 } 700 701 if (!a->mult_work) { 702 k = PetscMax(A->rmap->n,A->cmap->n); 703 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 704 } 705 706 work = a->mult_work; 707 for (i=0; i<mbs; i++) { 708 n = ii[1] - ii[0]; ii++; 709 workt = work; 710 for (j=0; j<n; j++) { 711 xb = x + bs*(*idx++); 712 for (k=0; k<bs; k++) workt[k] = xb[k]; 713 workt += bs; 714 } 715 if (usecprow) z = zarray + bs*ridx[i]; 716 717 z0 = _mm256_setzero_pd(); z1 = _mm256_setzero_pd(); z2 = _mm256_setzero_pd(); 718 719 for (j=0; j<n; j++) { 720 /* first column of a */ 721 w0 = _mm256_set1_pd(work[j*9 ]); 722 a0 = _mm256_loadu_pd(&v[j*81 ]); z0 = _mm256_fmadd_pd(a0,w0,z0); 723 a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1); 724 a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2); 725 726 /* second column of a */ 727 w1 = _mm256_set1_pd(work[j*9+ 1]); 728 a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0); 729 a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1); 730 a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2); 731 732 /* third column of a */ 733 w2 = _mm256_set1_pd(work[j*9 +2]); 734 a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0); 735 a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1); 736 a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2); 737 738 /* fourth column of a */ 739 w3 = _mm256_set1_pd(work[j*9+ 3]); 740 a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0); 741 a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1); 742 a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2); 743 744 /* fifth column of a */ 745 w0 = _mm256_set1_pd(work[j*9+ 4]); 746 a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0); 747 a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1); 748 a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2); 749 750 /* sixth column of a */ 751 w1 = _mm256_set1_pd(work[j*9+ 5]); 752 a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0); 753 a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1); 754 a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2); 755 756 /* seventh column of a */ 757 w2 = _mm256_set1_pd(work[j*9+ 6]); 758 a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0); 759 a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1); 760 a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2); 761 762 /* eigth column of a */ 763 w3 = _mm256_set1_pd(work[j*9+ 7]); 764 a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0); 765 a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1); 766 a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2); 767 768 /* ninth column of a */ 769 w0 = _mm256_set1_pd(work[j*9+ 8]); 770 a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0); 771 a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1); 772 a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2); 773 } 774 775 _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2); 776 777 v += n*bs2; 778 if (!usecprow) z += bs; 779 } 780 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 781 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 782 ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr); 783 PetscFunctionReturn(0); 784 } 785 #endif 786 787 PetscErrorCode MatMult_SeqBAIJ_11(Mat A,Vec xx,Vec zz) 788 { 789 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 790 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11; 791 const PetscScalar *x,*xb; 792 PetscScalar *zarray,xv; 793 const MatScalar *v; 794 PetscErrorCode ierr; 795 const PetscInt *ii,*ij=a->j,*idx; 796 PetscInt mbs,i,j,k,n,*ridx=NULL; 797 PetscBool usecprow=a->compressedrow.use; 798 799 PetscFunctionBegin; 800 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 801 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 802 803 v = a->a; 804 if (usecprow) { 805 mbs = a->compressedrow.nrows; 806 ii = a->compressedrow.i; 807 ridx = a->compressedrow.rindex; 808 ierr = PetscArrayzero(zarray,11*a->mbs);CHKERRQ(ierr); 809 } else { 810 mbs = a->mbs; 811 ii = a->i; 812 z = zarray; 813 } 814 815 for (i=0; i<mbs; i++) { 816 n = ii[i+1] - ii[i]; 817 idx = ij + ii[i]; 818 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 819 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; 820 821 for (j=0; j<n; j++) { 822 xb = x + 11*(idx[j]); 823 824 for (k=0; k<11; k++) { 825 xv = xb[k]; 826 sum1 += v[0]*xv; 827 sum2 += v[1]*xv; 828 sum3 += v[2]*xv; 829 sum4 += v[3]*xv; 830 sum5 += v[4]*xv; 831 sum6 += v[5]*xv; 832 sum7 += v[6]*xv; 833 sum8 += v[7]*xv; 834 sum9 += v[8]*xv; 835 sum10 += v[9]*xv; 836 sum11 += v[10]*xv; 837 v += 11; 838 } 839 } 840 if (usecprow) z = zarray + 11*ridx[i]; 841 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 842 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; 843 844 if (!usecprow) z += 11; 845 } 846 847 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 848 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 849 ierr = PetscLogFlops(242.0*a->nz - 11.0*a->nonzerorowcnt);CHKERRQ(ierr); 850 PetscFunctionReturn(0); 851 } 852 853 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */ 854 /* Default MatMult for block size 15 */ 855 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz) 856 { 857 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 858 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 859 const PetscScalar *x,*xb; 860 PetscScalar *zarray,xv; 861 const MatScalar *v; 862 PetscErrorCode ierr; 863 const PetscInt *ii,*ij=a->j,*idx; 864 PetscInt mbs,i,j,k,n,*ridx=NULL; 865 PetscBool usecprow=a->compressedrow.use; 866 867 PetscFunctionBegin; 868 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 869 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 870 871 v = a->a; 872 if (usecprow) { 873 mbs = a->compressedrow.nrows; 874 ii = a->compressedrow.i; 875 ridx = a->compressedrow.rindex; 876 ierr = PetscArrayzero(zarray,15*a->mbs);CHKERRQ(ierr); 877 } else { 878 mbs = a->mbs; 879 ii = a->i; 880 z = zarray; 881 } 882 883 for (i=0; i<mbs; i++) { 884 n = ii[i+1] - ii[i]; 885 idx = ij + ii[i]; 886 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 887 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 888 889 for (j=0; j<n; j++) { 890 xb = x + 15*(idx[j]); 891 892 for (k=0; k<15; k++) { 893 xv = xb[k]; 894 sum1 += v[0]*xv; 895 sum2 += v[1]*xv; 896 sum3 += v[2]*xv; 897 sum4 += v[3]*xv; 898 sum5 += v[4]*xv; 899 sum6 += v[5]*xv; 900 sum7 += v[6]*xv; 901 sum8 += v[7]*xv; 902 sum9 += v[8]*xv; 903 sum10 += v[9]*xv; 904 sum11 += v[10]*xv; 905 sum12 += v[11]*xv; 906 sum13 += v[12]*xv; 907 sum14 += v[13]*xv; 908 sum15 += v[14]*xv; 909 v += 15; 910 } 911 } 912 if (usecprow) z = zarray + 15*ridx[i]; 913 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 914 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 915 916 if (!usecprow) z += 15; 917 } 918 919 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 920 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 921 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 922 PetscFunctionReturn(0); 923 } 924 925 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */ 926 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz) 927 { 928 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 929 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 930 const PetscScalar *x,*xb; 931 PetscScalar x1,x2,x3,x4,*zarray; 932 const MatScalar *v; 933 PetscErrorCode ierr; 934 const PetscInt *ii,*ij=a->j,*idx; 935 PetscInt mbs,i,j,n,*ridx=NULL; 936 PetscBool usecprow=a->compressedrow.use; 937 938 PetscFunctionBegin; 939 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 940 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 941 942 v = a->a; 943 if (usecprow) { 944 mbs = a->compressedrow.nrows; 945 ii = a->compressedrow.i; 946 ridx = a->compressedrow.rindex; 947 ierr = PetscArrayzero(zarray,15*a->mbs);CHKERRQ(ierr); 948 } else { 949 mbs = a->mbs; 950 ii = a->i; 951 z = zarray; 952 } 953 954 for (i=0; i<mbs; i++) { 955 n = ii[i+1] - ii[i]; 956 idx = ij + ii[i]; 957 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 958 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 959 960 for (j=0; j<n; j++) { 961 xb = x + 15*(idx[j]); 962 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 963 964 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 965 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 966 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 967 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 968 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 969 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 970 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 971 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 972 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 973 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 974 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 975 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 976 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 977 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 978 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 979 980 v += 60; 981 982 x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7]; 983 984 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 985 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 986 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 987 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 988 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 989 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 990 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 991 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 992 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 993 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 994 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 995 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 996 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 997 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 998 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 999 v += 60; 1000 1001 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; 1002 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 1003 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 1004 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 1005 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 1006 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 1007 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 1008 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 1009 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 1010 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 1011 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 1012 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 1013 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 1014 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 1015 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 1016 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 1017 v += 60; 1018 1019 x1 = xb[12]; x2 = xb[13]; x3 = xb[14]; 1020 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3; 1021 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3; 1022 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3; 1023 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3; 1024 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3; 1025 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3; 1026 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3; 1027 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3; 1028 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3; 1029 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3; 1030 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3; 1031 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3; 1032 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3; 1033 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3; 1034 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3; 1035 v += 45; 1036 } 1037 if (usecprow) z = zarray + 15*ridx[i]; 1038 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1039 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1040 1041 if (!usecprow) z += 15; 1042 } 1043 1044 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1045 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1046 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 1047 PetscFunctionReturn(0); 1048 } 1049 1050 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */ 1051 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz) 1052 { 1053 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1054 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1055 const PetscScalar *x,*xb; 1056 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,*zarray; 1057 const MatScalar *v; 1058 PetscErrorCode ierr; 1059 const PetscInt *ii,*ij=a->j,*idx; 1060 PetscInt mbs,i,j,n,*ridx=NULL; 1061 PetscBool usecprow=a->compressedrow.use; 1062 1063 PetscFunctionBegin; 1064 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1065 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1066 1067 v = a->a; 1068 if (usecprow) { 1069 mbs = a->compressedrow.nrows; 1070 ii = a->compressedrow.i; 1071 ridx = a->compressedrow.rindex; 1072 ierr = PetscArrayzero(zarray,15*a->mbs);CHKERRQ(ierr); 1073 } else { 1074 mbs = a->mbs; 1075 ii = a->i; 1076 z = zarray; 1077 } 1078 1079 for (i=0; i<mbs; i++) { 1080 n = ii[i+1] - ii[i]; 1081 idx = ij + ii[i]; 1082 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1083 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1084 1085 for (j=0; j<n; j++) { 1086 xb = x + 15*(idx[j]); 1087 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1088 x8 = xb[7]; 1089 1090 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8; 1091 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8; 1092 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8; 1093 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8; 1094 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8; 1095 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8; 1096 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8; 1097 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8; 1098 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8; 1099 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8; 1100 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8; 1101 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8; 1102 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8; 1103 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8; 1104 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8; 1105 v += 120; 1106 1107 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14]; 1108 1109 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7; 1110 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7; 1111 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7; 1112 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7; 1113 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7; 1114 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7; 1115 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7; 1116 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7; 1117 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7; 1118 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7; 1119 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7; 1120 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7; 1121 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7; 1122 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7; 1123 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7; 1124 v += 105; 1125 } 1126 if (usecprow) z = zarray + 15*ridx[i]; 1127 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1128 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1129 1130 if (!usecprow) z += 15; 1131 } 1132 1133 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1134 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1135 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 1136 PetscFunctionReturn(0); 1137 } 1138 1139 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */ 1140 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz) 1141 { 1142 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1143 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1144 const PetscScalar *x,*xb; 1145 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray; 1146 const MatScalar *v; 1147 PetscErrorCode ierr; 1148 const PetscInt *ii,*ij=a->j,*idx; 1149 PetscInt mbs,i,j,n,*ridx=NULL; 1150 PetscBool usecprow=a->compressedrow.use; 1151 1152 PetscFunctionBegin; 1153 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1154 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1155 1156 v = a->a; 1157 if (usecprow) { 1158 mbs = a->compressedrow.nrows; 1159 ii = a->compressedrow.i; 1160 ridx = a->compressedrow.rindex; 1161 ierr = PetscArrayzero(zarray,15*a->mbs);CHKERRQ(ierr); 1162 } else { 1163 mbs = a->mbs; 1164 ii = a->i; 1165 z = zarray; 1166 } 1167 1168 for (i=0; i<mbs; i++) { 1169 n = ii[i+1] - ii[i]; 1170 idx = ij + ii[i]; 1171 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1172 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1173 1174 for (j=0; j<n; j++) { 1175 xb = x + 15*(idx[j]); 1176 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1177 x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14]; 1178 1179 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 1180 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 1181 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 1182 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 1183 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 1184 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 1185 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 1186 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 1187 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 1188 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 1189 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 1190 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 1191 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 1192 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 1193 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 1194 v += 225; 1195 } 1196 if (usecprow) z = zarray + 15*ridx[i]; 1197 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1198 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1199 1200 if (!usecprow) z += 15; 1201 } 1202 1203 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1204 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1205 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 1206 PetscFunctionReturn(0); 1207 } 1208 1209 /* 1210 This will not work with MatScalar == float because it calls the BLAS 1211 */ 1212 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz) 1213 { 1214 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1215 PetscScalar *z = 0,*work,*workt,*zarray; 1216 const PetscScalar *x,*xb; 1217 const MatScalar *v; 1218 PetscErrorCode ierr; 1219 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 1220 const PetscInt *idx,*ii,*ridx=NULL; 1221 PetscInt ncols,k; 1222 PetscBool usecprow=a->compressedrow.use; 1223 1224 PetscFunctionBegin; 1225 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1226 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1227 1228 idx = a->j; 1229 v = a->a; 1230 if (usecprow) { 1231 mbs = a->compressedrow.nrows; 1232 ii = a->compressedrow.i; 1233 ridx = a->compressedrow.rindex; 1234 ierr = PetscArrayzero(zarray,bs*a->mbs);CHKERRQ(ierr); 1235 } else { 1236 mbs = a->mbs; 1237 ii = a->i; 1238 z = zarray; 1239 } 1240 1241 if (!a->mult_work) { 1242 k = PetscMax(A->rmap->n,A->cmap->n); 1243 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 1244 } 1245 work = a->mult_work; 1246 for (i=0; i<mbs; i++) { 1247 n = ii[1] - ii[0]; ii++; 1248 ncols = n*bs; 1249 workt = work; 1250 for (j=0; j<n; j++) { 1251 xb = x + bs*(*idx++); 1252 for (k=0; k<bs; k++) workt[k] = xb[k]; 1253 workt += bs; 1254 } 1255 if (usecprow) z = zarray + bs*ridx[i]; 1256 PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z); 1257 /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */ 1258 v += n*bs2; 1259 if (!usecprow) z += bs; 1260 } 1261 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1262 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1263 ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr); 1264 PetscFunctionReturn(0); 1265 } 1266 1267 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 1268 { 1269 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1270 const PetscScalar *x; 1271 PetscScalar *y,*z,sum; 1272 const MatScalar *v; 1273 PetscErrorCode ierr; 1274 PetscInt mbs=a->mbs,i,n,*ridx=NULL; 1275 const PetscInt *idx,*ii; 1276 PetscBool usecprow=a->compressedrow.use; 1277 1278 PetscFunctionBegin; 1279 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1280 ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr); 1281 1282 idx = a->j; 1283 v = a->a; 1284 if (usecprow) { 1285 if (zz != yy) { 1286 ierr = PetscArraycpy(z,y,mbs);CHKERRQ(ierr); 1287 } 1288 mbs = a->compressedrow.nrows; 1289 ii = a->compressedrow.i; 1290 ridx = a->compressedrow.rindex; 1291 } else { 1292 ii = a->i; 1293 } 1294 1295 for (i=0; i<mbs; i++) { 1296 n = ii[1] - ii[0]; 1297 ii++; 1298 if (!usecprow) { 1299 sum = y[i]; 1300 } else { 1301 sum = y[ridx[i]]; 1302 } 1303 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1304 PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1305 PetscSparseDensePlusDot(sum,x,v,idx,n); 1306 v += n; 1307 idx += n; 1308 if (usecprow) { 1309 z[ridx[i]] = sum; 1310 } else { 1311 z[i] = sum; 1312 } 1313 } 1314 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1315 ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr); 1316 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1317 PetscFunctionReturn(0); 1318 } 1319 1320 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 1321 { 1322 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1323 PetscScalar *y = 0,*z = 0,sum1,sum2; 1324 const PetscScalar *x,*xb; 1325 PetscScalar x1,x2,*yarray,*zarray; 1326 const MatScalar *v; 1327 PetscErrorCode ierr; 1328 PetscInt mbs = a->mbs,i,n,j; 1329 const PetscInt *idx,*ii,*ridx = NULL; 1330 PetscBool usecprow = a->compressedrow.use; 1331 1332 PetscFunctionBegin; 1333 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1334 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1335 1336 idx = a->j; 1337 v = a->a; 1338 if (usecprow) { 1339 if (zz != yy) { 1340 ierr = PetscArraycpy(zarray,yarray,2*mbs);CHKERRQ(ierr); 1341 } 1342 mbs = a->compressedrow.nrows; 1343 ii = a->compressedrow.i; 1344 ridx = a->compressedrow.rindex; 1345 } else { 1346 ii = a->i; 1347 y = yarray; 1348 z = zarray; 1349 } 1350 1351 for (i=0; i<mbs; i++) { 1352 n = ii[1] - ii[0]; ii++; 1353 if (usecprow) { 1354 z = zarray + 2*ridx[i]; 1355 y = yarray + 2*ridx[i]; 1356 } 1357 sum1 = y[0]; sum2 = y[1]; 1358 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1359 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1360 for (j=0; j<n; j++) { 1361 xb = x + 2*(*idx++); 1362 x1 = xb[0]; 1363 x2 = xb[1]; 1364 1365 sum1 += v[0]*x1 + v[2]*x2; 1366 sum2 += v[1]*x1 + v[3]*x2; 1367 v += 4; 1368 } 1369 z[0] = sum1; z[1] = sum2; 1370 if (!usecprow) { 1371 z += 2; y += 2; 1372 } 1373 } 1374 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1375 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1376 ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr); 1377 PetscFunctionReturn(0); 1378 } 1379 1380 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 1381 { 1382 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1383 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray; 1384 const PetscScalar *x,*xb; 1385 const MatScalar *v; 1386 PetscErrorCode ierr; 1387 PetscInt mbs = a->mbs,i,j,n; 1388 const PetscInt *idx,*ii,*ridx = NULL; 1389 PetscBool usecprow = a->compressedrow.use; 1390 1391 PetscFunctionBegin; 1392 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1393 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1394 1395 idx = a->j; 1396 v = a->a; 1397 if (usecprow) { 1398 if (zz != yy) { 1399 ierr = PetscArraycpy(zarray,yarray,3*mbs);CHKERRQ(ierr); 1400 } 1401 mbs = a->compressedrow.nrows; 1402 ii = a->compressedrow.i; 1403 ridx = a->compressedrow.rindex; 1404 } else { 1405 ii = a->i; 1406 y = yarray; 1407 z = zarray; 1408 } 1409 1410 for (i=0; i<mbs; i++) { 1411 n = ii[1] - ii[0]; ii++; 1412 if (usecprow) { 1413 z = zarray + 3*ridx[i]; 1414 y = yarray + 3*ridx[i]; 1415 } 1416 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; 1417 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1418 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1419 for (j=0; j<n; j++) { 1420 xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1421 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 1422 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 1423 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 1424 v += 9; 1425 } 1426 z[0] = sum1; z[1] = sum2; z[2] = sum3; 1427 if (!usecprow) { 1428 z += 3; y += 3; 1429 } 1430 } 1431 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1432 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1433 ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr); 1434 PetscFunctionReturn(0); 1435 } 1436 1437 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 1438 { 1439 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1440 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray; 1441 const PetscScalar *x,*xb; 1442 const MatScalar *v; 1443 PetscErrorCode ierr; 1444 PetscInt mbs = a->mbs,i,j,n; 1445 const PetscInt *idx,*ii,*ridx=NULL; 1446 PetscBool usecprow=a->compressedrow.use; 1447 1448 PetscFunctionBegin; 1449 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1450 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1451 1452 idx = a->j; 1453 v = a->a; 1454 if (usecprow) { 1455 if (zz != yy) { 1456 ierr = PetscArraycpy(zarray,yarray,4*mbs);CHKERRQ(ierr); 1457 } 1458 mbs = a->compressedrow.nrows; 1459 ii = a->compressedrow.i; 1460 ridx = a->compressedrow.rindex; 1461 } else { 1462 ii = a->i; 1463 y = yarray; 1464 z = zarray; 1465 } 1466 1467 for (i=0; i<mbs; i++) { 1468 n = ii[1] - ii[0]; ii++; 1469 if (usecprow) { 1470 z = zarray + 4*ridx[i]; 1471 y = yarray + 4*ridx[i]; 1472 } 1473 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; 1474 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1475 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1476 for (j=0; j<n; j++) { 1477 xb = x + 4*(*idx++); 1478 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1479 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1480 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1481 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1482 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1483 v += 16; 1484 } 1485 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 1486 if (!usecprow) { 1487 z += 4; y += 4; 1488 } 1489 } 1490 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1491 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1492 ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr); 1493 PetscFunctionReturn(0); 1494 } 1495 1496 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 1497 { 1498 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1499 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5; 1500 const PetscScalar *x,*xb; 1501 PetscScalar *yarray,*zarray; 1502 const MatScalar *v; 1503 PetscErrorCode ierr; 1504 PetscInt mbs = a->mbs,i,j,n; 1505 const PetscInt *idx,*ii,*ridx = NULL; 1506 PetscBool usecprow=a->compressedrow.use; 1507 1508 PetscFunctionBegin; 1509 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1510 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1511 1512 idx = a->j; 1513 v = a->a; 1514 if (usecprow) { 1515 if (zz != yy) { 1516 ierr = PetscArraycpy(zarray,yarray,5*mbs);CHKERRQ(ierr); 1517 } 1518 mbs = a->compressedrow.nrows; 1519 ii = a->compressedrow.i; 1520 ridx = a->compressedrow.rindex; 1521 } else { 1522 ii = a->i; 1523 y = yarray; 1524 z = zarray; 1525 } 1526 1527 for (i=0; i<mbs; i++) { 1528 n = ii[1] - ii[0]; ii++; 1529 if (usecprow) { 1530 z = zarray + 5*ridx[i]; 1531 y = yarray + 5*ridx[i]; 1532 } 1533 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; 1534 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1535 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1536 for (j=0; j<n; j++) { 1537 xb = x + 5*(*idx++); 1538 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 1539 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1540 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1541 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1542 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1543 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1544 v += 25; 1545 } 1546 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 1547 if (!usecprow) { 1548 z += 5; y += 5; 1549 } 1550 } 1551 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1552 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1553 ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr); 1554 PetscFunctionReturn(0); 1555 } 1556 1557 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 1558 { 1559 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1560 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6; 1561 const PetscScalar *x,*xb; 1562 PetscScalar x1,x2,x3,x4,x5,x6,*yarray,*zarray; 1563 const MatScalar *v; 1564 PetscErrorCode ierr; 1565 PetscInt mbs = a->mbs,i,j,n; 1566 const PetscInt *idx,*ii,*ridx=NULL; 1567 PetscBool usecprow=a->compressedrow.use; 1568 1569 PetscFunctionBegin; 1570 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1571 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1572 1573 idx = a->j; 1574 v = a->a; 1575 if (usecprow) { 1576 if (zz != yy) { 1577 ierr = PetscArraycpy(zarray,yarray,6*mbs);CHKERRQ(ierr); 1578 } 1579 mbs = a->compressedrow.nrows; 1580 ii = a->compressedrow.i; 1581 ridx = a->compressedrow.rindex; 1582 } else { 1583 ii = a->i; 1584 y = yarray; 1585 z = zarray; 1586 } 1587 1588 for (i=0; i<mbs; i++) { 1589 n = ii[1] - ii[0]; ii++; 1590 if (usecprow) { 1591 z = zarray + 6*ridx[i]; 1592 y = yarray + 6*ridx[i]; 1593 } 1594 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; 1595 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1596 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1597 for (j=0; j<n; j++) { 1598 xb = x + 6*(*idx++); 1599 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 1600 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1601 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1602 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1603 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1604 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1605 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1606 v += 36; 1607 } 1608 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 1609 if (!usecprow) { 1610 z += 6; y += 6; 1611 } 1612 } 1613 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1614 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1615 ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr); 1616 PetscFunctionReturn(0); 1617 } 1618 1619 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 1620 { 1621 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1622 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 1623 const PetscScalar *x,*xb; 1624 PetscScalar x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray; 1625 const MatScalar *v; 1626 PetscErrorCode ierr; 1627 PetscInt mbs = a->mbs,i,j,n; 1628 const PetscInt *idx,*ii,*ridx = NULL; 1629 PetscBool usecprow=a->compressedrow.use; 1630 1631 PetscFunctionBegin; 1632 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1633 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1634 1635 idx = a->j; 1636 v = a->a; 1637 if (usecprow) { 1638 if (zz != yy) { 1639 ierr = PetscArraycpy(zarray,yarray,7*mbs);CHKERRQ(ierr); 1640 } 1641 mbs = a->compressedrow.nrows; 1642 ii = a->compressedrow.i; 1643 ridx = a->compressedrow.rindex; 1644 } else { 1645 ii = a->i; 1646 y = yarray; 1647 z = zarray; 1648 } 1649 1650 for (i=0; i<mbs; i++) { 1651 n = ii[1] - ii[0]; ii++; 1652 if (usecprow) { 1653 z = zarray + 7*ridx[i]; 1654 y = yarray + 7*ridx[i]; 1655 } 1656 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 1657 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1658 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1659 for (j=0; j<n; j++) { 1660 xb = x + 7*(*idx++); 1661 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1662 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1663 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1664 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1665 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1666 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1667 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1668 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1669 v += 49; 1670 } 1671 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1672 if (!usecprow) { 1673 z += 7; y += 7; 1674 } 1675 } 1676 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1677 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1678 ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr); 1679 PetscFunctionReturn(0); 1680 } 1681 1682 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 1683 PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec yy,Vec zz) 1684 { 1685 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1686 PetscScalar *z = 0,*work,*workt,*zarray; 1687 const PetscScalar *x,*xb; 1688 const MatScalar *v; 1689 PetscErrorCode ierr; 1690 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 1691 PetscInt k; 1692 PetscBool usecprow=a->compressedrow.use; 1693 const PetscInt *idx,*ii,*ridx=NULL; 1694 1695 __m256d a0,a1,a2,a3,a4,a5; 1696 __m256d w0,w1,w2,w3; 1697 __m256d z0,z1,z2; 1698 __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63); 1699 1700 PetscFunctionBegin; 1701 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 1702 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1703 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1704 1705 idx = a->j; 1706 v = a->a; 1707 if (usecprow) { 1708 mbs = a->compressedrow.nrows; 1709 ii = a->compressedrow.i; 1710 ridx = a->compressedrow.rindex; 1711 } else { 1712 mbs = a->mbs; 1713 ii = a->i; 1714 z = zarray; 1715 } 1716 1717 if (!a->mult_work) { 1718 k = PetscMax(A->rmap->n,A->cmap->n); 1719 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 1720 } 1721 1722 work = a->mult_work; 1723 for (i=0; i<mbs; i++) { 1724 n = ii[1] - ii[0]; ii++; 1725 workt = work; 1726 for (j=0; j<n; j++) { 1727 xb = x + bs*(*idx++); 1728 for (k=0; k<bs; k++) workt[k] = xb[k]; 1729 workt += bs; 1730 } 1731 if (usecprow) z = zarray + bs*ridx[i]; 1732 1733 z0 = _mm256_loadu_pd(&z[ 0]); z1 = _mm256_loadu_pd(&z[ 4]); z2 = _mm256_set1_pd(z[ 8]); 1734 1735 for (j=0; j<n; j++) { 1736 /* first column of a */ 1737 w0 = _mm256_set1_pd(work[j*9 ]); 1738 a0 = _mm256_loadu_pd(&v[j*81 ]); z0 = _mm256_fmadd_pd(a0,w0,z0); 1739 a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1); 1740 a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2); 1741 1742 /* second column of a */ 1743 w1 = _mm256_set1_pd(work[j*9+ 1]); 1744 a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0); 1745 a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1); 1746 a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2); 1747 1748 /* third column of a */ 1749 w2 = _mm256_set1_pd(work[j*9+ 2]); 1750 a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0); 1751 a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1); 1752 a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2); 1753 1754 /* fourth column of a */ 1755 w3 = _mm256_set1_pd(work[j*9+ 3]); 1756 a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0); 1757 a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1); 1758 a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2); 1759 1760 /* fifth column of a */ 1761 w0 = _mm256_set1_pd(work[j*9+ 4]); 1762 a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0); 1763 a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1); 1764 a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2); 1765 1766 /* sixth column of a */ 1767 w1 = _mm256_set1_pd(work[j*9+ 5]); 1768 a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0); 1769 a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1); 1770 a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2); 1771 1772 /* seventh column of a */ 1773 w2 = _mm256_set1_pd(work[j*9+ 6]); 1774 a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0); 1775 a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1); 1776 a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2); 1777 1778 /* eigth column of a */ 1779 w3 = _mm256_set1_pd(work[j*9+ 7]); 1780 a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0); 1781 a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1); 1782 a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2); 1783 1784 /* ninth column of a */ 1785 w0 = _mm256_set1_pd(work[j*9+ 8]); 1786 a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0); 1787 a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1); 1788 a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2); 1789 } 1790 1791 _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2); 1792 1793 v += n*bs2; 1794 if (!usecprow) z += bs; 1795 } 1796 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1797 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1798 ierr = PetscLogFlops(162.0*a->nz);CHKERRQ(ierr); 1799 PetscFunctionReturn(0); 1800 } 1801 #endif 1802 1803 PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A,Vec xx,Vec yy,Vec zz) 1804 { 1805 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1806 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11; 1807 const PetscScalar *x,*xb; 1808 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,*yarray,*zarray; 1809 const MatScalar *v; 1810 PetscErrorCode ierr; 1811 PetscInt mbs = a->mbs,i,j,n; 1812 const PetscInt *idx,*ii,*ridx = NULL; 1813 PetscBool usecprow=a->compressedrow.use; 1814 1815 PetscFunctionBegin; 1816 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1817 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1818 1819 idx = a->j; 1820 v = a->a; 1821 if (usecprow) { 1822 if (zz != yy) { 1823 ierr = PetscArraycpy(zarray,yarray,7*mbs);CHKERRQ(ierr); 1824 } 1825 mbs = a->compressedrow.nrows; 1826 ii = a->compressedrow.i; 1827 ridx = a->compressedrow.rindex; 1828 } else { 1829 ii = a->i; 1830 y = yarray; 1831 z = zarray; 1832 } 1833 1834 for (i=0; i<mbs; i++) { 1835 n = ii[1] - ii[0]; ii++; 1836 if (usecprow) { 1837 z = zarray + 11*ridx[i]; 1838 y = yarray + 11*ridx[i]; 1839 } 1840 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 1841 sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10]; 1842 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1843 PetscPrefetchBlock(v+121*n,121*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1844 for (j=0; j<n; j++) { 1845 xb = x + 11*(*idx++); 1846 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; 1847 sum1 += v[ 0]*x1 + v[ 11]*x2 + v[2*11]*x3 + v[3*11]*x4 + v[4*11]*x5 + v[5*11]*x6 + v[6*11]*x7+ v[7*11]*x8 + v[8*11]*x9 + v[9*11]*x10 + v[10*11]*x11; 1848 sum2 += v[1+0]*x1 + v[1+11]*x2 + v[1+2*11]*x3 + v[1+3*11]*x4 + v[1+4*11]*x5 + v[1+5*11]*x6 + v[1+6*11]*x7+ v[1+7*11]*x8 + v[1+8*11]*x9 + v[1+9*11]*x10 + v[1+10*11]*x11; 1849 sum3 += v[2+0]*x1 + v[2+11]*x2 + v[2+2*11]*x3 + v[2+3*11]*x4 + v[2+4*11]*x5 + v[2+5*11]*x6 + v[2+6*11]*x7+ v[2+7*11]*x8 + v[2+8*11]*x9 + v[2+9*11]*x10 + v[2+10*11]*x11; 1850 sum4 += v[3+0]*x1 + v[3+11]*x2 + v[3+2*11]*x3 + v[3+3*11]*x4 + v[3+4*11]*x5 + v[3+5*11]*x6 + v[3+6*11]*x7+ v[3+7*11]*x8 + v[3+8*11]*x9 + v[3+9*11]*x10 + v[3+10*11]*x11; 1851 sum5 += v[4+0]*x1 + v[4+11]*x2 + v[4+2*11]*x3 + v[4+3*11]*x4 + v[4+4*11]*x5 + v[4+5*11]*x6 + v[4+6*11]*x7+ v[4+7*11]*x8 + v[4+8*11]*x9 + v[4+9*11]*x10 + v[4+10*11]*x11; 1852 sum6 += v[5+0]*x1 + v[5+11]*x2 + v[5+2*11]*x3 + v[5+3*11]*x4 + v[5+4*11]*x5 + v[5+5*11]*x6 + v[5+6*11]*x7+ v[5+7*11]*x8 + v[5+8*11]*x9 + v[5+9*11]*x10 + v[5+10*11]*x11; 1853 sum7 += v[6+0]*x1 + v[6+11]*x2 + v[6+2*11]*x3 + v[6+3*11]*x4 + v[6+4*11]*x5 + v[6+5*11]*x6 + v[6+6*11]*x7+ v[6+7*11]*x8 + v[6+8*11]*x9 + v[6+9*11]*x10 + v[6+10*11]*x11; 1854 sum8 += v[7+0]*x1 + v[7+11]*x2 + v[7+2*11]*x3 + v[7+3*11]*x4 + v[7+4*11]*x5 + v[7+5*11]*x6 + v[7+6*11]*x7+ v[7+7*11]*x8 + v[7+8*11]*x9 + v[7+9*11]*x10 + v[7+10*11]*x11; 1855 sum9 += v[8+0]*x1 + v[8+11]*x2 + v[8+2*11]*x3 + v[8+3*11]*x4 + v[8+4*11]*x5 + v[8+5*11]*x6 + v[8+6*11]*x7+ v[8+7*11]*x8 + v[8+8*11]*x9 + v[8+9*11]*x10 + v[8+10*11]*x11; 1856 sum10 += v[9+0]*x1 + v[9+11]*x2 + v[9+2*11]*x3 + v[9+3*11]*x4 + v[9+4*11]*x5 + v[9+5*11]*x6 + v[9+6*11]*x7+ v[9+7*11]*x8 + v[9+8*11]*x9 + v[9+9*11]*x10 + v[9+10*11]*x11; 1857 sum11 += v[10+0]*x1 + v[10+11]*x2 + v[10+2*11]*x3 + v[10+3*11]*x4 + v[10+4*11]*x5 + v[10+5*11]*x6 + v[10+6*11]*x7+ v[10+7*11]*x8 + v[10+8*11]*x9 + v[10+9*11]*x10 + v[10+10*11]*x11; 1858 v += 121; 1859 } 1860 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1861 z[7] = sum6; z[8] = sum7; z[9] = sum8; z[10] = sum9; z[11] = sum10; 1862 if (!usecprow) { 1863 z += 11; y += 11; 1864 } 1865 } 1866 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1867 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1868 ierr = PetscLogFlops(242.0*a->nz);CHKERRQ(ierr); 1869 PetscFunctionReturn(0); 1870 } 1871 1872 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1873 { 1874 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1875 PetscScalar *z = 0,*work,*workt,*zarray; 1876 const PetscScalar *x,*xb; 1877 const MatScalar *v; 1878 PetscErrorCode ierr; 1879 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 1880 PetscInt ncols,k; 1881 const PetscInt *ridx = NULL,*idx,*ii; 1882 PetscBool usecprow = a->compressedrow.use; 1883 1884 PetscFunctionBegin; 1885 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 1886 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1887 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1888 1889 idx = a->j; 1890 v = a->a; 1891 if (usecprow) { 1892 mbs = a->compressedrow.nrows; 1893 ii = a->compressedrow.i; 1894 ridx = a->compressedrow.rindex; 1895 } else { 1896 mbs = a->mbs; 1897 ii = a->i; 1898 z = zarray; 1899 } 1900 1901 if (!a->mult_work) { 1902 k = PetscMax(A->rmap->n,A->cmap->n); 1903 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 1904 } 1905 work = a->mult_work; 1906 for (i=0; i<mbs; i++) { 1907 n = ii[1] - ii[0]; ii++; 1908 ncols = n*bs; 1909 workt = work; 1910 for (j=0; j<n; j++) { 1911 xb = x + bs*(*idx++); 1912 for (k=0; k<bs; k++) workt[k] = xb[k]; 1913 workt += bs; 1914 } 1915 if (usecprow) z = zarray + bs*ridx[i]; 1916 PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1917 /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */ 1918 v += n*bs2; 1919 if (!usecprow) z += bs; 1920 } 1921 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1922 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1923 ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr); 1924 PetscFunctionReturn(0); 1925 } 1926 1927 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 1928 { 1929 PetscScalar zero = 0.0; 1930 PetscErrorCode ierr; 1931 1932 PetscFunctionBegin; 1933 ierr = VecSet(zz,zero);CHKERRQ(ierr); 1934 ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 1935 PetscFunctionReturn(0); 1936 } 1937 1938 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 1939 { 1940 PetscScalar zero = 0.0; 1941 PetscErrorCode ierr; 1942 1943 PetscFunctionBegin; 1944 ierr = VecSet(zz,zero);CHKERRQ(ierr); 1945 ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 1946 PetscFunctionReturn(0); 1947 } 1948 1949 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 1950 { 1951 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1952 PetscScalar *z,x1,x2,x3,x4,x5; 1953 const PetscScalar *x,*xb = NULL; 1954 const MatScalar *v; 1955 PetscErrorCode ierr; 1956 PetscInt mbs,i,rval,bs=A->rmap->bs,j,n; 1957 const PetscInt *idx,*ii,*ib,*ridx = NULL; 1958 Mat_CompressedRow cprow = a->compressedrow; 1959 PetscBool usecprow = cprow.use; 1960 1961 PetscFunctionBegin; 1962 if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); } 1963 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1964 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1965 1966 idx = a->j; 1967 v = a->a; 1968 if (usecprow) { 1969 mbs = cprow.nrows; 1970 ii = cprow.i; 1971 ridx = cprow.rindex; 1972 } else { 1973 mbs=a->mbs; 1974 ii = a->i; 1975 xb = x; 1976 } 1977 1978 switch (bs) { 1979 case 1: 1980 for (i=0; i<mbs; i++) { 1981 if (usecprow) xb = x + ridx[i]; 1982 x1 = xb[0]; 1983 ib = idx + ii[0]; 1984 n = ii[1] - ii[0]; ii++; 1985 for (j=0; j<n; j++) { 1986 rval = ib[j]; 1987 z[rval] += PetscConj(*v) * x1; 1988 v++; 1989 } 1990 if (!usecprow) xb++; 1991 } 1992 break; 1993 case 2: 1994 for (i=0; i<mbs; i++) { 1995 if (usecprow) xb = x + 2*ridx[i]; 1996 x1 = xb[0]; x2 = xb[1]; 1997 ib = idx + ii[0]; 1998 n = ii[1] - ii[0]; ii++; 1999 for (j=0; j<n; j++) { 2000 rval = ib[j]*2; 2001 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2; 2002 z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2; 2003 v += 4; 2004 } 2005 if (!usecprow) xb += 2; 2006 } 2007 break; 2008 case 3: 2009 for (i=0; i<mbs; i++) { 2010 if (usecprow) xb = x + 3*ridx[i]; 2011 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2012 ib = idx + ii[0]; 2013 n = ii[1] - ii[0]; ii++; 2014 for (j=0; j<n; j++) { 2015 rval = ib[j]*3; 2016 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3; 2017 z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3; 2018 z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3; 2019 v += 9; 2020 } 2021 if (!usecprow) xb += 3; 2022 } 2023 break; 2024 case 4: 2025 for (i=0; i<mbs; i++) { 2026 if (usecprow) xb = x + 4*ridx[i]; 2027 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 2028 ib = idx + ii[0]; 2029 n = ii[1] - ii[0]; ii++; 2030 for (j=0; j<n; j++) { 2031 rval = ib[j]*4; 2032 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4; 2033 z[rval++] += PetscConj(v[4])*x1 + PetscConj(v[5])*x2 + PetscConj(v[6])*x3 + PetscConj(v[7])*x4; 2034 z[rval++] += PetscConj(v[8])*x1 + PetscConj(v[9])*x2 + PetscConj(v[10])*x3 + PetscConj(v[11])*x4; 2035 z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4; 2036 v += 16; 2037 } 2038 if (!usecprow) xb += 4; 2039 } 2040 break; 2041 case 5: 2042 for (i=0; i<mbs; i++) { 2043 if (usecprow) xb = x + 5*ridx[i]; 2044 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2045 x4 = xb[3]; x5 = xb[4]; 2046 ib = idx + ii[0]; 2047 n = ii[1] - ii[0]; ii++; 2048 for (j=0; j<n; j++) { 2049 rval = ib[j]*5; 2050 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4 + PetscConj(v[4])*x5; 2051 z[rval++] += PetscConj(v[5])*x1 + PetscConj(v[6])*x2 + PetscConj(v[7])*x3 + PetscConj(v[8])*x4 + PetscConj(v[9])*x5; 2052 z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5; 2053 z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5; 2054 z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5; 2055 v += 25; 2056 } 2057 if (!usecprow) xb += 5; 2058 } 2059 break; 2060 default: /* block sizes larger than 5 by 5 are handled by BLAS */ 2061 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet"); 2062 #if 0 2063 { 2064 PetscInt ncols,k,bs2=a->bs2; 2065 PetscScalar *work,*workt,zb; 2066 const PetscScalar *xtmp; 2067 if (!a->mult_work) { 2068 k = PetscMax(A->rmap->n,A->cmap->n); 2069 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 2070 } 2071 work = a->mult_work; 2072 xtmp = x; 2073 for (i=0; i<mbs; i++) { 2074 n = ii[1] - ii[0]; ii++; 2075 ncols = n*bs; 2076 ierr = PetscArrayzero(work,ncols);CHKERRQ(ierr); 2077 if (usecprow) xtmp = x + bs*ridx[i]; 2078 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 2079 /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */ 2080 v += n*bs2; 2081 if (!usecprow) xtmp += bs; 2082 workt = work; 2083 for (j=0; j<n; j++) { 2084 zb = z + bs*(*idx++); 2085 for (k=0; k<bs; k++) zb[k] += workt[k] ; 2086 workt += bs; 2087 } 2088 } 2089 } 2090 #endif 2091 } 2092 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 2093 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 2094 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 2095 PetscFunctionReturn(0); 2096 } 2097 2098 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 2099 { 2100 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2101 PetscScalar *zb,*z,x1,x2,x3,x4,x5; 2102 const PetscScalar *x,*xb = 0; 2103 const MatScalar *v; 2104 PetscErrorCode ierr; 2105 PetscInt mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2; 2106 const PetscInt *idx,*ii,*ib,*ridx = NULL; 2107 Mat_CompressedRow cprow = a->compressedrow; 2108 PetscBool usecprow=cprow.use; 2109 2110 PetscFunctionBegin; 2111 if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); } 2112 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 2113 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 2114 2115 idx = a->j; 2116 v = a->a; 2117 if (usecprow) { 2118 mbs = cprow.nrows; 2119 ii = cprow.i; 2120 ridx = cprow.rindex; 2121 } else { 2122 mbs=a->mbs; 2123 ii = a->i; 2124 xb = x; 2125 } 2126 2127 switch (bs) { 2128 case 1: 2129 for (i=0; i<mbs; i++) { 2130 if (usecprow) xb = x + ridx[i]; 2131 x1 = xb[0]; 2132 ib = idx + ii[0]; 2133 n = ii[1] - ii[0]; ii++; 2134 for (j=0; j<n; j++) { 2135 rval = ib[j]; 2136 z[rval] += *v * x1; 2137 v++; 2138 } 2139 if (!usecprow) xb++; 2140 } 2141 break; 2142 case 2: 2143 for (i=0; i<mbs; i++) { 2144 if (usecprow) xb = x + 2*ridx[i]; 2145 x1 = xb[0]; x2 = xb[1]; 2146 ib = idx + ii[0]; 2147 n = ii[1] - ii[0]; ii++; 2148 for (j=0; j<n; j++) { 2149 rval = ib[j]*2; 2150 z[rval++] += v[0]*x1 + v[1]*x2; 2151 z[rval++] += v[2]*x1 + v[3]*x2; 2152 v += 4; 2153 } 2154 if (!usecprow) xb += 2; 2155 } 2156 break; 2157 case 3: 2158 for (i=0; i<mbs; i++) { 2159 if (usecprow) xb = x + 3*ridx[i]; 2160 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2161 ib = idx + ii[0]; 2162 n = ii[1] - ii[0]; ii++; 2163 for (j=0; j<n; j++) { 2164 rval = ib[j]*3; 2165 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3; 2166 z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3; 2167 z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3; 2168 v += 9; 2169 } 2170 if (!usecprow) xb += 3; 2171 } 2172 break; 2173 case 4: 2174 for (i=0; i<mbs; i++) { 2175 if (usecprow) xb = x + 4*ridx[i]; 2176 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 2177 ib = idx + ii[0]; 2178 n = ii[1] - ii[0]; ii++; 2179 for (j=0; j<n; j++) { 2180 rval = ib[j]*4; 2181 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 2182 z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 2183 z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 2184 z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 2185 v += 16; 2186 } 2187 if (!usecprow) xb += 4; 2188 } 2189 break; 2190 case 5: 2191 for (i=0; i<mbs; i++) { 2192 if (usecprow) xb = x + 5*ridx[i]; 2193 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2194 x4 = xb[3]; x5 = xb[4]; 2195 ib = idx + ii[0]; 2196 n = ii[1] - ii[0]; ii++; 2197 for (j=0; j<n; j++) { 2198 rval = ib[j]*5; 2199 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 2200 z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 2201 z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 2202 z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 2203 z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 2204 v += 25; 2205 } 2206 if (!usecprow) xb += 5; 2207 } 2208 break; 2209 default: { /* block sizes larger then 5 by 5 are handled by BLAS */ 2210 PetscInt ncols,k; 2211 PetscScalar *work,*workt; 2212 const PetscScalar *xtmp; 2213 if (!a->mult_work) { 2214 k = PetscMax(A->rmap->n,A->cmap->n); 2215 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 2216 } 2217 work = a->mult_work; 2218 xtmp = x; 2219 for (i=0; i<mbs; i++) { 2220 n = ii[1] - ii[0]; ii++; 2221 ncols = n*bs; 2222 ierr = PetscArrayzero(work,ncols);CHKERRQ(ierr); 2223 if (usecprow) xtmp = x + bs*ridx[i]; 2224 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 2225 /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */ 2226 v += n*bs2; 2227 if (!usecprow) xtmp += bs; 2228 workt = work; 2229 for (j=0; j<n; j++) { 2230 zb = z + bs*(*idx++); 2231 for (k=0; k<bs; k++) zb[k] += workt[k]; 2232 workt += bs; 2233 } 2234 } 2235 } 2236 } 2237 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 2238 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 2239 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 2240 PetscFunctionReturn(0); 2241 } 2242 2243 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha) 2244 { 2245 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data; 2246 PetscInt totalnz = a->bs2*a->nz; 2247 PetscScalar oalpha = alpha; 2248 PetscErrorCode ierr; 2249 PetscBLASInt one = 1,tnz; 2250 2251 PetscFunctionBegin; 2252 ierr = PetscBLASIntCast(totalnz,&tnz);CHKERRQ(ierr); 2253 PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one)); 2254 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 2255 PetscFunctionReturn(0); 2256 } 2257 2258 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm) 2259 { 2260 PetscErrorCode ierr; 2261 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2262 MatScalar *v = a->a; 2263 PetscReal sum = 0.0; 2264 PetscInt i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1; 2265 2266 PetscFunctionBegin; 2267 if (type == NORM_FROBENIUS) { 2268 #if defined(PETSC_USE_REAL___FP16) 2269 PetscBLASInt one = 1,cnt = bs2*nz; 2270 *norm = BLASnrm2_(&cnt,v,&one); 2271 #else 2272 for (i=0; i<bs2*nz; i++) { 2273 sum += PetscRealPart(PetscConj(*v)*(*v)); v++; 2274 } 2275 #endif 2276 *norm = PetscSqrtReal(sum); 2277 ierr = PetscLogFlops(2*bs2*nz);CHKERRQ(ierr); 2278 } else if (type == NORM_1) { /* maximum column sum */ 2279 PetscReal *tmp; 2280 PetscInt *bcol = a->j; 2281 ierr = PetscCalloc1(A->cmap->n+1,&tmp);CHKERRQ(ierr); 2282 for (i=0; i<nz; i++) { 2283 for (j=0; j<bs; j++) { 2284 k1 = bs*(*bcol) + j; /* column index */ 2285 for (k=0; k<bs; k++) { 2286 tmp[k1] += PetscAbsScalar(*v); v++; 2287 } 2288 } 2289 bcol++; 2290 } 2291 *norm = 0.0; 2292 for (j=0; j<A->cmap->n; j++) { 2293 if (tmp[j] > *norm) *norm = tmp[j]; 2294 } 2295 ierr = PetscFree(tmp);CHKERRQ(ierr); 2296 ierr = PetscLogFlops(PetscMax(bs2*nz-1,0));CHKERRQ(ierr); 2297 } else if (type == NORM_INFINITY) { /* maximum row sum */ 2298 *norm = 0.0; 2299 for (k=0; k<bs; k++) { 2300 for (j=0; j<a->mbs; j++) { 2301 v = a->a + bs2*a->i[j] + k; 2302 sum = 0.0; 2303 for (i=0; i<a->i[j+1]-a->i[j]; i++) { 2304 for (k1=0; k1<bs; k1++) { 2305 sum += PetscAbsScalar(*v); 2306 v += bs; 2307 } 2308 } 2309 if (sum > *norm) *norm = sum; 2310 } 2311 } 2312 ierr = PetscLogFlops(PetscMax(bs2*nz-1,0));CHKERRQ(ierr); 2313 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet"); 2314 PetscFunctionReturn(0); 2315 } 2316 2317 2318 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg) 2319 { 2320 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data; 2321 PetscErrorCode ierr; 2322 2323 PetscFunctionBegin; 2324 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 2325 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 2326 *flg = PETSC_FALSE; 2327 PetscFunctionReturn(0); 2328 } 2329 2330 /* if the a->i are the same */ 2331 ierr = PetscArraycmp(a->i,b->i,a->mbs+1,flg);CHKERRQ(ierr); 2332 if (!*flg) PetscFunctionReturn(0); 2333 2334 /* if a->j are the same */ 2335 ierr = PetscArraycmp(a->j,b->j,a->nz,flg);CHKERRQ(ierr); 2336 if (!*flg) PetscFunctionReturn(0); 2337 2338 /* if a->a are the same */ 2339 ierr = PetscArraycmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs),flg);CHKERRQ(ierr); 2340 PetscFunctionReturn(0); 2341 2342 } 2343 2344 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v) 2345 { 2346 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2347 PetscErrorCode ierr; 2348 PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 2349 PetscScalar *x,zero = 0.0; 2350 MatScalar *aa,*aa_j; 2351 2352 PetscFunctionBegin; 2353 if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 2354 bs = A->rmap->bs; 2355 aa = a->a; 2356 ai = a->i; 2357 aj = a->j; 2358 ambs = a->mbs; 2359 bs2 = a->bs2; 2360 2361 ierr = VecSet(v,zero);CHKERRQ(ierr); 2362 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 2363 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 2364 if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 2365 for (i=0; i<ambs; i++) { 2366 for (j=ai[i]; j<ai[i+1]; j++) { 2367 if (aj[j] == i) { 2368 row = i*bs; 2369 aa_j = aa+j*bs2; 2370 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 2371 break; 2372 } 2373 } 2374 } 2375 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 2376 PetscFunctionReturn(0); 2377 } 2378 2379 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr) 2380 { 2381 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2382 const PetscScalar *l,*r,*li,*ri; 2383 PetscScalar x; 2384 MatScalar *aa, *v; 2385 PetscErrorCode ierr; 2386 PetscInt i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai; 2387 const PetscInt *ai,*aj; 2388 2389 PetscFunctionBegin; 2390 ai = a->i; 2391 aj = a->j; 2392 aa = a->a; 2393 m = A->rmap->n; 2394 n = A->cmap->n; 2395 bs = A->rmap->bs; 2396 mbs = a->mbs; 2397 bs2 = a->bs2; 2398 if (ll) { 2399 ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr); 2400 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 2401 if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 2402 for (i=0; i<mbs; i++) { /* for each block row */ 2403 M = ai[i+1] - ai[i]; 2404 li = l + i*bs; 2405 v = aa + bs2*ai[i]; 2406 for (j=0; j<M; j++) { /* for each block */ 2407 for (k=0; k<bs2; k++) { 2408 (*v++) *= li[k%bs]; 2409 } 2410 } 2411 } 2412 ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr); 2413 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2414 } 2415 2416 if (rr) { 2417 ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr); 2418 ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr); 2419 if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length"); 2420 for (i=0; i<mbs; i++) { /* for each block row */ 2421 iai = ai[i]; 2422 M = ai[i+1] - iai; 2423 v = aa + bs2*iai; 2424 for (j=0; j<M; j++) { /* for each block */ 2425 ri = r + bs*aj[iai+j]; 2426 for (k=0; k<bs; k++) { 2427 x = ri[k]; 2428 for (tmp=0; tmp<bs; tmp++) v[tmp] *= x; 2429 v += bs; 2430 } 2431 } 2432 } 2433 ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr); 2434 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2435 } 2436 PetscFunctionReturn(0); 2437 } 2438 2439 2440 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info) 2441 { 2442 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2443 2444 PetscFunctionBegin; 2445 info->block_size = a->bs2; 2446 info->nz_allocated = a->bs2*a->maxnz; 2447 info->nz_used = a->bs2*a->nz; 2448 info->nz_unneeded = info->nz_allocated - info->nz_used; 2449 info->assemblies = A->num_ass; 2450 info->mallocs = A->info.mallocs; 2451 info->memory = ((PetscObject)A)->mem; 2452 if (A->factortype) { 2453 info->fill_ratio_given = A->info.fill_ratio_given; 2454 info->fill_ratio_needed = A->info.fill_ratio_needed; 2455 info->factor_mallocs = A->info.factor_mallocs; 2456 } else { 2457 info->fill_ratio_given = 0; 2458 info->fill_ratio_needed = 0; 2459 info->factor_mallocs = 0; 2460 } 2461 PetscFunctionReturn(0); 2462 } 2463 2464 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) 2465 { 2466 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2467 PetscErrorCode ierr; 2468 2469 PetscFunctionBegin; 2470 ierr = PetscArrayzero(a->a,a->bs2*a->i[a->mbs]);CHKERRQ(ierr); 2471 PetscFunctionReturn(0); 2472 } 2473 2474 PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C) 2475 { 2476 PetscErrorCode ierr; 2477 2478 PetscFunctionBegin; 2479 ierr = MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);CHKERRQ(ierr); 2480 2481 C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense; 2482 PetscFunctionReturn(0); 2483 } 2484 2485 PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 2486 { 2487 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2488 PetscScalar *z = 0,sum1; 2489 const PetscScalar *xb; 2490 PetscScalar x1; 2491 const MatScalar *v,*vv; 2492 PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL; 2493 PetscBool usecprow=a->compressedrow.use; 2494 2495 PetscFunctionBegin; 2496 idx = a->j; 2497 v = a->a; 2498 if (usecprow) { 2499 mbs = a->compressedrow.nrows; 2500 ii = a->compressedrow.i; 2501 ridx = a->compressedrow.rindex; 2502 } else { 2503 mbs = a->mbs; 2504 ii = a->i; 2505 z = c; 2506 } 2507 2508 for (i=0; i<mbs; i++) { 2509 n = ii[1] - ii[0]; ii++; 2510 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2511 PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2512 if (usecprow) z = c + ridx[i]; 2513 jj = idx; 2514 vv = v; 2515 for (k=0; k<cn; k++) { 2516 idx = jj; 2517 v = vv; 2518 sum1 = 0.0; 2519 for (j=0; j<n; j++) { 2520 xb = b + (*idx++); x1 = xb[0+k*bm]; 2521 sum1 += v[0]*x1; 2522 v += 1; 2523 } 2524 z[0+k*cm] = sum1; 2525 } 2526 if (!usecprow) z += 1; 2527 } 2528 PetscFunctionReturn(0); 2529 } 2530 2531 PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 2532 { 2533 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2534 PetscScalar *z = 0,sum1,sum2; 2535 const PetscScalar *xb; 2536 PetscScalar x1,x2; 2537 const MatScalar *v,*vv; 2538 PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL; 2539 PetscBool usecprow=a->compressedrow.use; 2540 2541 PetscFunctionBegin; 2542 idx = a->j; 2543 v = a->a; 2544 if (usecprow) { 2545 mbs = a->compressedrow.nrows; 2546 ii = a->compressedrow.i; 2547 ridx = a->compressedrow.rindex; 2548 } else { 2549 mbs = a->mbs; 2550 ii = a->i; 2551 z = c; 2552 } 2553 2554 for (i=0; i<mbs; i++) { 2555 n = ii[1] - ii[0]; ii++; 2556 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2557 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2558 if (usecprow) z = c + 2*ridx[i]; 2559 jj = idx; 2560 vv = v; 2561 for (k=0; k<cn; k++) { 2562 idx = jj; 2563 v = vv; 2564 sum1 = 0.0; sum2 = 0.0; 2565 for (j=0; j<n; j++) { 2566 xb = b + 2*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; 2567 sum1 += v[0]*x1 + v[2]*x2; 2568 sum2 += v[1]*x1 + v[3]*x2; 2569 v += 4; 2570 } 2571 z[0+k*cm] = sum1; z[1+k*cm] = sum2; 2572 } 2573 if (!usecprow) z += 2; 2574 } 2575 PetscFunctionReturn(0); 2576 } 2577 2578 PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 2579 { 2580 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2581 PetscScalar *z = 0,sum1,sum2,sum3; 2582 const PetscScalar *xb; 2583 PetscScalar x1,x2,x3; 2584 const MatScalar *v,*vv; 2585 PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL; 2586 PetscBool usecprow=a->compressedrow.use; 2587 2588 PetscFunctionBegin; 2589 idx = a->j; 2590 v = a->a; 2591 if (usecprow) { 2592 mbs = a->compressedrow.nrows; 2593 ii = a->compressedrow.i; 2594 ridx = a->compressedrow.rindex; 2595 } else { 2596 mbs = a->mbs; 2597 ii = a->i; 2598 z = c; 2599 } 2600 2601 for (i=0; i<mbs; i++) { 2602 n = ii[1] - ii[0]; ii++; 2603 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2604 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2605 if (usecprow) z = c + 3*ridx[i]; 2606 jj = idx; 2607 vv = v; 2608 for (k=0; k<cn; k++) { 2609 idx = jj; 2610 v = vv; 2611 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 2612 for (j=0; j<n; j++) { 2613 xb = b + 3*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; 2614 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 2615 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 2616 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 2617 v += 9; 2618 } 2619 z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; 2620 } 2621 if (!usecprow) z += 3; 2622 } 2623 PetscFunctionReturn(0); 2624 } 2625 2626 PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 2627 { 2628 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2629 PetscScalar *z = 0,sum1,sum2,sum3,sum4; 2630 const PetscScalar *xb; 2631 PetscScalar x1,x2,x3,x4; 2632 const MatScalar *v,*vv; 2633 PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL; 2634 PetscBool usecprow=a->compressedrow.use; 2635 2636 PetscFunctionBegin; 2637 idx = a->j; 2638 v = a->a; 2639 if (usecprow) { 2640 mbs = a->compressedrow.nrows; 2641 ii = a->compressedrow.i; 2642 ridx = a->compressedrow.rindex; 2643 } else { 2644 mbs = a->mbs; 2645 ii = a->i; 2646 z = c; 2647 } 2648 2649 for (i=0; i<mbs; i++) { 2650 n = ii[1] - ii[0]; ii++; 2651 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2652 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2653 if (usecprow) z = c + 4*ridx[i]; 2654 jj = idx; 2655 vv = v; 2656 for (k=0; k<cn; k++) { 2657 idx = jj; 2658 v = vv; 2659 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; 2660 for (j=0; j<n; j++) { 2661 xb = b + 4*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm]; 2662 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2663 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2664 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2665 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2666 v += 16; 2667 } 2668 z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; z[3+k*cm] = sum4; 2669 } 2670 if (!usecprow) z += 4; 2671 } 2672 PetscFunctionReturn(0); 2673 } 2674 2675 PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 2676 { 2677 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2678 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5; 2679 const PetscScalar *xb; 2680 PetscScalar x1,x2,x3,x4,x5; 2681 const MatScalar *v,*vv; 2682 PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL; 2683 PetscBool usecprow=a->compressedrow.use; 2684 2685 PetscFunctionBegin; 2686 idx = a->j; 2687 v = a->a; 2688 if (usecprow) { 2689 mbs = a->compressedrow.nrows; 2690 ii = a->compressedrow.i; 2691 ridx = a->compressedrow.rindex; 2692 } else { 2693 mbs = a->mbs; 2694 ii = a->i; 2695 z = c; 2696 } 2697 2698 for (i=0; i<mbs; i++) { 2699 n = ii[1] - ii[0]; ii++; 2700 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2701 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2702 if (usecprow) z = c + 5*ridx[i]; 2703 jj = idx; 2704 vv = v; 2705 for (k=0; k<cn; k++) { 2706 idx = jj; 2707 v = vv; 2708 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 2709 for (j=0; j<n; j++) { 2710 xb = b + 5*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm]; x5 = xb[4+k*bm]; 2711 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2712 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2713 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2714 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2715 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2716 v += 25; 2717 } 2718 z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; z[3+k*cm] = sum4; z[4+k*cm] = sum5; 2719 } 2720 if (!usecprow) z += 5; 2721 } 2722 PetscFunctionReturn(0); 2723 } 2724 2725 PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A,Mat B,Mat C) 2726 { 2727 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2728 Mat_SeqDense *bd = (Mat_SeqDense*)B->data; 2729 Mat_SeqDense *cd = (Mat_SeqDense*)C->data; 2730 PetscInt cm=cd->lda,cn=B->cmap->n,bm=bd->lda; 2731 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 2732 PetscBLASInt bbs,bcn,bbm,bcm; 2733 PetscScalar *z = 0; 2734 PetscScalar *c,*b; 2735 const MatScalar *v; 2736 const PetscInt *idx,*ii,*ridx=NULL; 2737 PetscScalar _DZero=0.0,_DOne=1.0; 2738 PetscBool usecprow=a->compressedrow.use; 2739 PetscErrorCode ierr; 2740 2741 PetscFunctionBegin; 2742 if (!cm || !cn) PetscFunctionReturn(0); 2743 if (B->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,B->rmap->n); 2744 if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n); 2745 if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n); 2746 b = bd->v; 2747 if (a->nonzerorowcnt != A->rmap->n) { 2748 ierr = MatZeroEntries(C);CHKERRQ(ierr); 2749 } 2750 ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr); 2751 switch (bs) { 2752 case 1: 2753 ierr = MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn); 2754 break; 2755 case 2: 2756 ierr = MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn); 2757 break; 2758 case 3: 2759 ierr = MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn); 2760 break; 2761 case 4: 2762 ierr = MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn); 2763 break; 2764 case 5: 2765 ierr = MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn); 2766 break; 2767 default: /* block sizes larger than 5 by 5 are handled by BLAS */ 2768 ierr = PetscBLASIntCast(bs,&bbs);CHKERRQ(ierr); 2769 ierr = PetscBLASIntCast(cn,&bcn);CHKERRQ(ierr); 2770 ierr = PetscBLASIntCast(bm,&bbm);CHKERRQ(ierr); 2771 ierr = PetscBLASIntCast(cm,&bcm);CHKERRQ(ierr); 2772 idx = a->j; 2773 v = a->a; 2774 if (usecprow) { 2775 mbs = a->compressedrow.nrows; 2776 ii = a->compressedrow.i; 2777 ridx = a->compressedrow.rindex; 2778 } else { 2779 mbs = a->mbs; 2780 ii = a->i; 2781 z = c; 2782 } 2783 for (i=0; i<mbs; i++) { 2784 n = ii[1] - ii[0]; ii++; 2785 if (usecprow) z = c + bs*ridx[i]; 2786 if (n) { 2787 PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DZero,z,&bcm)); 2788 v += bs2; 2789 } 2790 for (j=1; j<n; j++) { 2791 PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DOne,z,&bcm)); 2792 v += bs2; 2793 } 2794 if (!usecprow) z += bs; 2795 } 2796 } 2797 ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr); 2798 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2799 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2800 ierr = PetscLogFlops((2.0*a->nz*bs2 - bs*a->nonzerorowcnt)*cn);CHKERRQ(ierr); 2801 PetscFunctionReturn(0); 2802 } 2803