1 2 #include <../src/mat/impls/baij/seq/baij.h> 3 #include <../src/mat/impls/dense/seq/dense.h> 4 #include <petsc/private/kernels/blockinvert.h> 5 #include <petscbt.h> 6 #include <petscblaslapack.h> 7 8 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 9 #include <immintrin.h> 10 #endif 11 12 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 13 { 14 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 15 PetscErrorCode ierr; 16 PetscInt row,i,j,k,l,m,n,*nidx,isz,val,ival; 17 const PetscInt *idx; 18 PetscInt start,end,*ai,*aj,bs,*nidx2; 19 PetscBT table; 20 21 PetscFunctionBegin; 22 m = a->mbs; 23 ai = a->i; 24 aj = a->j; 25 bs = A->rmap->bs; 26 27 if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 28 29 ierr = PetscBTCreate(m,&table);CHKERRQ(ierr); 30 ierr = PetscMalloc1(m+1,&nidx);CHKERRQ(ierr); 31 ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr); 32 33 for (i=0; i<is_max; i++) { 34 /* Initialise the two local arrays */ 35 isz = 0; 36 ierr = PetscBTMemzero(m,table);CHKERRQ(ierr); 37 38 /* Extract the indices, assume there can be duplicate entries */ 39 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 40 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 41 42 /* Enter these into the temp arrays i.e mark table[row], enter row into new index */ 43 for (j=0; j<n; ++j) { 44 ival = idx[j]/bs; /* convert the indices into block indices */ 45 if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 46 if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival; 47 } 48 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 49 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 50 51 k = 0; 52 for (j=0; j<ov; j++) { /* for each overlap*/ 53 n = isz; 54 for (; k<n; k++) { /* do only those rows in nidx[k], which are not done yet */ 55 row = nidx[k]; 56 start = ai[row]; 57 end = ai[row+1]; 58 for (l = start; l<end; l++) { 59 val = aj[l]; 60 if (!PetscBTLookupSet(table,val)) nidx[isz++] = val; 61 } 62 } 63 } 64 /* expand the Index Set */ 65 for (j=0; j<isz; j++) { 66 for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k; 67 } 68 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 69 } 70 ierr = PetscBTDestroy(&table);CHKERRQ(ierr); 71 ierr = PetscFree(nidx);CHKERRQ(ierr); 72 ierr = PetscFree(nidx2);CHKERRQ(ierr); 73 PetscFunctionReturn(0); 74 } 75 76 PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 77 { 78 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c; 79 PetscErrorCode ierr; 80 PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens; 81 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 82 const PetscInt *irow,*icol; 83 PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 84 PetscInt *aj = a->j,*ai = a->i; 85 MatScalar *mat_a; 86 Mat C; 87 PetscBool flag; 88 89 PetscFunctionBegin; 90 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 91 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 92 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 93 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 94 95 ierr = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr); 96 ssmap = smap; 97 ierr = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr); 98 for (i=0; i<ncols; i++) smap[icol[i]] = i+1; 99 /* determine lens of each row */ 100 for (i=0; i<nrows; i++) { 101 kstart = ai[irow[i]]; 102 kend = kstart + a->ilen[irow[i]]; 103 lens[i] = 0; 104 for (k=kstart; k<kend; k++) { 105 if (ssmap[aj[k]]) lens[i]++; 106 } 107 } 108 /* Create and fill new matrix */ 109 if (scall == MAT_REUSE_MATRIX) { 110 c = (Mat_SeqBAIJ*)((*B)->data); 111 112 if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 113 ierr = PetscArraycmp(c->ilen,lens,c->mbs,&flag);CHKERRQ(ierr); 114 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 115 ierr = PetscArrayzero(c->ilen,c->mbs);CHKERRQ(ierr); 116 C = *B; 117 } else { 118 ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr); 119 ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 120 ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 121 ierr = MatSeqBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr); 122 } 123 c = (Mat_SeqBAIJ*)(C->data); 124 for (i=0; i<nrows; i++) { 125 row = irow[i]; 126 kstart = ai[row]; 127 kend = kstart + a->ilen[row]; 128 mat_i = c->i[i]; 129 mat_j = c->j + mat_i; 130 mat_a = c->a + mat_i*bs2; 131 mat_ilen = c->ilen + i; 132 for (k=kstart; k<kend; k++) { 133 if ((tcol=ssmap[a->j[k]])) { 134 *mat_j++ = tcol - 1; 135 ierr = PetscArraycpy(mat_a,a->a+k*bs2,bs2);CHKERRQ(ierr); 136 mat_a += bs2; 137 (*mat_ilen)++; 138 } 139 } 140 } 141 /* sort */ 142 { 143 MatScalar *work; 144 ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr); 145 for (i=0; i<nrows; i++) { 146 PetscInt ilen; 147 mat_i = c->i[i]; 148 mat_j = c->j + mat_i; 149 mat_a = c->a + mat_i*bs2; 150 ilen = c->ilen[i]; 151 ierr = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr); 152 } 153 ierr = PetscFree(work);CHKERRQ(ierr); 154 } 155 156 /* Free work space */ 157 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 158 ierr = PetscFree(smap);CHKERRQ(ierr); 159 ierr = PetscFree(lens);CHKERRQ(ierr); 160 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 161 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 162 163 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 164 *B = C; 165 PetscFunctionReturn(0); 166 } 167 168 PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 169 { 170 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 171 IS is1,is2; 172 PetscErrorCode ierr; 173 PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs,j; 174 const PetscInt *irow,*icol; 175 176 PetscFunctionBegin; 177 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 178 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 179 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 180 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 181 182 /* Verify if the indices corespond to each element in a block 183 and form the IS with compressed IS */ 184 maxmnbs = PetscMax(a->mbs,a->nbs); 185 ierr = PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);CHKERRQ(ierr); 186 ierr = PetscArrayzero(vary,a->mbs);CHKERRQ(ierr); 187 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 188 for (i=0; i<a->mbs; i++) { 189 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks"); 190 } 191 count = 0; 192 for (i=0; i<nrows; i++) { 193 j = irow[i] / bs; 194 if ((vary[j]--)==bs) iary[count++] = j; 195 } 196 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr); 197 198 ierr = PetscArrayzero(vary,a->nbs);CHKERRQ(ierr); 199 for (i=0; i<ncols; i++) vary[icol[i]/bs]++; 200 for (i=0; i<a->nbs; i++) { 201 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc"); 202 } 203 count = 0; 204 for (i=0; i<ncols; i++) { 205 j = icol[i] / bs; 206 if ((vary[j]--)==bs) iary[count++] = j; 207 } 208 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr); 209 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 210 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 211 ierr = PetscFree2(vary,iary);CHKERRQ(ierr); 212 213 ierr = MatCreateSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr); 214 ierr = ISDestroy(&is1);CHKERRQ(ierr); 215 ierr = ISDestroy(&is2);CHKERRQ(ierr); 216 PetscFunctionReturn(0); 217 } 218 219 PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C) 220 { 221 PetscErrorCode ierr; 222 Mat_SeqBAIJ *c = (Mat_SeqBAIJ*)C->data; 223 Mat_SubSppt *submatj = c->submatis1; 224 225 PetscFunctionBegin; 226 ierr = (*submatj->destroy)(C);CHKERRQ(ierr); 227 ierr = MatDestroySubMatrix_Private(submatj);CHKERRQ(ierr); 228 PetscFunctionReturn(0); 229 } 230 231 PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n,Mat *mat[]) 232 { 233 PetscErrorCode ierr; 234 PetscInt i; 235 Mat C; 236 Mat_SeqBAIJ *c; 237 Mat_SubSppt *submatj; 238 239 PetscFunctionBegin; 240 for (i=0; i<n; i++) { 241 C = (*mat)[i]; 242 c = (Mat_SeqBAIJ*)C->data; 243 submatj = c->submatis1; 244 if (submatj) { 245 ierr = (*submatj->destroy)(C);CHKERRQ(ierr); 246 ierr = MatDestroySubMatrix_Private(submatj);CHKERRQ(ierr); 247 ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr); 248 ierr = PetscLayoutDestroy(&C->rmap);CHKERRQ(ierr); 249 ierr = PetscLayoutDestroy(&C->cmap);CHKERRQ(ierr); 250 ierr = PetscHeaderDestroy(&C);CHKERRQ(ierr); 251 } else { 252 ierr = MatDestroy(&C);CHKERRQ(ierr); 253 } 254 } 255 ierr = PetscFree(*mat);CHKERRQ(ierr); 256 PetscFunctionReturn(0); 257 } 258 259 PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 260 { 261 PetscErrorCode ierr; 262 PetscInt i; 263 264 PetscFunctionBegin; 265 if (scall == MAT_INITIAL_MATRIX) { 266 ierr = PetscCalloc1(n+1,B);CHKERRQ(ierr); 267 } 268 269 for (i=0; i<n; i++) { 270 ierr = MatCreateSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr); 271 } 272 PetscFunctionReturn(0); 273 } 274 275 /* -------------------------------------------------------*/ 276 /* Should check that shapes of vectors and matrices match */ 277 /* -------------------------------------------------------*/ 278 279 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz) 280 { 281 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 282 PetscScalar *z,sum; 283 const PetscScalar *x; 284 const MatScalar *v; 285 PetscErrorCode ierr; 286 PetscInt mbs,i,n; 287 const PetscInt *idx,*ii,*ridx=NULL; 288 PetscBool usecprow=a->compressedrow.use; 289 290 PetscFunctionBegin; 291 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 292 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 293 294 if (usecprow) { 295 mbs = a->compressedrow.nrows; 296 ii = a->compressedrow.i; 297 ridx = a->compressedrow.rindex; 298 ierr = PetscArrayzero(z,a->mbs);CHKERRQ(ierr); 299 } else { 300 mbs = a->mbs; 301 ii = a->i; 302 } 303 304 for (i=0; i<mbs; i++) { 305 n = ii[1] - ii[0]; 306 v = a->a + ii[0]; 307 idx = a->j + ii[0]; 308 ii++; 309 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 310 PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 311 sum = 0.0; 312 PetscSparseDensePlusDot(sum,x,v,idx,n); 313 if (usecprow) { 314 z[ridx[i]] = sum; 315 } else { 316 z[i] = sum; 317 } 318 } 319 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 320 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 321 ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr); 322 PetscFunctionReturn(0); 323 } 324 325 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz) 326 { 327 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 328 PetscScalar *z = 0,sum1,sum2,*zarray; 329 const PetscScalar *x,*xb; 330 PetscScalar x1,x2; 331 const MatScalar *v; 332 PetscErrorCode ierr; 333 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 334 PetscBool usecprow=a->compressedrow.use; 335 336 PetscFunctionBegin; 337 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 338 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 339 340 idx = a->j; 341 v = a->a; 342 if (usecprow) { 343 mbs = a->compressedrow.nrows; 344 ii = a->compressedrow.i; 345 ridx = a->compressedrow.rindex; 346 ierr = PetscArrayzero(zarray,2*a->mbs);CHKERRQ(ierr); 347 } else { 348 mbs = a->mbs; 349 ii = a->i; 350 z = zarray; 351 } 352 353 for (i=0; i<mbs; i++) { 354 n = ii[1] - ii[0]; ii++; 355 sum1 = 0.0; sum2 = 0.0; 356 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 357 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 358 for (j=0; j<n; j++) { 359 xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1]; 360 sum1 += v[0]*x1 + v[2]*x2; 361 sum2 += v[1]*x1 + v[3]*x2; 362 v += 4; 363 } 364 if (usecprow) z = zarray + 2*ridx[i]; 365 z[0] = sum1; z[1] = sum2; 366 if (!usecprow) z += 2; 367 } 368 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 369 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 370 ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr); 371 PetscFunctionReturn(0); 372 } 373 374 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz) 375 { 376 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 377 PetscScalar *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray; 378 const PetscScalar *x,*xb; 379 const MatScalar *v; 380 PetscErrorCode ierr; 381 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 382 PetscBool usecprow=a->compressedrow.use; 383 384 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 385 #pragma disjoint(*v,*z,*xb) 386 #endif 387 388 PetscFunctionBegin; 389 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 390 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 391 392 idx = a->j; 393 v = a->a; 394 if (usecprow) { 395 mbs = a->compressedrow.nrows; 396 ii = a->compressedrow.i; 397 ridx = a->compressedrow.rindex; 398 ierr = PetscArrayzero(zarray,3*a->mbs);CHKERRQ(ierr); 399 } else { 400 mbs = a->mbs; 401 ii = a->i; 402 z = zarray; 403 } 404 405 for (i=0; i<mbs; i++) { 406 n = ii[1] - ii[0]; ii++; 407 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 408 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 409 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 410 for (j=0; j<n; j++) { 411 xb = x + 3*(*idx++); 412 x1 = xb[0]; 413 x2 = xb[1]; 414 x3 = xb[2]; 415 416 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 417 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 418 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 419 v += 9; 420 } 421 if (usecprow) z = zarray + 3*ridx[i]; 422 z[0] = sum1; z[1] = sum2; z[2] = sum3; 423 if (!usecprow) z += 3; 424 } 425 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 426 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 427 ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr); 428 PetscFunctionReturn(0); 429 } 430 431 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz) 432 { 433 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 434 PetscScalar *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray; 435 const PetscScalar *x,*xb; 436 const MatScalar *v; 437 PetscErrorCode ierr; 438 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 439 PetscBool usecprow=a->compressedrow.use; 440 441 PetscFunctionBegin; 442 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 443 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 444 445 idx = a->j; 446 v = a->a; 447 if (usecprow) { 448 mbs = a->compressedrow.nrows; 449 ii = a->compressedrow.i; 450 ridx = a->compressedrow.rindex; 451 ierr = PetscArrayzero(zarray,4*a->mbs);CHKERRQ(ierr); 452 } else { 453 mbs = a->mbs; 454 ii = a->i; 455 z = zarray; 456 } 457 458 for (i=0; i<mbs; i++) { 459 n = ii[1] - ii[0]; 460 ii++; 461 sum1 = 0.0; 462 sum2 = 0.0; 463 sum3 = 0.0; 464 sum4 = 0.0; 465 466 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 467 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 468 for (j=0; j<n; j++) { 469 xb = x + 4*(*idx++); 470 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 471 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 472 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 473 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 474 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 475 v += 16; 476 } 477 if (usecprow) z = zarray + 4*ridx[i]; 478 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 479 if (!usecprow) z += 4; 480 } 481 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 482 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 483 ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr); 484 PetscFunctionReturn(0); 485 } 486 487 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz) 488 { 489 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 490 PetscScalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray; 491 const PetscScalar *xb,*x; 492 const MatScalar *v; 493 PetscErrorCode ierr; 494 const PetscInt *idx,*ii,*ridx=NULL; 495 PetscInt mbs,i,j,n; 496 PetscBool usecprow=a->compressedrow.use; 497 498 PetscFunctionBegin; 499 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 500 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 501 502 idx = a->j; 503 v = a->a; 504 if (usecprow) { 505 mbs = a->compressedrow.nrows; 506 ii = a->compressedrow.i; 507 ridx = a->compressedrow.rindex; 508 ierr = PetscArrayzero(zarray,5*a->mbs);CHKERRQ(ierr); 509 } else { 510 mbs = a->mbs; 511 ii = a->i; 512 z = zarray; 513 } 514 515 for (i=0; i<mbs; i++) { 516 n = ii[1] - ii[0]; ii++; 517 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 518 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 519 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 520 for (j=0; j<n; j++) { 521 xb = x + 5*(*idx++); 522 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 523 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 524 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 525 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 526 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 527 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 528 v += 25; 529 } 530 if (usecprow) z = zarray + 5*ridx[i]; 531 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 532 if (!usecprow) z += 5; 533 } 534 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 535 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 536 ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr); 537 PetscFunctionReturn(0); 538 } 539 540 541 542 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz) 543 { 544 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 545 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6; 546 const PetscScalar *x,*xb; 547 PetscScalar x1,x2,x3,x4,x5,x6,*zarray; 548 const MatScalar *v; 549 PetscErrorCode ierr; 550 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 551 PetscBool usecprow=a->compressedrow.use; 552 553 PetscFunctionBegin; 554 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 555 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 556 557 idx = a->j; 558 v = a->a; 559 if (usecprow) { 560 mbs = a->compressedrow.nrows; 561 ii = a->compressedrow.i; 562 ridx = a->compressedrow.rindex; 563 ierr = PetscArrayzero(zarray,6*a->mbs);CHKERRQ(ierr); 564 } else { 565 mbs = a->mbs; 566 ii = a->i; 567 z = zarray; 568 } 569 570 for (i=0; i<mbs; i++) { 571 n = ii[1] - ii[0]; 572 ii++; 573 sum1 = 0.0; 574 sum2 = 0.0; 575 sum3 = 0.0; 576 sum4 = 0.0; 577 sum5 = 0.0; 578 sum6 = 0.0; 579 580 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 581 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 582 for (j=0; j<n; j++) { 583 xb = x + 6*(*idx++); 584 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 585 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 586 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 587 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 588 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 589 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 590 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 591 v += 36; 592 } 593 if (usecprow) z = zarray + 6*ridx[i]; 594 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 595 if (!usecprow) z += 6; 596 } 597 598 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 599 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 600 ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr); 601 PetscFunctionReturn(0); 602 } 603 604 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz) 605 { 606 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 607 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 608 const PetscScalar *x,*xb; 609 PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray; 610 const MatScalar *v; 611 PetscErrorCode ierr; 612 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 613 PetscBool usecprow=a->compressedrow.use; 614 615 PetscFunctionBegin; 616 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 617 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 618 619 idx = a->j; 620 v = a->a; 621 if (usecprow) { 622 mbs = a->compressedrow.nrows; 623 ii = a->compressedrow.i; 624 ridx = a->compressedrow.rindex; 625 ierr = PetscArrayzero(zarray,7*a->mbs);CHKERRQ(ierr); 626 } else { 627 mbs = a->mbs; 628 ii = a->i; 629 z = zarray; 630 } 631 632 for (i=0; i<mbs; i++) { 633 n = ii[1] - ii[0]; 634 ii++; 635 sum1 = 0.0; 636 sum2 = 0.0; 637 sum3 = 0.0; 638 sum4 = 0.0; 639 sum5 = 0.0; 640 sum6 = 0.0; 641 sum7 = 0.0; 642 643 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 644 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 645 for (j=0; j<n; j++) { 646 xb = x + 7*(*idx++); 647 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 648 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 649 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 650 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 651 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 652 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 653 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 654 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 655 v += 49; 656 } 657 if (usecprow) z = zarray + 7*ridx[i]; 658 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 659 if (!usecprow) z += 7; 660 } 661 662 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 663 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 664 ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr); 665 PetscFunctionReturn(0); 666 } 667 668 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 669 PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec zz) 670 { 671 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 672 PetscScalar *z = 0,*work,*workt,*zarray; 673 const PetscScalar *x,*xb; 674 const MatScalar *v; 675 PetscErrorCode ierr; 676 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 677 const PetscInt *idx,*ii,*ridx=NULL; 678 PetscInt k; 679 PetscBool usecprow=a->compressedrow.use; 680 681 __m256d a0,a1,a2,a3,a4,a5; 682 __m256d w0,w1,w2,w3; 683 __m256d z0,z1,z2; 684 __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63); 685 686 PetscFunctionBegin; 687 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 688 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 689 690 idx = a->j; 691 v = a->a; 692 if (usecprow) { 693 mbs = a->compressedrow.nrows; 694 ii = a->compressedrow.i; 695 ridx = a->compressedrow.rindex; 696 ierr = PetscArrayzero(zarray,bs*a->mbs);CHKERRQ(ierr); 697 } else { 698 mbs = a->mbs; 699 ii = a->i; 700 z = zarray; 701 } 702 703 if (!a->mult_work) { 704 k = PetscMax(A->rmap->n,A->cmap->n); 705 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 706 } 707 708 work = a->mult_work; 709 for (i=0; i<mbs; i++) { 710 n = ii[1] - ii[0]; ii++; 711 workt = work; 712 for (j=0; j<n; j++) { 713 xb = x + bs*(*idx++); 714 for (k=0; k<bs; k++) workt[k] = xb[k]; 715 workt += bs; 716 } 717 if (usecprow) z = zarray + bs*ridx[i]; 718 719 z0 = _mm256_setzero_pd(); z1 = _mm256_setzero_pd(); z2 = _mm256_setzero_pd(); 720 721 for (j=0; j<n; j++) { 722 /* first column of a */ 723 w0 = _mm256_set1_pd(work[j*9 ]); 724 a0 = _mm256_loadu_pd(&v[j*81 ]); z0 = _mm256_fmadd_pd(a0,w0,z0); 725 a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1); 726 a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2); 727 728 /* second column of a */ 729 w1 = _mm256_set1_pd(work[j*9+ 1]); 730 a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0); 731 a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1); 732 a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2); 733 734 /* third column of a */ 735 w2 = _mm256_set1_pd(work[j*9 +2]); 736 a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0); 737 a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1); 738 a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2); 739 740 /* fourth column of a */ 741 w3 = _mm256_set1_pd(work[j*9+ 3]); 742 a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0); 743 a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1); 744 a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2); 745 746 /* fifth column of a */ 747 w0 = _mm256_set1_pd(work[j*9+ 4]); 748 a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0); 749 a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1); 750 a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2); 751 752 /* sixth column of a */ 753 w1 = _mm256_set1_pd(work[j*9+ 5]); 754 a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0); 755 a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1); 756 a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2); 757 758 /* seventh column of a */ 759 w2 = _mm256_set1_pd(work[j*9+ 6]); 760 a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0); 761 a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1); 762 a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2); 763 764 /* eigth column of a */ 765 w3 = _mm256_set1_pd(work[j*9+ 7]); 766 a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0); 767 a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1); 768 a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2); 769 770 /* ninth column of a */ 771 w0 = _mm256_set1_pd(work[j*9+ 8]); 772 a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0); 773 a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1); 774 a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2); 775 } 776 777 _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2); 778 779 v += n*bs2; 780 if (!usecprow) z += bs; 781 } 782 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 783 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 784 ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr); 785 PetscFunctionReturn(0); 786 } 787 #endif 788 789 PetscErrorCode MatMult_SeqBAIJ_11(Mat A,Vec xx,Vec zz) 790 { 791 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 792 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11; 793 const PetscScalar *x,*xb; 794 PetscScalar *zarray,xv; 795 const MatScalar *v; 796 PetscErrorCode ierr; 797 const PetscInt *ii,*ij=a->j,*idx; 798 PetscInt mbs,i,j,k,n,*ridx=NULL; 799 PetscBool usecprow=a->compressedrow.use; 800 801 PetscFunctionBegin; 802 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 803 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 804 805 v = a->a; 806 if (usecprow) { 807 mbs = a->compressedrow.nrows; 808 ii = a->compressedrow.i; 809 ridx = a->compressedrow.rindex; 810 ierr = PetscArrayzero(zarray,11*a->mbs);CHKERRQ(ierr); 811 } else { 812 mbs = a->mbs; 813 ii = a->i; 814 z = zarray; 815 } 816 817 for (i=0; i<mbs; i++) { 818 n = ii[i+1] - ii[i]; 819 idx = ij + ii[i]; 820 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 821 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; 822 823 for (j=0; j<n; j++) { 824 xb = x + 11*(idx[j]); 825 826 for (k=0; k<11; k++) { 827 xv = xb[k]; 828 sum1 += v[0]*xv; 829 sum2 += v[1]*xv; 830 sum3 += v[2]*xv; 831 sum4 += v[3]*xv; 832 sum5 += v[4]*xv; 833 sum6 += v[5]*xv; 834 sum7 += v[6]*xv; 835 sum8 += v[7]*xv; 836 sum9 += v[8]*xv; 837 sum10 += v[9]*xv; 838 sum11 += v[10]*xv; 839 v += 11; 840 } 841 } 842 if (usecprow) z = zarray + 11*ridx[i]; 843 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 844 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; 845 846 if (!usecprow) z += 11; 847 } 848 849 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 850 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 851 ierr = PetscLogFlops(242.0*a->nz - 11.0*a->nonzerorowcnt);CHKERRQ(ierr); 852 PetscFunctionReturn(0); 853 } 854 855 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */ 856 /* Default MatMult for block size 15 */ 857 858 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz) 859 { 860 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 861 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 862 const PetscScalar *x,*xb; 863 PetscScalar *zarray,xv; 864 const MatScalar *v; 865 PetscErrorCode ierr; 866 const PetscInt *ii,*ij=a->j,*idx; 867 PetscInt mbs,i,j,k,n,*ridx=NULL; 868 PetscBool usecprow=a->compressedrow.use; 869 870 PetscFunctionBegin; 871 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 872 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 873 874 v = a->a; 875 if (usecprow) { 876 mbs = a->compressedrow.nrows; 877 ii = a->compressedrow.i; 878 ridx = a->compressedrow.rindex; 879 ierr = PetscArrayzero(zarray,15*a->mbs);CHKERRQ(ierr); 880 } else { 881 mbs = a->mbs; 882 ii = a->i; 883 z = zarray; 884 } 885 886 for (i=0; i<mbs; i++) { 887 n = ii[i+1] - ii[i]; 888 idx = ij + ii[i]; 889 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 890 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 891 892 for (j=0; j<n; j++) { 893 xb = x + 15*(idx[j]); 894 895 for (k=0; k<15; k++) { 896 xv = xb[k]; 897 sum1 += v[0]*xv; 898 sum2 += v[1]*xv; 899 sum3 += v[2]*xv; 900 sum4 += v[3]*xv; 901 sum5 += v[4]*xv; 902 sum6 += v[5]*xv; 903 sum7 += v[6]*xv; 904 sum8 += v[7]*xv; 905 sum9 += v[8]*xv; 906 sum10 += v[9]*xv; 907 sum11 += v[10]*xv; 908 sum12 += v[11]*xv; 909 sum13 += v[12]*xv; 910 sum14 += v[13]*xv; 911 sum15 += v[14]*xv; 912 v += 15; 913 } 914 } 915 if (usecprow) z = zarray + 15*ridx[i]; 916 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 917 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 918 919 if (!usecprow) z += 15; 920 } 921 922 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 923 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 924 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 925 PetscFunctionReturn(0); 926 } 927 928 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */ 929 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz) 930 { 931 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 932 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 933 const PetscScalar *x,*xb; 934 PetscScalar x1,x2,x3,x4,*zarray; 935 const MatScalar *v; 936 PetscErrorCode ierr; 937 const PetscInt *ii,*ij=a->j,*idx; 938 PetscInt mbs,i,j,n,*ridx=NULL; 939 PetscBool usecprow=a->compressedrow.use; 940 941 PetscFunctionBegin; 942 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 943 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 944 945 v = a->a; 946 if (usecprow) { 947 mbs = a->compressedrow.nrows; 948 ii = a->compressedrow.i; 949 ridx = a->compressedrow.rindex; 950 ierr = PetscArrayzero(zarray,15*a->mbs);CHKERRQ(ierr); 951 } else { 952 mbs = a->mbs; 953 ii = a->i; 954 z = zarray; 955 } 956 957 for (i=0; i<mbs; i++) { 958 n = ii[i+1] - ii[i]; 959 idx = ij + ii[i]; 960 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 961 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 962 963 for (j=0; j<n; j++) { 964 xb = x + 15*(idx[j]); 965 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 966 967 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 968 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 969 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 970 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 971 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 972 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 973 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 974 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 975 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 976 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 977 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 978 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 979 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 980 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 981 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 982 983 v += 60; 984 985 x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7]; 986 987 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 988 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 989 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 990 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 991 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 992 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 993 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 994 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 995 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 996 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 997 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 998 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 999 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 1000 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 1001 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 1002 v += 60; 1003 1004 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; 1005 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 1006 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 1007 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 1008 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 1009 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 1010 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 1011 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 1012 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 1013 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 1014 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 1015 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 1016 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 1017 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 1018 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 1019 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 1020 v += 60; 1021 1022 x1 = xb[12]; x2 = xb[13]; x3 = xb[14]; 1023 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3; 1024 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3; 1025 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3; 1026 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3; 1027 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3; 1028 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3; 1029 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3; 1030 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3; 1031 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3; 1032 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3; 1033 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3; 1034 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3; 1035 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3; 1036 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3; 1037 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3; 1038 v += 45; 1039 } 1040 if (usecprow) z = zarray + 15*ridx[i]; 1041 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1042 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1043 1044 if (!usecprow) z += 15; 1045 } 1046 1047 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1048 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1049 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 1050 PetscFunctionReturn(0); 1051 } 1052 1053 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */ 1054 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz) 1055 { 1056 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1057 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1058 const PetscScalar *x,*xb; 1059 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,*zarray; 1060 const MatScalar *v; 1061 PetscErrorCode ierr; 1062 const PetscInt *ii,*ij=a->j,*idx; 1063 PetscInt mbs,i,j,n,*ridx=NULL; 1064 PetscBool usecprow=a->compressedrow.use; 1065 1066 PetscFunctionBegin; 1067 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1068 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1069 1070 v = a->a; 1071 if (usecprow) { 1072 mbs = a->compressedrow.nrows; 1073 ii = a->compressedrow.i; 1074 ridx = a->compressedrow.rindex; 1075 ierr = PetscArrayzero(zarray,15*a->mbs);CHKERRQ(ierr); 1076 } else { 1077 mbs = a->mbs; 1078 ii = a->i; 1079 z = zarray; 1080 } 1081 1082 for (i=0; i<mbs; i++) { 1083 n = ii[i+1] - ii[i]; 1084 idx = ij + ii[i]; 1085 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1086 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1087 1088 for (j=0; j<n; j++) { 1089 xb = x + 15*(idx[j]); 1090 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1091 x8 = xb[7]; 1092 1093 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8; 1094 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8; 1095 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8; 1096 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8; 1097 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8; 1098 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8; 1099 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8; 1100 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8; 1101 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8; 1102 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8; 1103 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8; 1104 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8; 1105 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8; 1106 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8; 1107 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8; 1108 v += 120; 1109 1110 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14]; 1111 1112 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7; 1113 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7; 1114 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7; 1115 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7; 1116 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7; 1117 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7; 1118 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7; 1119 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7; 1120 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7; 1121 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7; 1122 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7; 1123 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7; 1124 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7; 1125 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7; 1126 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7; 1127 v += 105; 1128 } 1129 if (usecprow) z = zarray + 15*ridx[i]; 1130 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1131 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1132 1133 if (!usecprow) z += 15; 1134 } 1135 1136 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1137 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1138 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 1139 PetscFunctionReturn(0); 1140 } 1141 1142 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */ 1143 1144 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz) 1145 { 1146 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1147 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1148 const PetscScalar *x,*xb; 1149 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray; 1150 const MatScalar *v; 1151 PetscErrorCode ierr; 1152 const PetscInt *ii,*ij=a->j,*idx; 1153 PetscInt mbs,i,j,n,*ridx=NULL; 1154 PetscBool usecprow=a->compressedrow.use; 1155 1156 PetscFunctionBegin; 1157 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1158 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1159 1160 v = a->a; 1161 if (usecprow) { 1162 mbs = a->compressedrow.nrows; 1163 ii = a->compressedrow.i; 1164 ridx = a->compressedrow.rindex; 1165 ierr = PetscArrayzero(zarray,15*a->mbs);CHKERRQ(ierr); 1166 } else { 1167 mbs = a->mbs; 1168 ii = a->i; 1169 z = zarray; 1170 } 1171 1172 for (i=0; i<mbs; i++) { 1173 n = ii[i+1] - ii[i]; 1174 idx = ij + ii[i]; 1175 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1176 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1177 1178 for (j=0; j<n; j++) { 1179 xb = x + 15*(idx[j]); 1180 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1181 x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14]; 1182 1183 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 1184 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 1185 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 1186 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 1187 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 1188 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 1189 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 1190 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 1191 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 1192 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 1193 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 1194 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 1195 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 1196 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 1197 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 1198 v += 225; 1199 } 1200 if (usecprow) z = zarray + 15*ridx[i]; 1201 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1202 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1203 1204 if (!usecprow) z += 15; 1205 } 1206 1207 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1208 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1209 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 1210 PetscFunctionReturn(0); 1211 } 1212 1213 1214 /* 1215 This will not work with MatScalar == float because it calls the BLAS 1216 */ 1217 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz) 1218 { 1219 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1220 PetscScalar *z = 0,*work,*workt,*zarray; 1221 const PetscScalar *x,*xb; 1222 const MatScalar *v; 1223 PetscErrorCode ierr; 1224 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 1225 const PetscInt *idx,*ii,*ridx=NULL; 1226 PetscInt ncols,k; 1227 PetscBool usecprow=a->compressedrow.use; 1228 1229 PetscFunctionBegin; 1230 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1231 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1232 1233 idx = a->j; 1234 v = a->a; 1235 if (usecprow) { 1236 mbs = a->compressedrow.nrows; 1237 ii = a->compressedrow.i; 1238 ridx = a->compressedrow.rindex; 1239 ierr = PetscArrayzero(zarray,bs*a->mbs);CHKERRQ(ierr); 1240 } else { 1241 mbs = a->mbs; 1242 ii = a->i; 1243 z = zarray; 1244 } 1245 1246 if (!a->mult_work) { 1247 k = PetscMax(A->rmap->n,A->cmap->n); 1248 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 1249 } 1250 work = a->mult_work; 1251 for (i=0; i<mbs; i++) { 1252 n = ii[1] - ii[0]; ii++; 1253 ncols = n*bs; 1254 workt = work; 1255 for (j=0; j<n; j++) { 1256 xb = x + bs*(*idx++); 1257 for (k=0; k<bs; k++) workt[k] = xb[k]; 1258 workt += bs; 1259 } 1260 if (usecprow) z = zarray + bs*ridx[i]; 1261 PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z); 1262 /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */ 1263 v += n*bs2; 1264 if (!usecprow) z += bs; 1265 } 1266 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1267 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1268 ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr); 1269 PetscFunctionReturn(0); 1270 } 1271 1272 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 1273 { 1274 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1275 const PetscScalar *x; 1276 PetscScalar *y,*z,sum; 1277 const MatScalar *v; 1278 PetscErrorCode ierr; 1279 PetscInt mbs=a->mbs,i,n,*ridx=NULL; 1280 const PetscInt *idx,*ii; 1281 PetscBool usecprow=a->compressedrow.use; 1282 1283 PetscFunctionBegin; 1284 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1285 ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr); 1286 1287 idx = a->j; 1288 v = a->a; 1289 if (usecprow) { 1290 if (zz != yy) { 1291 ierr = PetscArraycpy(z,y,mbs);CHKERRQ(ierr); 1292 } 1293 mbs = a->compressedrow.nrows; 1294 ii = a->compressedrow.i; 1295 ridx = a->compressedrow.rindex; 1296 } else { 1297 ii = a->i; 1298 } 1299 1300 for (i=0; i<mbs; i++) { 1301 n = ii[1] - ii[0]; 1302 ii++; 1303 if (!usecprow) { 1304 sum = y[i]; 1305 } else { 1306 sum = y[ridx[i]]; 1307 } 1308 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1309 PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1310 PetscSparseDensePlusDot(sum,x,v,idx,n); 1311 v += n; 1312 idx += n; 1313 if (usecprow) { 1314 z[ridx[i]] = sum; 1315 } else { 1316 z[i] = sum; 1317 } 1318 } 1319 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1320 ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr); 1321 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1322 PetscFunctionReturn(0); 1323 } 1324 1325 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 1326 { 1327 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1328 PetscScalar *y = 0,*z = 0,sum1,sum2; 1329 const PetscScalar *x,*xb; 1330 PetscScalar x1,x2,*yarray,*zarray; 1331 const MatScalar *v; 1332 PetscErrorCode ierr; 1333 PetscInt mbs = a->mbs,i,n,j; 1334 const PetscInt *idx,*ii,*ridx = NULL; 1335 PetscBool usecprow = a->compressedrow.use; 1336 1337 PetscFunctionBegin; 1338 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1339 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1340 1341 idx = a->j; 1342 v = a->a; 1343 if (usecprow) { 1344 if (zz != yy) { 1345 ierr = PetscArraycpy(zarray,yarray,2*mbs);CHKERRQ(ierr); 1346 } 1347 mbs = a->compressedrow.nrows; 1348 ii = a->compressedrow.i; 1349 ridx = a->compressedrow.rindex; 1350 } else { 1351 ii = a->i; 1352 y = yarray; 1353 z = zarray; 1354 } 1355 1356 for (i=0; i<mbs; i++) { 1357 n = ii[1] - ii[0]; ii++; 1358 if (usecprow) { 1359 z = zarray + 2*ridx[i]; 1360 y = yarray + 2*ridx[i]; 1361 } 1362 sum1 = y[0]; sum2 = y[1]; 1363 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1364 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1365 for (j=0; j<n; j++) { 1366 xb = x + 2*(*idx++); 1367 x1 = xb[0]; 1368 x2 = xb[1]; 1369 1370 sum1 += v[0]*x1 + v[2]*x2; 1371 sum2 += v[1]*x1 + v[3]*x2; 1372 v += 4; 1373 } 1374 z[0] = sum1; z[1] = sum2; 1375 if (!usecprow) { 1376 z += 2; y += 2; 1377 } 1378 } 1379 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1380 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1381 ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr); 1382 PetscFunctionReturn(0); 1383 } 1384 1385 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 1386 { 1387 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1388 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray; 1389 const PetscScalar *x,*xb; 1390 const MatScalar *v; 1391 PetscErrorCode ierr; 1392 PetscInt mbs = a->mbs,i,j,n; 1393 const PetscInt *idx,*ii,*ridx = NULL; 1394 PetscBool usecprow = a->compressedrow.use; 1395 1396 PetscFunctionBegin; 1397 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1398 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1399 1400 idx = a->j; 1401 v = a->a; 1402 if (usecprow) { 1403 if (zz != yy) { 1404 ierr = PetscArraycpy(zarray,yarray,3*mbs);CHKERRQ(ierr); 1405 } 1406 mbs = a->compressedrow.nrows; 1407 ii = a->compressedrow.i; 1408 ridx = a->compressedrow.rindex; 1409 } else { 1410 ii = a->i; 1411 y = yarray; 1412 z = zarray; 1413 } 1414 1415 for (i=0; i<mbs; i++) { 1416 n = ii[1] - ii[0]; ii++; 1417 if (usecprow) { 1418 z = zarray + 3*ridx[i]; 1419 y = yarray + 3*ridx[i]; 1420 } 1421 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; 1422 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1423 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1424 for (j=0; j<n; j++) { 1425 xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1426 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 1427 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 1428 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 1429 v += 9; 1430 } 1431 z[0] = sum1; z[1] = sum2; z[2] = sum3; 1432 if (!usecprow) { 1433 z += 3; y += 3; 1434 } 1435 } 1436 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1437 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1438 ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr); 1439 PetscFunctionReturn(0); 1440 } 1441 1442 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 1443 { 1444 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1445 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray; 1446 const PetscScalar *x,*xb; 1447 const MatScalar *v; 1448 PetscErrorCode ierr; 1449 PetscInt mbs = a->mbs,i,j,n; 1450 const PetscInt *idx,*ii,*ridx=NULL; 1451 PetscBool usecprow=a->compressedrow.use; 1452 1453 PetscFunctionBegin; 1454 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1455 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1456 1457 idx = a->j; 1458 v = a->a; 1459 if (usecprow) { 1460 if (zz != yy) { 1461 ierr = PetscArraycpy(zarray,yarray,4*mbs);CHKERRQ(ierr); 1462 } 1463 mbs = a->compressedrow.nrows; 1464 ii = a->compressedrow.i; 1465 ridx = a->compressedrow.rindex; 1466 } else { 1467 ii = a->i; 1468 y = yarray; 1469 z = zarray; 1470 } 1471 1472 for (i=0; i<mbs; i++) { 1473 n = ii[1] - ii[0]; ii++; 1474 if (usecprow) { 1475 z = zarray + 4*ridx[i]; 1476 y = yarray + 4*ridx[i]; 1477 } 1478 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; 1479 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1480 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1481 for (j=0; j<n; j++) { 1482 xb = x + 4*(*idx++); 1483 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1484 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1485 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1486 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1487 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1488 v += 16; 1489 } 1490 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 1491 if (!usecprow) { 1492 z += 4; y += 4; 1493 } 1494 } 1495 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1496 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1497 ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr); 1498 PetscFunctionReturn(0); 1499 } 1500 1501 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 1502 { 1503 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1504 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5; 1505 const PetscScalar *x,*xb; 1506 PetscScalar *yarray,*zarray; 1507 const MatScalar *v; 1508 PetscErrorCode ierr; 1509 PetscInt mbs = a->mbs,i,j,n; 1510 const PetscInt *idx,*ii,*ridx = NULL; 1511 PetscBool usecprow=a->compressedrow.use; 1512 1513 PetscFunctionBegin; 1514 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1515 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1516 1517 idx = a->j; 1518 v = a->a; 1519 if (usecprow) { 1520 if (zz != yy) { 1521 ierr = PetscArraycpy(zarray,yarray,5*mbs);CHKERRQ(ierr); 1522 } 1523 mbs = a->compressedrow.nrows; 1524 ii = a->compressedrow.i; 1525 ridx = a->compressedrow.rindex; 1526 } else { 1527 ii = a->i; 1528 y = yarray; 1529 z = zarray; 1530 } 1531 1532 for (i=0; i<mbs; i++) { 1533 n = ii[1] - ii[0]; ii++; 1534 if (usecprow) { 1535 z = zarray + 5*ridx[i]; 1536 y = yarray + 5*ridx[i]; 1537 } 1538 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; 1539 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1540 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1541 for (j=0; j<n; j++) { 1542 xb = x + 5*(*idx++); 1543 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 1544 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1545 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1546 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1547 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1548 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1549 v += 25; 1550 } 1551 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 1552 if (!usecprow) { 1553 z += 5; y += 5; 1554 } 1555 } 1556 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1557 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1558 ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr); 1559 PetscFunctionReturn(0); 1560 } 1561 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 1562 { 1563 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1564 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6; 1565 const PetscScalar *x,*xb; 1566 PetscScalar x1,x2,x3,x4,x5,x6,*yarray,*zarray; 1567 const MatScalar *v; 1568 PetscErrorCode ierr; 1569 PetscInt mbs = a->mbs,i,j,n; 1570 const PetscInt *idx,*ii,*ridx=NULL; 1571 PetscBool usecprow=a->compressedrow.use; 1572 1573 PetscFunctionBegin; 1574 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1575 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1576 1577 idx = a->j; 1578 v = a->a; 1579 if (usecprow) { 1580 if (zz != yy) { 1581 ierr = PetscArraycpy(zarray,yarray,6*mbs);CHKERRQ(ierr); 1582 } 1583 mbs = a->compressedrow.nrows; 1584 ii = a->compressedrow.i; 1585 ridx = a->compressedrow.rindex; 1586 } else { 1587 ii = a->i; 1588 y = yarray; 1589 z = zarray; 1590 } 1591 1592 for (i=0; i<mbs; i++) { 1593 n = ii[1] - ii[0]; ii++; 1594 if (usecprow) { 1595 z = zarray + 6*ridx[i]; 1596 y = yarray + 6*ridx[i]; 1597 } 1598 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; 1599 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1600 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1601 for (j=0; j<n; j++) { 1602 xb = x + 6*(*idx++); 1603 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 1604 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1605 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1606 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1607 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1608 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1609 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1610 v += 36; 1611 } 1612 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 1613 if (!usecprow) { 1614 z += 6; y += 6; 1615 } 1616 } 1617 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1618 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1619 ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr); 1620 PetscFunctionReturn(0); 1621 } 1622 1623 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 1624 { 1625 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1626 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 1627 const PetscScalar *x,*xb; 1628 PetscScalar x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray; 1629 const MatScalar *v; 1630 PetscErrorCode ierr; 1631 PetscInt mbs = a->mbs,i,j,n; 1632 const PetscInt *idx,*ii,*ridx = NULL; 1633 PetscBool usecprow=a->compressedrow.use; 1634 1635 PetscFunctionBegin; 1636 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1637 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1638 1639 idx = a->j; 1640 v = a->a; 1641 if (usecprow) { 1642 if (zz != yy) { 1643 ierr = PetscArraycpy(zarray,yarray,7*mbs);CHKERRQ(ierr); 1644 } 1645 mbs = a->compressedrow.nrows; 1646 ii = a->compressedrow.i; 1647 ridx = a->compressedrow.rindex; 1648 } else { 1649 ii = a->i; 1650 y = yarray; 1651 z = zarray; 1652 } 1653 1654 for (i=0; i<mbs; i++) { 1655 n = ii[1] - ii[0]; ii++; 1656 if (usecprow) { 1657 z = zarray + 7*ridx[i]; 1658 y = yarray + 7*ridx[i]; 1659 } 1660 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 1661 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1662 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1663 for (j=0; j<n; j++) { 1664 xb = x + 7*(*idx++); 1665 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1666 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1667 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1668 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1669 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1670 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1671 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1672 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1673 v += 49; 1674 } 1675 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1676 if (!usecprow) { 1677 z += 7; y += 7; 1678 } 1679 } 1680 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1681 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1682 ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr); 1683 PetscFunctionReturn(0); 1684 } 1685 1686 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 1687 PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec yy,Vec zz) 1688 { 1689 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1690 PetscScalar *z = 0,*work,*workt,*zarray; 1691 const PetscScalar *x,*xb; 1692 const MatScalar *v; 1693 PetscErrorCode ierr; 1694 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 1695 PetscInt k; 1696 PetscBool usecprow=a->compressedrow.use; 1697 const PetscInt *idx,*ii,*ridx=NULL; 1698 1699 __m256d a0,a1,a2,a3,a4,a5; 1700 __m256d w0,w1,w2,w3; 1701 __m256d z0,z1,z2; 1702 __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63); 1703 1704 PetscFunctionBegin; 1705 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 1706 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1707 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1708 1709 idx = a->j; 1710 v = a->a; 1711 if (usecprow) { 1712 mbs = a->compressedrow.nrows; 1713 ii = a->compressedrow.i; 1714 ridx = a->compressedrow.rindex; 1715 } else { 1716 mbs = a->mbs; 1717 ii = a->i; 1718 z = zarray; 1719 } 1720 1721 if (!a->mult_work) { 1722 k = PetscMax(A->rmap->n,A->cmap->n); 1723 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 1724 } 1725 1726 work = a->mult_work; 1727 for (i=0; i<mbs; i++) { 1728 n = ii[1] - ii[0]; ii++; 1729 workt = work; 1730 for (j=0; j<n; j++) { 1731 xb = x + bs*(*idx++); 1732 for (k=0; k<bs; k++) workt[k] = xb[k]; 1733 workt += bs; 1734 } 1735 if (usecprow) z = zarray + bs*ridx[i]; 1736 1737 z0 = _mm256_loadu_pd(&z[ 0]); z1 = _mm256_loadu_pd(&z[ 4]); z2 = _mm256_set1_pd(z[ 8]); 1738 1739 for (j=0; j<n; j++) { 1740 /* first column of a */ 1741 w0 = _mm256_set1_pd(work[j*9 ]); 1742 a0 = _mm256_loadu_pd(&v[j*81 ]); z0 = _mm256_fmadd_pd(a0,w0,z0); 1743 a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1); 1744 a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2); 1745 1746 /* second column of a */ 1747 w1 = _mm256_set1_pd(work[j*9+ 1]); 1748 a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0); 1749 a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1); 1750 a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2); 1751 1752 /* third column of a */ 1753 w2 = _mm256_set1_pd(work[j*9+ 2]); 1754 a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0); 1755 a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1); 1756 a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2); 1757 1758 /* fourth column of a */ 1759 w3 = _mm256_set1_pd(work[j*9+ 3]); 1760 a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0); 1761 a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1); 1762 a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2); 1763 1764 /* fifth column of a */ 1765 w0 = _mm256_set1_pd(work[j*9+ 4]); 1766 a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0); 1767 a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1); 1768 a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2); 1769 1770 /* sixth column of a */ 1771 w1 = _mm256_set1_pd(work[j*9+ 5]); 1772 a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0); 1773 a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1); 1774 a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2); 1775 1776 /* seventh column of a */ 1777 w2 = _mm256_set1_pd(work[j*9+ 6]); 1778 a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0); 1779 a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1); 1780 a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2); 1781 1782 /* eigth column of a */ 1783 w3 = _mm256_set1_pd(work[j*9+ 7]); 1784 a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0); 1785 a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1); 1786 a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2); 1787 1788 /* ninth column of a */ 1789 w0 = _mm256_set1_pd(work[j*9+ 8]); 1790 a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0); 1791 a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1); 1792 a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2); 1793 } 1794 1795 _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2); 1796 1797 v += n*bs2; 1798 if (!usecprow) z += bs; 1799 } 1800 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1801 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1802 ierr = PetscLogFlops(162.0*a->nz);CHKERRQ(ierr); 1803 PetscFunctionReturn(0); 1804 } 1805 #endif 1806 1807 PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A,Vec xx,Vec yy,Vec zz) 1808 { 1809 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1810 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11; 1811 const PetscScalar *x,*xb; 1812 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,*yarray,*zarray; 1813 const MatScalar *v; 1814 PetscErrorCode ierr; 1815 PetscInt mbs = a->mbs,i,j,n; 1816 const PetscInt *idx,*ii,*ridx = NULL; 1817 PetscBool usecprow=a->compressedrow.use; 1818 1819 PetscFunctionBegin; 1820 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1821 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1822 1823 idx = a->j; 1824 v = a->a; 1825 if (usecprow) { 1826 if (zz != yy) { 1827 ierr = PetscArraycpy(zarray,yarray,7*mbs);CHKERRQ(ierr); 1828 } 1829 mbs = a->compressedrow.nrows; 1830 ii = a->compressedrow.i; 1831 ridx = a->compressedrow.rindex; 1832 } else { 1833 ii = a->i; 1834 y = yarray; 1835 z = zarray; 1836 } 1837 1838 for (i=0; i<mbs; i++) { 1839 n = ii[1] - ii[0]; ii++; 1840 if (usecprow) { 1841 z = zarray + 11*ridx[i]; 1842 y = yarray + 11*ridx[i]; 1843 } 1844 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 1845 sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10]; 1846 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1847 PetscPrefetchBlock(v+121*n,121*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1848 for (j=0; j<n; j++) { 1849 xb = x + 11*(*idx++); 1850 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; 1851 sum1 += v[ 0]*x1 + v[ 11]*x2 + v[2*11]*x3 + v[3*11]*x4 + v[4*11]*x5 + v[5*11]*x6 + v[6*11]*x7+ v[7*11]*x8 + v[8*11]*x9 + v[9*11]*x10 + v[10*11]*x11; 1852 sum2 += v[1+0]*x1 + v[1+11]*x2 + v[1+2*11]*x3 + v[1+3*11]*x4 + v[1+4*11]*x5 + v[1+5*11]*x6 + v[1+6*11]*x7+ v[1+7*11]*x8 + v[1+8*11]*x9 + v[1+9*11]*x10 + v[1+10*11]*x11; 1853 sum3 += v[2+0]*x1 + v[2+11]*x2 + v[2+2*11]*x3 + v[2+3*11]*x4 + v[2+4*11]*x5 + v[2+5*11]*x6 + v[2+6*11]*x7+ v[2+7*11]*x8 + v[2+8*11]*x9 + v[2+9*11]*x10 + v[2+10*11]*x11; 1854 sum4 += v[3+0]*x1 + v[3+11]*x2 + v[3+2*11]*x3 + v[3+3*11]*x4 + v[3+4*11]*x5 + v[3+5*11]*x6 + v[3+6*11]*x7+ v[3+7*11]*x8 + v[3+8*11]*x9 + v[3+9*11]*x10 + v[3+10*11]*x11; 1855 sum5 += v[4+0]*x1 + v[4+11]*x2 + v[4+2*11]*x3 + v[4+3*11]*x4 + v[4+4*11]*x5 + v[4+5*11]*x6 + v[4+6*11]*x7+ v[4+7*11]*x8 + v[4+8*11]*x9 + v[4+9*11]*x10 + v[4+10*11]*x11; 1856 sum6 += v[5+0]*x1 + v[5+11]*x2 + v[5+2*11]*x3 + v[5+3*11]*x4 + v[5+4*11]*x5 + v[5+5*11]*x6 + v[5+6*11]*x7+ v[5+7*11]*x8 + v[5+8*11]*x9 + v[5+9*11]*x10 + v[5+10*11]*x11; 1857 sum7 += v[6+0]*x1 + v[6+11]*x2 + v[6+2*11]*x3 + v[6+3*11]*x4 + v[6+4*11]*x5 + v[6+5*11]*x6 + v[6+6*11]*x7+ v[6+7*11]*x8 + v[6+8*11]*x9 + v[6+9*11]*x10 + v[6+10*11]*x11; 1858 sum8 += v[7+0]*x1 + v[7+11]*x2 + v[7+2*11]*x3 + v[7+3*11]*x4 + v[7+4*11]*x5 + v[7+5*11]*x6 + v[7+6*11]*x7+ v[7+7*11]*x8 + v[7+8*11]*x9 + v[7+9*11]*x10 + v[7+10*11]*x11; 1859 sum9 += v[8+0]*x1 + v[8+11]*x2 + v[8+2*11]*x3 + v[8+3*11]*x4 + v[8+4*11]*x5 + v[8+5*11]*x6 + v[8+6*11]*x7+ v[8+7*11]*x8 + v[8+8*11]*x9 + v[8+9*11]*x10 + v[8+10*11]*x11; 1860 sum10 += v[9+0]*x1 + v[9+11]*x2 + v[9+2*11]*x3 + v[9+3*11]*x4 + v[9+4*11]*x5 + v[9+5*11]*x6 + v[9+6*11]*x7+ v[9+7*11]*x8 + v[9+8*11]*x9 + v[9+9*11]*x10 + v[9+10*11]*x11; 1861 sum11 += v[10+0]*x1 + v[10+11]*x2 + v[10+2*11]*x3 + v[10+3*11]*x4 + v[10+4*11]*x5 + v[10+5*11]*x6 + v[10+6*11]*x7+ v[10+7*11]*x8 + v[10+8*11]*x9 + v[10+9*11]*x10 + v[10+10*11]*x11; 1862 v += 121; 1863 } 1864 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1865 z[7] = sum6; z[8] = sum7; z[9] = sum8; z[10] = sum9; z[11] = sum10; 1866 if (!usecprow) { 1867 z += 11; y += 11; 1868 } 1869 } 1870 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1871 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1872 ierr = PetscLogFlops(242.0*a->nz);CHKERRQ(ierr); 1873 PetscFunctionReturn(0); 1874 } 1875 1876 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1877 { 1878 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1879 PetscScalar *z = 0,*work,*workt,*zarray; 1880 const PetscScalar *x,*xb; 1881 const MatScalar *v; 1882 PetscErrorCode ierr; 1883 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 1884 PetscInt ncols,k; 1885 const PetscInt *ridx = NULL,*idx,*ii; 1886 PetscBool usecprow = a->compressedrow.use; 1887 1888 PetscFunctionBegin; 1889 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 1890 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1891 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1892 1893 idx = a->j; 1894 v = a->a; 1895 if (usecprow) { 1896 mbs = a->compressedrow.nrows; 1897 ii = a->compressedrow.i; 1898 ridx = a->compressedrow.rindex; 1899 } else { 1900 mbs = a->mbs; 1901 ii = a->i; 1902 z = zarray; 1903 } 1904 1905 if (!a->mult_work) { 1906 k = PetscMax(A->rmap->n,A->cmap->n); 1907 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 1908 } 1909 work = a->mult_work; 1910 for (i=0; i<mbs; i++) { 1911 n = ii[1] - ii[0]; ii++; 1912 ncols = n*bs; 1913 workt = work; 1914 for (j=0; j<n; j++) { 1915 xb = x + bs*(*idx++); 1916 for (k=0; k<bs; k++) workt[k] = xb[k]; 1917 workt += bs; 1918 } 1919 if (usecprow) z = zarray + bs*ridx[i]; 1920 PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1921 /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */ 1922 v += n*bs2; 1923 if (!usecprow) z += bs; 1924 } 1925 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1926 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1927 ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr); 1928 PetscFunctionReturn(0); 1929 } 1930 1931 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 1932 { 1933 PetscScalar zero = 0.0; 1934 PetscErrorCode ierr; 1935 1936 PetscFunctionBegin; 1937 ierr = VecSet(zz,zero);CHKERRQ(ierr); 1938 ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 1939 PetscFunctionReturn(0); 1940 } 1941 1942 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 1943 { 1944 PetscScalar zero = 0.0; 1945 PetscErrorCode ierr; 1946 1947 PetscFunctionBegin; 1948 ierr = VecSet(zz,zero);CHKERRQ(ierr); 1949 ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 1950 PetscFunctionReturn(0); 1951 } 1952 1953 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 1954 { 1955 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1956 PetscScalar *z,x1,x2,x3,x4,x5; 1957 const PetscScalar *x,*xb = NULL; 1958 const MatScalar *v; 1959 PetscErrorCode ierr; 1960 PetscInt mbs,i,rval,bs=A->rmap->bs,j,n; 1961 const PetscInt *idx,*ii,*ib,*ridx = NULL; 1962 Mat_CompressedRow cprow = a->compressedrow; 1963 PetscBool usecprow = cprow.use; 1964 1965 PetscFunctionBegin; 1966 if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); } 1967 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1968 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1969 1970 idx = a->j; 1971 v = a->a; 1972 if (usecprow) { 1973 mbs = cprow.nrows; 1974 ii = cprow.i; 1975 ridx = cprow.rindex; 1976 } else { 1977 mbs=a->mbs; 1978 ii = a->i; 1979 xb = x; 1980 } 1981 1982 switch (bs) { 1983 case 1: 1984 for (i=0; i<mbs; i++) { 1985 if (usecprow) xb = x + ridx[i]; 1986 x1 = xb[0]; 1987 ib = idx + ii[0]; 1988 n = ii[1] - ii[0]; ii++; 1989 for (j=0; j<n; j++) { 1990 rval = ib[j]; 1991 z[rval] += PetscConj(*v) * x1; 1992 v++; 1993 } 1994 if (!usecprow) xb++; 1995 } 1996 break; 1997 case 2: 1998 for (i=0; i<mbs; i++) { 1999 if (usecprow) xb = x + 2*ridx[i]; 2000 x1 = xb[0]; x2 = xb[1]; 2001 ib = idx + ii[0]; 2002 n = ii[1] - ii[0]; ii++; 2003 for (j=0; j<n; j++) { 2004 rval = ib[j]*2; 2005 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2; 2006 z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2; 2007 v += 4; 2008 } 2009 if (!usecprow) xb += 2; 2010 } 2011 break; 2012 case 3: 2013 for (i=0; i<mbs; i++) { 2014 if (usecprow) xb = x + 3*ridx[i]; 2015 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2016 ib = idx + ii[0]; 2017 n = ii[1] - ii[0]; ii++; 2018 for (j=0; j<n; j++) { 2019 rval = ib[j]*3; 2020 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3; 2021 z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3; 2022 z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3; 2023 v += 9; 2024 } 2025 if (!usecprow) xb += 3; 2026 } 2027 break; 2028 case 4: 2029 for (i=0; i<mbs; i++) { 2030 if (usecprow) xb = x + 4*ridx[i]; 2031 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 2032 ib = idx + ii[0]; 2033 n = ii[1] - ii[0]; ii++; 2034 for (j=0; j<n; j++) { 2035 rval = ib[j]*4; 2036 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4; 2037 z[rval++] += PetscConj(v[4])*x1 + PetscConj(v[5])*x2 + PetscConj(v[6])*x3 + PetscConj(v[7])*x4; 2038 z[rval++] += PetscConj(v[8])*x1 + PetscConj(v[9])*x2 + PetscConj(v[10])*x3 + PetscConj(v[11])*x4; 2039 z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4; 2040 v += 16; 2041 } 2042 if (!usecprow) xb += 4; 2043 } 2044 break; 2045 case 5: 2046 for (i=0; i<mbs; i++) { 2047 if (usecprow) xb = x + 5*ridx[i]; 2048 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2049 x4 = xb[3]; x5 = xb[4]; 2050 ib = idx + ii[0]; 2051 n = ii[1] - ii[0]; ii++; 2052 for (j=0; j<n; j++) { 2053 rval = ib[j]*5; 2054 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4 + PetscConj(v[4])*x5; 2055 z[rval++] += PetscConj(v[5])*x1 + PetscConj(v[6])*x2 + PetscConj(v[7])*x3 + PetscConj(v[8])*x4 + PetscConj(v[9])*x5; 2056 z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5; 2057 z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5; 2058 z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5; 2059 v += 25; 2060 } 2061 if (!usecprow) xb += 5; 2062 } 2063 break; 2064 default: /* block sizes larger than 5 by 5 are handled by BLAS */ 2065 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet"); 2066 #if 0 2067 { 2068 PetscInt ncols,k,bs2=a->bs2; 2069 PetscScalar *work,*workt,zb; 2070 const PetscScalar *xtmp; 2071 if (!a->mult_work) { 2072 k = PetscMax(A->rmap->n,A->cmap->n); 2073 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 2074 } 2075 work = a->mult_work; 2076 xtmp = x; 2077 for (i=0; i<mbs; i++) { 2078 n = ii[1] - ii[0]; ii++; 2079 ncols = n*bs; 2080 ierr = PetscArrayzero(work,ncols);CHKERRQ(ierr); 2081 if (usecprow) xtmp = x + bs*ridx[i]; 2082 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 2083 /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */ 2084 v += n*bs2; 2085 if (!usecprow) xtmp += bs; 2086 workt = work; 2087 for (j=0; j<n; j++) { 2088 zb = z + bs*(*idx++); 2089 for (k=0; k<bs; k++) zb[k] += workt[k] ; 2090 workt += bs; 2091 } 2092 } 2093 } 2094 #endif 2095 } 2096 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 2097 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 2098 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 2099 PetscFunctionReturn(0); 2100 } 2101 2102 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 2103 { 2104 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2105 PetscScalar *zb,*z,x1,x2,x3,x4,x5; 2106 const PetscScalar *x,*xb = 0; 2107 const MatScalar *v; 2108 PetscErrorCode ierr; 2109 PetscInt mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2; 2110 const PetscInt *idx,*ii,*ib,*ridx = NULL; 2111 Mat_CompressedRow cprow = a->compressedrow; 2112 PetscBool usecprow=cprow.use; 2113 2114 PetscFunctionBegin; 2115 if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); } 2116 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 2117 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 2118 2119 idx = a->j; 2120 v = a->a; 2121 if (usecprow) { 2122 mbs = cprow.nrows; 2123 ii = cprow.i; 2124 ridx = cprow.rindex; 2125 } else { 2126 mbs=a->mbs; 2127 ii = a->i; 2128 xb = x; 2129 } 2130 2131 switch (bs) { 2132 case 1: 2133 for (i=0; i<mbs; i++) { 2134 if (usecprow) xb = x + ridx[i]; 2135 x1 = xb[0]; 2136 ib = idx + ii[0]; 2137 n = ii[1] - ii[0]; ii++; 2138 for (j=0; j<n; j++) { 2139 rval = ib[j]; 2140 z[rval] += *v * x1; 2141 v++; 2142 } 2143 if (!usecprow) xb++; 2144 } 2145 break; 2146 case 2: 2147 for (i=0; i<mbs; i++) { 2148 if (usecprow) xb = x + 2*ridx[i]; 2149 x1 = xb[0]; x2 = xb[1]; 2150 ib = idx + ii[0]; 2151 n = ii[1] - ii[0]; ii++; 2152 for (j=0; j<n; j++) { 2153 rval = ib[j]*2; 2154 z[rval++] += v[0]*x1 + v[1]*x2; 2155 z[rval++] += v[2]*x1 + v[3]*x2; 2156 v += 4; 2157 } 2158 if (!usecprow) xb += 2; 2159 } 2160 break; 2161 case 3: 2162 for (i=0; i<mbs; i++) { 2163 if (usecprow) xb = x + 3*ridx[i]; 2164 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2165 ib = idx + ii[0]; 2166 n = ii[1] - ii[0]; ii++; 2167 for (j=0; j<n; j++) { 2168 rval = ib[j]*3; 2169 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3; 2170 z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3; 2171 z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3; 2172 v += 9; 2173 } 2174 if (!usecprow) xb += 3; 2175 } 2176 break; 2177 case 4: 2178 for (i=0; i<mbs; i++) { 2179 if (usecprow) xb = x + 4*ridx[i]; 2180 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 2181 ib = idx + ii[0]; 2182 n = ii[1] - ii[0]; ii++; 2183 for (j=0; j<n; j++) { 2184 rval = ib[j]*4; 2185 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 2186 z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 2187 z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 2188 z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 2189 v += 16; 2190 } 2191 if (!usecprow) xb += 4; 2192 } 2193 break; 2194 case 5: 2195 for (i=0; i<mbs; i++) { 2196 if (usecprow) xb = x + 5*ridx[i]; 2197 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2198 x4 = xb[3]; x5 = xb[4]; 2199 ib = idx + ii[0]; 2200 n = ii[1] - ii[0]; ii++; 2201 for (j=0; j<n; j++) { 2202 rval = ib[j]*5; 2203 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 2204 z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 2205 z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 2206 z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 2207 z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 2208 v += 25; 2209 } 2210 if (!usecprow) xb += 5; 2211 } 2212 break; 2213 default: { /* block sizes larger then 5 by 5 are handled by BLAS */ 2214 PetscInt ncols,k; 2215 PetscScalar *work,*workt; 2216 const PetscScalar *xtmp; 2217 if (!a->mult_work) { 2218 k = PetscMax(A->rmap->n,A->cmap->n); 2219 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 2220 } 2221 work = a->mult_work; 2222 xtmp = x; 2223 for (i=0; i<mbs; i++) { 2224 n = ii[1] - ii[0]; ii++; 2225 ncols = n*bs; 2226 ierr = PetscArrayzero(work,ncols);CHKERRQ(ierr); 2227 if (usecprow) xtmp = x + bs*ridx[i]; 2228 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 2229 /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */ 2230 v += n*bs2; 2231 if (!usecprow) xtmp += bs; 2232 workt = work; 2233 for (j=0; j<n; j++) { 2234 zb = z + bs*(*idx++); 2235 for (k=0; k<bs; k++) zb[k] += workt[k]; 2236 workt += bs; 2237 } 2238 } 2239 } 2240 } 2241 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 2242 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 2243 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 2244 PetscFunctionReturn(0); 2245 } 2246 2247 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha) 2248 { 2249 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data; 2250 PetscInt totalnz = a->bs2*a->nz; 2251 PetscScalar oalpha = alpha; 2252 PetscErrorCode ierr; 2253 PetscBLASInt one = 1,tnz; 2254 2255 PetscFunctionBegin; 2256 ierr = PetscBLASIntCast(totalnz,&tnz);CHKERRQ(ierr); 2257 PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one)); 2258 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 2259 PetscFunctionReturn(0); 2260 } 2261 2262 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm) 2263 { 2264 PetscErrorCode ierr; 2265 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2266 MatScalar *v = a->a; 2267 PetscReal sum = 0.0; 2268 PetscInt i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1; 2269 2270 PetscFunctionBegin; 2271 if (type == NORM_FROBENIUS) { 2272 #if defined(PETSC_USE_REAL___FP16) 2273 PetscBLASInt one = 1,cnt = bs2*nz; 2274 *norm = BLASnrm2_(&cnt,v,&one); 2275 #else 2276 for (i=0; i<bs2*nz; i++) { 2277 sum += PetscRealPart(PetscConj(*v)*(*v)); v++; 2278 } 2279 #endif 2280 *norm = PetscSqrtReal(sum); 2281 ierr = PetscLogFlops(2*bs2*nz);CHKERRQ(ierr); 2282 } else if (type == NORM_1) { /* maximum column sum */ 2283 PetscReal *tmp; 2284 PetscInt *bcol = a->j; 2285 ierr = PetscCalloc1(A->cmap->n+1,&tmp);CHKERRQ(ierr); 2286 for (i=0; i<nz; i++) { 2287 for (j=0; j<bs; j++) { 2288 k1 = bs*(*bcol) + j; /* column index */ 2289 for (k=0; k<bs; k++) { 2290 tmp[k1] += PetscAbsScalar(*v); v++; 2291 } 2292 } 2293 bcol++; 2294 } 2295 *norm = 0.0; 2296 for (j=0; j<A->cmap->n; j++) { 2297 if (tmp[j] > *norm) *norm = tmp[j]; 2298 } 2299 ierr = PetscFree(tmp);CHKERRQ(ierr); 2300 ierr = PetscLogFlops(PetscMax(bs2*nz-1,0));CHKERRQ(ierr); 2301 } else if (type == NORM_INFINITY) { /* maximum row sum */ 2302 *norm = 0.0; 2303 for (k=0; k<bs; k++) { 2304 for (j=0; j<a->mbs; j++) { 2305 v = a->a + bs2*a->i[j] + k; 2306 sum = 0.0; 2307 for (i=0; i<a->i[j+1]-a->i[j]; i++) { 2308 for (k1=0; k1<bs; k1++) { 2309 sum += PetscAbsScalar(*v); 2310 v += bs; 2311 } 2312 } 2313 if (sum > *norm) *norm = sum; 2314 } 2315 } 2316 ierr = PetscLogFlops(PetscMax(bs2*nz-1,0));CHKERRQ(ierr); 2317 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet"); 2318 PetscFunctionReturn(0); 2319 } 2320 2321 2322 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg) 2323 { 2324 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data; 2325 PetscErrorCode ierr; 2326 2327 PetscFunctionBegin; 2328 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 2329 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 2330 *flg = PETSC_FALSE; 2331 PetscFunctionReturn(0); 2332 } 2333 2334 /* if the a->i are the same */ 2335 ierr = PetscArraycmp(a->i,b->i,a->mbs+1,flg);CHKERRQ(ierr); 2336 if (!*flg) PetscFunctionReturn(0); 2337 2338 /* if a->j are the same */ 2339 ierr = PetscArraycmp(a->j,b->j,a->nz,flg);CHKERRQ(ierr); 2340 if (!*flg) PetscFunctionReturn(0); 2341 2342 /* if a->a are the same */ 2343 ierr = PetscArraycmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs),flg);CHKERRQ(ierr); 2344 PetscFunctionReturn(0); 2345 2346 } 2347 2348 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v) 2349 { 2350 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2351 PetscErrorCode ierr; 2352 PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 2353 PetscScalar *x,zero = 0.0; 2354 MatScalar *aa,*aa_j; 2355 2356 PetscFunctionBegin; 2357 if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 2358 bs = A->rmap->bs; 2359 aa = a->a; 2360 ai = a->i; 2361 aj = a->j; 2362 ambs = a->mbs; 2363 bs2 = a->bs2; 2364 2365 ierr = VecSet(v,zero);CHKERRQ(ierr); 2366 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 2367 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 2368 if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 2369 for (i=0; i<ambs; i++) { 2370 for (j=ai[i]; j<ai[i+1]; j++) { 2371 if (aj[j] == i) { 2372 row = i*bs; 2373 aa_j = aa+j*bs2; 2374 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 2375 break; 2376 } 2377 } 2378 } 2379 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 2380 PetscFunctionReturn(0); 2381 } 2382 2383 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr) 2384 { 2385 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2386 const PetscScalar *l,*r,*li,*ri; 2387 PetscScalar x; 2388 MatScalar *aa, *v; 2389 PetscErrorCode ierr; 2390 PetscInt i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai; 2391 const PetscInt *ai,*aj; 2392 2393 PetscFunctionBegin; 2394 ai = a->i; 2395 aj = a->j; 2396 aa = a->a; 2397 m = A->rmap->n; 2398 n = A->cmap->n; 2399 bs = A->rmap->bs; 2400 mbs = a->mbs; 2401 bs2 = a->bs2; 2402 if (ll) { 2403 ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr); 2404 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 2405 if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 2406 for (i=0; i<mbs; i++) { /* for each block row */ 2407 M = ai[i+1] - ai[i]; 2408 li = l + i*bs; 2409 v = aa + bs2*ai[i]; 2410 for (j=0; j<M; j++) { /* for each block */ 2411 for (k=0; k<bs2; k++) { 2412 (*v++) *= li[k%bs]; 2413 } 2414 } 2415 } 2416 ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr); 2417 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2418 } 2419 2420 if (rr) { 2421 ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr); 2422 ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr); 2423 if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length"); 2424 for (i=0; i<mbs; i++) { /* for each block row */ 2425 iai = ai[i]; 2426 M = ai[i+1] - iai; 2427 v = aa + bs2*iai; 2428 for (j=0; j<M; j++) { /* for each block */ 2429 ri = r + bs*aj[iai+j]; 2430 for (k=0; k<bs; k++) { 2431 x = ri[k]; 2432 for (tmp=0; tmp<bs; tmp++) v[tmp] *= x; 2433 v += bs; 2434 } 2435 } 2436 } 2437 ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr); 2438 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2439 } 2440 PetscFunctionReturn(0); 2441 } 2442 2443 2444 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info) 2445 { 2446 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2447 2448 PetscFunctionBegin; 2449 info->block_size = a->bs2; 2450 info->nz_allocated = a->bs2*a->maxnz; 2451 info->nz_used = a->bs2*a->nz; 2452 info->nz_unneeded = info->nz_allocated - info->nz_used; 2453 info->assemblies = A->num_ass; 2454 info->mallocs = A->info.mallocs; 2455 info->memory = ((PetscObject)A)->mem; 2456 if (A->factortype) { 2457 info->fill_ratio_given = A->info.fill_ratio_given; 2458 info->fill_ratio_needed = A->info.fill_ratio_needed; 2459 info->factor_mallocs = A->info.factor_mallocs; 2460 } else { 2461 info->fill_ratio_given = 0; 2462 info->fill_ratio_needed = 0; 2463 info->factor_mallocs = 0; 2464 } 2465 PetscFunctionReturn(0); 2466 } 2467 2468 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) 2469 { 2470 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2471 PetscErrorCode ierr; 2472 2473 PetscFunctionBegin; 2474 ierr = PetscArrayzero(a->a,a->bs2*a->i[a->mbs]);CHKERRQ(ierr); 2475 PetscFunctionReturn(0); 2476 } 2477 2478 PETSC_INTERN PetscErrorCode MatMatMult_SeqBAIJ_SeqDense(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C) 2479 { 2480 PetscErrorCode ierr; 2481 2482 PetscFunctionBegin; 2483 if (scall == MAT_INITIAL_MATRIX) { 2484 ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr); 2485 ierr = MatMatMultSymbolic_SeqBAIJ_SeqDense(A,B,fill,C);CHKERRQ(ierr); 2486 ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr); 2487 } 2488 ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr); 2489 ierr = MatMatMultNumeric_SeqBAIJ_SeqDense(A,B,*C);CHKERRQ(ierr); 2490 ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr); 2491 PetscFunctionReturn(0); 2492 } 2493 2494 PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat *C) 2495 { 2496 PetscErrorCode ierr; 2497 2498 PetscFunctionBegin; 2499 ierr = MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);CHKERRQ(ierr); 2500 2501 (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense; 2502 PetscFunctionReturn(0); 2503 } 2504 2505 PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 2506 { 2507 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2508 PetscScalar *z = 0,sum1; 2509 const PetscScalar *xb; 2510 PetscScalar x1; 2511 const MatScalar *v,*vv; 2512 PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL; 2513 PetscBool usecprow=a->compressedrow.use; 2514 2515 PetscFunctionBegin; 2516 2517 idx = a->j; 2518 v = a->a; 2519 if (usecprow) { 2520 mbs = a->compressedrow.nrows; 2521 ii = a->compressedrow.i; 2522 ridx = a->compressedrow.rindex; 2523 } else { 2524 mbs = a->mbs; 2525 ii = a->i; 2526 z = c; 2527 } 2528 2529 for (i=0; i<mbs; i++) { 2530 n = ii[1] - ii[0]; ii++; 2531 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2532 PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2533 if (usecprow) z = c + ridx[i]; 2534 jj = idx; 2535 vv = v; 2536 for (k=0; k<cn; k++) { 2537 idx = jj; 2538 v = vv; 2539 sum1 = 0.0; 2540 for (j=0; j<n; j++) { 2541 xb = b + (*idx++); x1 = xb[0+k*cm]; 2542 sum1 += v[0]*x1; 2543 v += 1; 2544 } 2545 z[0+k*cm] = sum1; 2546 } 2547 if (!usecprow) z += 1; 2548 } 2549 PetscFunctionReturn(0); 2550 } 2551 2552 PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 2553 { 2554 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2555 PetscScalar *z = 0,sum1,sum2; 2556 const PetscScalar *xb; 2557 PetscScalar x1,x2; 2558 const MatScalar *v,*vv; 2559 PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL; 2560 PetscBool usecprow=a->compressedrow.use; 2561 2562 PetscFunctionBegin; 2563 2564 idx = a->j; 2565 v = a->a; 2566 if (usecprow) { 2567 mbs = a->compressedrow.nrows; 2568 ii = a->compressedrow.i; 2569 ridx = a->compressedrow.rindex; 2570 } else { 2571 mbs = a->mbs; 2572 ii = a->i; 2573 z = c; 2574 } 2575 2576 for (i=0; i<mbs; i++) { 2577 n = ii[1] - ii[0]; ii++; 2578 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2579 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2580 if (usecprow) z = c + 2*ridx[i]; 2581 jj = idx; 2582 vv = v; 2583 for (k=0; k<cn; k++) { 2584 idx = jj; 2585 v = vv; 2586 sum1 = 0.0; sum2 = 0.0; 2587 for (j=0; j<n; j++) { 2588 xb = b + 2*(*idx++); x1 = xb[0+k*cm]; x2 = xb[1+k*cm]; 2589 sum1 += v[0]*x1 + v[2]*x2; 2590 sum2 += v[1]*x1 + v[3]*x2; 2591 v += 4; 2592 } 2593 z[0+k*cm] = sum1; z[1+k*cm] = sum2; 2594 } 2595 if (!usecprow) z += 2; 2596 } 2597 PetscFunctionReturn(0); 2598 } 2599 2600 PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 2601 { 2602 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2603 PetscScalar *z = 0,sum1,sum2,sum3; 2604 const PetscScalar *xb; 2605 PetscScalar x1,x2,x3; 2606 const MatScalar *v,*vv; 2607 PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL; 2608 PetscBool usecprow=a->compressedrow.use; 2609 2610 PetscFunctionBegin; 2611 2612 idx = a->j; 2613 v = a->a; 2614 if (usecprow) { 2615 mbs = a->compressedrow.nrows; 2616 ii = a->compressedrow.i; 2617 ridx = a->compressedrow.rindex; 2618 } else { 2619 mbs = a->mbs; 2620 ii = a->i; 2621 z = c; 2622 } 2623 2624 for (i=0; i<mbs; i++) { 2625 n = ii[1] - ii[0]; ii++; 2626 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2627 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2628 if (usecprow) z = c + 3*ridx[i]; 2629 jj = idx; 2630 vv = v; 2631 for (k=0; k<cn; k++) { 2632 idx = jj; 2633 v = vv; 2634 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 2635 for (j=0; j<n; j++) { 2636 xb = b + 3*(*idx++); x1 = xb[0+k*cm]; x2 = xb[1+k*cm]; x3 = xb[2+k*cm]; 2637 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 2638 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 2639 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 2640 v += 9; 2641 } 2642 z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; 2643 } 2644 if (!usecprow) z += 3; 2645 } 2646 PetscFunctionReturn(0); 2647 } 2648 2649 PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 2650 { 2651 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2652 PetscScalar *z = 0,sum1,sum2,sum3,sum4; 2653 const PetscScalar *xb; 2654 PetscScalar x1,x2,x3,x4; 2655 const MatScalar *v,*vv; 2656 PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL; 2657 PetscBool usecprow=a->compressedrow.use; 2658 2659 PetscFunctionBegin; 2660 2661 idx = a->j; 2662 v = a->a; 2663 if (usecprow) { 2664 mbs = a->compressedrow.nrows; 2665 ii = a->compressedrow.i; 2666 ridx = a->compressedrow.rindex; 2667 } else { 2668 mbs = a->mbs; 2669 ii = a->i; 2670 z = c; 2671 } 2672 2673 for (i=0; i<mbs; i++) { 2674 n = ii[1] - ii[0]; ii++; 2675 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2676 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2677 if (usecprow) z = c + 4*ridx[i]; 2678 jj = idx; 2679 vv = v; 2680 for (k=0; k<cn; k++) { 2681 idx = jj; 2682 v = vv; 2683 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; 2684 for (j=0; j<n; j++) { 2685 xb = b + 4*(*idx++); x1 = xb[0+k*cm]; x2 = xb[1+k*cm]; x3 = xb[2+k*cm]; x4 = xb[3+k*cm]; 2686 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2687 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2688 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2689 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2690 v += 16; 2691 } 2692 z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; z[3+k*cm] = sum4; 2693 } 2694 if (!usecprow) z += 4; 2695 } 2696 PetscFunctionReturn(0); 2697 } 2698 2699 PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 2700 { 2701 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2702 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5; 2703 const PetscScalar *xb; 2704 PetscScalar x1,x2,x3,x4,x5; 2705 const MatScalar *v,*vv; 2706 PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL; 2707 PetscBool usecprow=a->compressedrow.use; 2708 2709 PetscFunctionBegin; 2710 2711 idx = a->j; 2712 v = a->a; 2713 if (usecprow) { 2714 mbs = a->compressedrow.nrows; 2715 ii = a->compressedrow.i; 2716 ridx = a->compressedrow.rindex; 2717 } else { 2718 mbs = a->mbs; 2719 ii = a->i; 2720 z = c; 2721 } 2722 2723 for (i=0; i<mbs; i++) { 2724 n = ii[1] - ii[0]; ii++; 2725 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2726 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2727 if (usecprow) z = c + 5*ridx[i]; 2728 jj = idx; 2729 vv = v; 2730 for (k=0; k<cn; k++) { 2731 idx = jj; 2732 v = vv; 2733 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 2734 for (j=0; j<n; j++) { 2735 xb = b + 5*(*idx++); x1 = xb[0+k*cm]; x2 = xb[1+k*cm]; x3 = xb[2+k*cm]; x4 = xb[3+k*cm]; x5 = xb[4+k*cm]; 2736 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2737 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2738 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2739 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2740 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2741 v += 25; 2742 } 2743 z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; z[3+k*cm] = sum4; z[4+k*cm] = sum5; 2744 } 2745 if (!usecprow) z += 5; 2746 } 2747 PetscFunctionReturn(0); 2748 } 2749 2750 PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A,Mat B,Mat C) 2751 { 2752 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2753 Mat_SeqDense *bd = (Mat_SeqDense*)B->data; 2754 Mat_SeqDense *cd = (Mat_SeqDense*)B->data; 2755 PetscErrorCode ierr; 2756 PetscInt cm=cd->lda,cn=B->cmap->n,bm=bd->lda; 2757 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 2758 PetscBLASInt bbs,bcn,bbm,bcm; 2759 PetscScalar *z = 0; 2760 PetscScalar *c,*b; 2761 const MatScalar *v; 2762 const PetscInt *idx,*ii,*ridx=NULL; 2763 PetscScalar _DZero=0.0,_DOne=1.0; 2764 PetscBool usecprow=a->compressedrow.use; 2765 2766 PetscFunctionBegin; 2767 if (!cm || !cn) PetscFunctionReturn(0); 2768 if (B->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,B->rmap->n); 2769 if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n); 2770 if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n); 2771 b = bd->v; 2772 if (a->nonzerorowcnt != A->rmap->n) { 2773 ierr = MatZeroEntries(C);CHKERRQ(ierr); 2774 } 2775 ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr); 2776 switch (bs) { 2777 case 1: 2778 ierr = MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn); 2779 break; 2780 case 2: 2781 ierr = MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn); 2782 break; 2783 case 3: 2784 ierr = MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn); 2785 break; 2786 case 4: 2787 ierr = MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn); 2788 break; 2789 case 5: 2790 ierr = MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn); 2791 break; 2792 default: /* block sizes larger than 5 by 5 are handled by BLAS */ 2793 ierr = PetscBLASIntCast(bs,&bbs);CHKERRQ(ierr); 2794 ierr = PetscBLASIntCast(cn,&bcn);CHKERRQ(ierr); 2795 ierr = PetscBLASIntCast(bm,&bbm);CHKERRQ(ierr); 2796 ierr = PetscBLASIntCast(cm,&bcm);CHKERRQ(ierr); 2797 idx = a->j; 2798 v = a->a; 2799 if (usecprow) { 2800 mbs = a->compressedrow.nrows; 2801 ii = a->compressedrow.i; 2802 ridx = a->compressedrow.rindex; 2803 } else { 2804 mbs = a->mbs; 2805 ii = a->i; 2806 z = c; 2807 } 2808 for (i=0; i<mbs; i++) { 2809 n = ii[1] - ii[0]; ii++; 2810 if (usecprow) z = c + bs*ridx[i]; 2811 if (n) { 2812 PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DZero,z,&bcm)); 2813 v += bs2; 2814 } 2815 for (j=1; j<n; j++) { 2816 PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DOne,z,&bcm)); 2817 v += bs2; 2818 } 2819 if (!usecprow) z += bs; 2820 } 2821 } 2822 ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr); 2823 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2824 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2825 ierr = PetscLogFlops((2.0*a->nz*bs2 - bs*a->nonzerorowcnt)*cn);CHKERRQ(ierr); 2826 PetscFunctionReturn(0); 2827 } 2828